diff --git a/RNBroadcast.podspec b/RNBroadcast.podspec
new file mode 100644
index 0000000..54c7ff5
--- /dev/null
+++ b/RNBroadcast.podspec
@@ -0,0 +1,19 @@
+require 'json'  # stdlib JSON parser, used to read package.json below
+
+package = JSON.parse(File.read(File.join(__dir__, 'package.json')))  # package.json located in the same directory as this podspec
+
+Pod::Spec.new do |s|
+  s.name         = "RNBroadcast"
+  s.version      = package['version']  # version, summary, license, authors and homepage are all read from package.json
+  s.summary      = package['description']
+  s.license      = package['license']
+
+  s.authors      = package['author']
+  s.homepage     = package['homepage']
+  s.platform     = :ios, "9.0"  # iOS only, with "9.0" as the deployment target
+
+  s.source       = { :git => "https://github.com/BehaviorCloud/react-native-broadcast.git", :tag => "#{s.version}" }  # git tag is the bare version string (no "v" prefix)
+  s.source_files  = "ios/RNBroadcast/**/*.{c,cpp,h,m,mm}"  # every .c/.cpp/.h/.m/.mm file under ios/RNBroadcast, recursively
+
+  s.dependency 'React'
+end
diff --git a/android/build.gradle b/android/build.gradle
index 8520571..6afed90 100644
--- a/android/build.gradle
+++ b/android/build.gradle
@@ -28,8 +28,10 @@ android {
 
 repositories {
     mavenCentral()
+    maven { url 'https://jitpack.io' }  // presumably hosts the com.github.* artifact declared in dependencies below (confirm)
 }
 
 dependencies {
     compile 'com.facebook.react:react-native:+'
+    implementation 'com.github.behaviorcloud.rtmp-rtsp-stream-client-java:rtplibrary:1.7.3'  // pinned to 1.7.3, unlike the floating '+' version above
 }
diff --git a/android/src/main/AndroidManifest.xml b/android/src/main/AndroidManifest.xml
index 43abd1c..4f0742d 100644
--- a/android/src/main/AndroidManifest.xml
+++ b/android/src/main/AndroidManifest.xml
@@ -1,8 +1,10 @@
 
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-          package="com.reactlibrary">
+          package="com.pedrolibrary">
+    <uses-permission android:name="android.permission.INTERNET" /> <!-- network access; presumably required by the streaming client (confirm) -->
     <uses-permission android:name="android.permission.CAMERA" />
+    <uses-permission android:name="android.permission.RECORD_AUDIO" /> <!-- microphone access for audio capture -->
     <uses-feature android:name="android.hardware.camera" />
     <uses-feature android:name="android.hardware.camera.autofocus" />
 </manifest>
-  
\ No newline at end of file
+  
diff --git a/android/src/main/assets/filter/amaro_mask1.jpg b/android/src/main/assets/filter/amaro_mask1.jpg
deleted file mode 100755
index b1f7216..0000000
Binary files a/android/src/main/assets/filter/amaro_mask1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/amaro_mask2.jpg b/android/src/main/assets/filter/amaro_mask2.jpg
deleted file mode 100755
index ecebdb9..0000000
Binary files a/android/src/main/assets/filter/amaro_mask2.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/amaromap.png b/android/src/main/assets/filter/amaromap.png
deleted file mode 100755
index 4a5353c..0000000
Binary files a/android/src/main/assets/filter/amaromap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/blackboard1024.png b/android/src/main/assets/filter/blackboard1024.png
deleted file mode 100755
index e46d825..0000000
Binary files a/android/src/main/assets/filter/blackboard1024.png and /dev/null differ
diff --git a/android/src/main/assets/filter/blend1.jpg b/android/src/main/assets/filter/blend1.jpg
deleted file mode 100755
index 5576483..0000000
Binary files a/android/src/main/assets/filter/blend1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/bluevintage_mask1.jpg b/android/src/main/assets/filter/bluevintage_mask1.jpg
deleted file mode 100755
index 3d6511d..0000000
Binary files a/android/src/main/assets/filter/bluevintage_mask1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/brannan_blowout.png b/android/src/main/assets/filter/brannan_blowout.png
deleted file mode 100755
index 4eabfa5..0000000
Binary files a/android/src/main/assets/filter/brannan_blowout.png and /dev/null differ
diff --git a/android/src/main/assets/filter/brannan_contrast.png b/android/src/main/assets/filter/brannan_contrast.png
deleted file mode 100755
index 2bcb1d3..0000000
Binary files a/android/src/main/assets/filter/brannan_contrast.png and /dev/null differ
diff --git a/android/src/main/assets/filter/brannan_luma.png b/android/src/main/assets/filter/brannan_luma.png
deleted file mode 100755
index 3dc01a6..0000000
Binary files a/android/src/main/assets/filter/brannan_luma.png and /dev/null differ
diff --git a/android/src/main/assets/filter/brannan_process.png b/android/src/main/assets/filter/brannan_process.png
deleted file mode 100755
index 5986f1b..0000000
Binary files a/android/src/main/assets/filter/brannan_process.png and /dev/null differ
diff --git a/android/src/main/assets/filter/brannan_screen.png b/android/src/main/assets/filter/brannan_screen.png
deleted file mode 100755
index 0b26461..0000000
Binary files a/android/src/main/assets/filter/brannan_screen.png and /dev/null differ
diff --git a/android/src/main/assets/filter/brooklynCurves1.png b/android/src/main/assets/filter/brooklynCurves1.png
deleted file mode 100755
index 4a8e922..0000000
Binary files a/android/src/main/assets/filter/brooklynCurves1.png and /dev/null differ
diff --git a/android/src/main/assets/filter/brooklynCurves2.png b/android/src/main/assets/filter/brooklynCurves2.png
deleted file mode 100755
index 24d6776..0000000
Binary files a/android/src/main/assets/filter/brooklynCurves2.png and /dev/null differ
diff --git a/android/src/main/assets/filter/calm_mask1.jpg b/android/src/main/assets/filter/calm_mask1.jpg
deleted file mode 100755
index 540307e..0000000
Binary files a/android/src/main/assets/filter/calm_mask1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/calm_mask2.jpg b/android/src/main/assets/filter/calm_mask2.jpg
deleted file mode 100755
index 0f444ce..0000000
Binary files a/android/src/main/assets/filter/calm_mask2.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/earlybirdOverlayMap.png b/android/src/main/assets/filter/earlybirdOverlayMap.png
deleted file mode 100755
index 7354055..0000000
Binary files a/android/src/main/assets/filter/earlybirdOverlayMap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/earlybirdblowout.png b/android/src/main/assets/filter/earlybirdblowout.png
deleted file mode 100755
index 4ecc68a..0000000
Binary files a/android/src/main/assets/filter/earlybirdblowout.png and /dev/null differ
diff --git a/android/src/main/assets/filter/earlybirdcurves.png b/android/src/main/assets/filter/earlybirdcurves.png
deleted file mode 100755
index e067ac7..0000000
Binary files a/android/src/main/assets/filter/earlybirdcurves.png and /dev/null differ
diff --git a/android/src/main/assets/filter/earlybirdmap.png b/android/src/main/assets/filter/earlybirdmap.png
deleted file mode 100755
index a2677a5..0000000
Binary files a/android/src/main/assets/filter/earlybirdmap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/earlybirdoverlaymap_new.png b/android/src/main/assets/filter/earlybirdoverlaymap_new.png
deleted file mode 100755
index 7354055..0000000
Binary files a/android/src/main/assets/filter/earlybirdoverlaymap_new.png and /dev/null differ
diff --git a/android/src/main/assets/filter/edgeburn.png b/android/src/main/assets/filter/edgeburn.png
deleted file mode 100755
index 04f831b..0000000
Binary files a/android/src/main/assets/filter/edgeburn.png and /dev/null differ
diff --git a/android/src/main/assets/filter/fairy_tale.png b/android/src/main/assets/filter/fairy_tale.png
deleted file mode 100755
index 6091b10..0000000
Binary files a/android/src/main/assets/filter/fairy_tale.png and /dev/null differ
diff --git a/android/src/main/assets/filter/filter_map_first.png b/android/src/main/assets/filter/filter_map_first.png
deleted file mode 100755
index e7b4d7f..0000000
Binary files a/android/src/main/assets/filter/filter_map_first.png and /dev/null differ
diff --git a/android/src/main/assets/filter/flower_layer2c.jpg b/android/src/main/assets/filter/flower_layer2c.jpg
deleted file mode 100755
index f734fda..0000000
Binary files a/android/src/main/assets/filter/flower_layer2c.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/freud_rand.png b/android/src/main/assets/filter/freud_rand.png
deleted file mode 100755
index d3c9fd9..0000000
Binary files a/android/src/main/assets/filter/freud_rand.png and /dev/null differ
diff --git a/android/src/main/assets/filter/healthy_mask_1.jpg b/android/src/main/assets/filter/healthy_mask_1.jpg
deleted file mode 100755
index 7127a91..0000000
Binary files a/android/src/main/assets/filter/healthy_mask_1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/hefegradientmap.png b/android/src/main/assets/filter/hefegradientmap.png
deleted file mode 100755
index 8014346..0000000
Binary files a/android/src/main/assets/filter/hefegradientmap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/hefemap.png b/android/src/main/assets/filter/hefemap.png
deleted file mode 100755
index 1b7fcad..0000000
Binary files a/android/src/main/assets/filter/hefemap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/hefemetal.png b/android/src/main/assets/filter/hefemetal.png
deleted file mode 100755
index 09e996f..0000000
Binary files a/android/src/main/assets/filter/hefemetal.png and /dev/null differ
diff --git a/android/src/main/assets/filter/hefesoftlight.png b/android/src/main/assets/filter/hefesoftlight.png
deleted file mode 100755
index 47c3b9d..0000000
Binary files a/android/src/main/assets/filter/hefesoftlight.png and /dev/null differ
diff --git a/android/src/main/assets/filter/hudsonbackground.png b/android/src/main/assets/filter/hudsonbackground.png
deleted file mode 100755
index 505896e..0000000
Binary files a/android/src/main/assets/filter/hudsonbackground.png and /dev/null differ
diff --git a/android/src/main/assets/filter/hudsonmap.png b/android/src/main/assets/filter/hudsonmap.png
deleted file mode 100755
index 00cb1db..0000000
Binary files a/android/src/main/assets/filter/hudsonmap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/inkwellmap.png b/android/src/main/assets/filter/inkwellmap.png
deleted file mode 100755
index e9541f1..0000000
Binary files a/android/src/main/assets/filter/inkwellmap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/kelvinmap.png b/android/src/main/assets/filter/kelvinmap.png
deleted file mode 100755
index e5eb5af..0000000
Binary files a/android/src/main/assets/filter/kelvinmap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/line_layer_c.jpg b/android/src/main/assets/filter/line_layer_c.jpg
deleted file mode 100755
index 83dbf52..0000000
Binary files a/android/src/main/assets/filter/line_layer_c.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/lomomap_new.png b/android/src/main/assets/filter/lomomap_new.png
deleted file mode 100755
index fcd10aa..0000000
Binary files a/android/src/main/assets/filter/lomomap_new.png and /dev/null differ
diff --git a/android/src/main/assets/filter/lookup_amatorka_02.png b/android/src/main/assets/filter/lookup_amatorka_02.png
deleted file mode 100755
index 917b71f..0000000
Binary files a/android/src/main/assets/filter/lookup_amatorka_02.png and /dev/null differ
diff --git a/android/src/main/assets/filter/lookup_highkey.png b/android/src/main/assets/filter/lookup_highkey.png
deleted file mode 100755
index 13f7937..0000000
Binary files a/android/src/main/assets/filter/lookup_highkey.png and /dev/null differ
diff --git a/android/src/main/assets/filter/n1977blowout.png b/android/src/main/assets/filter/n1977blowout.png
deleted file mode 100755
index f97b39c..0000000
Binary files a/android/src/main/assets/filter/n1977blowout.png and /dev/null differ
diff --git a/android/src/main/assets/filter/n1977map.png b/android/src/main/assets/filter/n1977map.png
deleted file mode 100755
index 1c4cc63..0000000
Binary files a/android/src/main/assets/filter/n1977map.png and /dev/null differ
diff --git a/android/src/main/assets/filter/nashvillemap.png b/android/src/main/assets/filter/nashvillemap.png
deleted file mode 100755
index 490fece..0000000
Binary files a/android/src/main/assets/filter/nashvillemap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/overlaymap.png b/android/src/main/assets/filter/overlaymap.png
deleted file mode 100755
index ea92b16..0000000
Binary files a/android/src/main/assets/filter/overlaymap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/pixar_curves.png b/android/src/main/assets/filter/pixar_curves.png
deleted file mode 100755
index 36db2c3..0000000
Binary files a/android/src/main/assets/filter/pixar_curves.png and /dev/null differ
diff --git a/android/src/main/assets/filter/rise_mask1.jpg b/android/src/main/assets/filter/rise_mask1.jpg
deleted file mode 100755
index 1b4508c..0000000
Binary files a/android/src/main/assets/filter/rise_mask1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/rise_mask2.jpg b/android/src/main/assets/filter/rise_mask2.jpg
deleted file mode 100755
index 8d9ee69..0000000
Binary files a/android/src/main/assets/filter/rise_mask2.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/risemap.png b/android/src/main/assets/filter/risemap.png
deleted file mode 100755
index 4903c9f..0000000
Binary files a/android/src/main/assets/filter/risemap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/sierramap.png b/android/src/main/assets/filter/sierramap.png
deleted file mode 100755
index 8efe2a0..0000000
Binary files a/android/src/main/assets/filter/sierramap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/sierravignette.png b/android/src/main/assets/filter/sierravignette.png
deleted file mode 100755
index d5ec720..0000000
Binary files a/android/src/main/assets/filter/sierravignette.png and /dev/null differ
diff --git a/android/src/main/assets/filter/softlight.png b/android/src/main/assets/filter/softlight.png
deleted file mode 100755
index 352dbb7..0000000
Binary files a/android/src/main/assets/filter/softlight.png and /dev/null differ
diff --git a/android/src/main/assets/filter/sutrocurves.png b/android/src/main/assets/filter/sutrocurves.png
deleted file mode 100755
index c3bf949..0000000
Binary files a/android/src/main/assets/filter/sutrocurves.png and /dev/null differ
diff --git a/android/src/main/assets/filter/sutroedgeburn.png b/android/src/main/assets/filter/sutroedgeburn.png
deleted file mode 100755
index 8400007..0000000
Binary files a/android/src/main/assets/filter/sutroedgeburn.png and /dev/null differ
diff --git a/android/src/main/assets/filter/sutrometal.png b/android/src/main/assets/filter/sutrometal.png
deleted file mode 100755
index 39f35c4..0000000
Binary files a/android/src/main/assets/filter/sutrometal.png and /dev/null differ
diff --git a/android/src/main/assets/filter/toastercolorshift.png b/android/src/main/assets/filter/toastercolorshift.png
deleted file mode 100755
index c006889..0000000
Binary files a/android/src/main/assets/filter/toastercolorshift.png and /dev/null differ
diff --git a/android/src/main/assets/filter/toastercurves.png b/android/src/main/assets/filter/toastercurves.png
deleted file mode 100755
index dcf8808..0000000
Binary files a/android/src/main/assets/filter/toastercurves.png and /dev/null differ
diff --git a/android/src/main/assets/filter/toastermetal.png b/android/src/main/assets/filter/toastermetal.png
deleted file mode 100755
index 76e69dc..0000000
Binary files a/android/src/main/assets/filter/toastermetal.png and /dev/null differ
diff --git a/android/src/main/assets/filter/toasteroverlaymapwarm.png b/android/src/main/assets/filter/toasteroverlaymapwarm.png
deleted file mode 100755
index 32c5f4d..0000000
Binary files a/android/src/main/assets/filter/toasteroverlaymapwarm.png and /dev/null differ
diff --git a/android/src/main/assets/filter/toastersoftlight.png b/android/src/main/assets/filter/toastersoftlight.png
deleted file mode 100755
index 0801e20..0000000
Binary files a/android/src/main/assets/filter/toastersoftlight.png and /dev/null differ
diff --git a/android/src/main/assets/filter/toy_mask1.jpg b/android/src/main/assets/filter/toy_mask1.jpg
deleted file mode 100755
index e09d038..0000000
Binary files a/android/src/main/assets/filter/toy_mask1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/toy_mask2.jpg b/android/src/main/assets/filter/toy_mask2.jpg
deleted file mode 100755
index 961de26..0000000
Binary files a/android/src/main/assets/filter/toy_mask2.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/toy_mask3.jpg b/android/src/main/assets/filter/toy_mask3.jpg
deleted file mode 100755
index 90fdbc5..0000000
Binary files a/android/src/main/assets/filter/toy_mask3.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/valenciagradientmap.png b/android/src/main/assets/filter/valenciagradientmap.png
deleted file mode 100755
index d7a4cd3..0000000
Binary files a/android/src/main/assets/filter/valenciagradientmap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/valenciamap.png b/android/src/main/assets/filter/valenciamap.png
deleted file mode 100755
index 5cc1331..0000000
Binary files a/android/src/main/assets/filter/valenciamap.png and /dev/null differ
diff --git a/android/src/main/assets/filter/vignette_map.png b/android/src/main/assets/filter/vignette_map.png
deleted file mode 100755
index 548201b..0000000
Binary files a/android/src/main/assets/filter/vignette_map.png and /dev/null differ
diff --git a/android/src/main/assets/filter/vignettemap_new.png b/android/src/main/assets/filter/vignettemap_new.png
deleted file mode 100755
index d6617d7..0000000
Binary files a/android/src/main/assets/filter/vignettemap_new.png and /dev/null differ
diff --git a/android/src/main/assets/filter/walden_map.png b/android/src/main/assets/filter/walden_map.png
deleted file mode 100755
index 19a283c..0000000
Binary files a/android/src/main/assets/filter/walden_map.png and /dev/null differ
diff --git a/android/src/main/assets/filter/warm_layer1.jpg b/android/src/main/assets/filter/warm_layer1.jpg
deleted file mode 100755
index 0734114..0000000
Binary files a/android/src/main/assets/filter/warm_layer1.jpg and /dev/null differ
diff --git a/android/src/main/assets/filter/xpromap.png b/android/src/main/assets/filter/xpromap.png
deleted file mode 100755
index 893fb62..0000000
Binary files a/android/src/main/assets/filter/xpromap.png and /dev/null differ
diff --git a/android/src/main/java/com/coremedia/iso/AbstractBoxParser.java b/android/src/main/java/com/coremedia/iso/AbstractBoxParser.java
deleted file mode 100755
index 6d92acd..0000000
--- a/android/src/main/java/com/coremedia/iso/AbstractBoxParser.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.coremedia.iso.boxes.UserBox;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.ReadableByteChannel;
-import java.util.logging.Logger;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * This BoxParser handles the basic stuff like reading size and extracting box type.
- */
-public abstract class AbstractBoxParser implements BoxParser {
-
-    private static Logger LOG = Logger.getLogger(AbstractBoxParser.class.getName());
-
-    public abstract Box createBox(String type, byte[] userType, String parent);
-
-    /**
-     * Parses the next size and type, creates a box instance and parses the box's content.
-     *
-     * @param byteChannel the FileChannel pointing to the ISO file
-     * @param parent      the current box's parent (null if no parent)
-     * @return the box just parsed
-     * @throws java.io.IOException if reading from <code>in</code> fails
-     */
-    public Box parseBox(ReadableByteChannel byteChannel, ContainerBox parent) throws IOException {
-
-
-        ByteBuffer header = ChannelHelper.readFully(byteChannel, 8);
-
-        long size = IsoTypeReader.readUInt32(header);
-        // do plausibility check
-        if (size < 8 && size > 1) {
-            LOG.severe("Plausibility check failed: size < 8 (size = " + size + "). Stop parsing!");
-            return null;
-        }
-
-
-        String type = IsoTypeReader.read4cc(header);
-        byte[] usertype = null;
-        long contentSize;
-
-        if (size == 1) {
-            ByteBuffer bb = ByteBuffer.allocate(8);
-            byteChannel.read(bb);
-            bb.rewind();
-            size = IsoTypeReader.readUInt64(bb);
-            contentSize = size - 16;
-        } else if (size == 0) {
-            if (byteChannel instanceof FileChannel) {
-                size = ((FileChannel) byteChannel).size() - ((FileChannel) byteChannel).position() - 8;
-            } else {
-                throw new RuntimeException("Only FileChannel inputs may use size == 0 (box reaches to the end of file)");
-            }
-            contentSize = size - 8;
-        } else {
-            contentSize = size - 8;
-        }
-        if (UserBox.TYPE.equals(type)) {
-            ByteBuffer bb = ByteBuffer.allocate(16);
-            byteChannel.read(bb);
-            bb.rewind();
-            usertype = bb.array();
-            contentSize -= 16;
-        }
-        Box box = createBox(type, usertype, parent.getType());
-        box.setParent(parent);
-        LOG.finest("Parsing " + box.getType());
-        // System.out.println("parsing " + Arrays.toString(box.getType()) + " " + box.getClass().getName() + " size=" + size);
-
-
-        if (l2i(size - contentSize) == 8) {
-            // default - no large box - no uuid
-            // do nothing header's already correct
-            header.rewind();
-        } else if (l2i(size - contentSize) == 16) {
-            header = ByteBuffer.allocate(16);
-            IsoTypeWriter.writeUInt32(header, 1);
-            header.put(IsoFile.fourCCtoBytes(type));
-            IsoTypeWriter.writeUInt64(header, size);
-        } else if (l2i(size - contentSize) == 24) {
-            header = ByteBuffer.allocate(24);
-            IsoTypeWriter.writeUInt32(header, size);
-            header.put(IsoFile.fourCCtoBytes(type));
-            header.put(usertype);
-        } else if (l2i(size - contentSize) == 32) {
-            header = ByteBuffer.allocate(32);
-            IsoTypeWriter.writeUInt32(header, size);
-            header.put(IsoFile.fourCCtoBytes(type));
-            IsoTypeWriter.writeUInt64(header, size);
-            header.put(usertype);
-        } else {
-            throw new RuntimeException("I didn't expect that");
-        }
-
-
-        box.parse(byteChannel, header, contentSize, this);
-        // System.out.println("box = " + box);
-
-
-        assert size == box.getSize() :
-                "Reconstructed Size is not x to the number of parsed bytes! (" +
-                        box.getType() + ")"
-                        + " Actual Box size: " + size + " Calculated size: " + box.getSize();
-        return box;
-    }
-
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/BoxParser.java b/android/src/main/java/com/coremedia/iso/BoxParser.java
deleted file mode 100755
index cbe9a6f..0000000
--- a/android/src/main/java/com/coremedia/iso/BoxParser.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-
-import java.io.IOException;
-import java.nio.channels.ReadableByteChannel;
-
-/**
- * Basic interface to create boxes from a <code>IsoBufferWrapper</code> and its parent.
- */
-public interface BoxParser {
-    Class<? extends Box> getClassForFourCc(String type, byte[] userType, String parent);
-
-    Box parseBox(ReadableByteChannel in, ContainerBox parent) throws IOException;
-}
diff --git a/android/src/main/java/com/coremedia/iso/ChannelHelper.java b/android/src/main/java/com/coremedia/iso/ChannelHelper.java
deleted file mode 100755
index 2ec1d05..0000000
--- a/android/src/main/java/com/coremedia/iso/ChannelHelper.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.SelectionKey;
-import java.nio.channels.WritableByteChannel;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-
-public class ChannelHelper {
-    public static ByteBuffer readFully(final ReadableByteChannel channel, long size) throws IOException {
-
-        if (channel instanceof FileChannel && size > 1024 * 1024) {
-            ByteBuffer bb = ((FileChannel) channel).map(FileChannel.MapMode.READ_ONLY, ((FileChannel) channel).position(), size);
-            ((FileChannel) channel).position(((FileChannel) channel).position() + size);
-            return bb;
-        } else {
-            ByteBuffer buf = ByteBuffer.allocate(l2i(size));
-            readFully(channel, buf, buf.limit());
-            buf.rewind();
-            assert buf.limit() == size;
-
-            return buf;
-        }
-
-    }
-
-
-    public static void readFully(final ReadableByteChannel channel, final ByteBuffer buf)
-            throws IOException {
-        readFully(channel, buf, buf.remaining());
-    }
-
-    public static int readFully(final ReadableByteChannel channel, final ByteBuffer buf, final int length)
-            throws IOException {
-        int n, count = 0;
-        while (-1 != (n = channel.read(buf))) {
-            count += n;
-            if (count == length) {
-                break;
-            }
-        }
-        if (n == -1) {
-            throw new EOFException("End of file. No more boxes.");
-        }
-        return count;
-    }
-
-
-    public static void writeFully(final WritableByteChannel channel, final ByteBuffer buf)
-            throws IOException {
-        do {
-            int written = channel.write(buf);
-            if (written < 0) {
-                throw new EOFException();
-            }
-        } while (buf.hasRemaining());
-    }
-
-
-    public static void close(SelectionKey key) {
-        try {
-            key.channel().close();
-        } catch (IOException e) {
-            // nop
-        }
-
-    }
-
-
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/coremedia/iso/Hex.java b/android/src/main/java/com/coremedia/iso/Hex.java
deleted file mode 100755
index b3d55ef..0000000
--- a/android/src/main/java/com/coremedia/iso/Hex.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-Extracted from commons-codec
- */
-package com.coremedia.iso;
-
-import java.io.ByteArrayOutputStream;
-
-/**
- * Converts hexadecimal Strings.
- */
-public class Hex {
-    private static final char[] DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
-
-    public static String encodeHex(byte[] data) {
-        return encodeHex(data, 0);
-    }
-
-    public static String encodeHex(byte[] data, int group) {
-        int l = data.length;
-        char[] out = new char[(l << 1) + (group > 0 ? (l / group) : 0)];
-        // two characters form the hex value.
-        for (int i = 0, j = 0; i < l; i++) {
-            if ((group > 0) && ((i % group) == 0) && j > 0) {
-                out[j++] = '-';
-            }
-
-            out[j++] = DIGITS[(0xF0 & data[i]) >>> 4];
-            out[j++] = DIGITS[0x0F & data[i]];
-        }
-        return new String(out);
-    }
-
-    public static byte[] decodeHex(String hexString) {
-        ByteArrayOutputStream bas = new ByteArrayOutputStream();
-        for (int i = 0; i < hexString.length(); i += 2) {
-            int b = Integer.parseInt(hexString.substring(i, i + 2), 16);
-            bas.write(b);
-        }
-        return bas.toByteArray();
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/IsoFile.java b/android/src/main/java/com/coremedia/iso/IsoFile.java
deleted file mode 100755
index a6f4b2b..0000000
--- a/android/src/main/java/com/coremedia/iso/IsoFile.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso;
-
-import com.googlecode.mp4parser.AbstractContainerBox;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.MovieBox;
-import com.googlecode.mp4parser.annotations.DoNotParseDetail;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-
-/**
- * The most upper container for ISO Boxes. It is a container box that is a file.
- * Uses IsoBufferWrapper  to access the underlying file.
- */
-@DoNotParseDetail
-public class IsoFile extends AbstractContainerBox implements Closeable {
-    protected BoxParser boxParser = new PropertyBoxParserImpl();
-    ReadableByteChannel byteChannel;
-
-    public IsoFile() {
-        super("");
-    }
-
-    public IsoFile(File f) throws IOException {
-        super("");
-        this.byteChannel = new FileInputStream(f).getChannel();
-        boxParser = createBoxParser();
-        parse();
-    }
-
-    public IsoFile(ReadableByteChannel byteChannel) throws IOException {
-        super("");
-        this.byteChannel = byteChannel;
-        boxParser = createBoxParser();
-        parse();
-    }
-
-    public IsoFile(ReadableByteChannel byteChannel, BoxParser boxParser) throws IOException {
-        super("");
-        this.byteChannel = byteChannel;
-        this.boxParser = boxParser;
-        parse();
-
-
-    }
-
-    protected BoxParser createBoxParser() {
-        return new PropertyBoxParserImpl();
-    }
-
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        // there are no details to parse we should be just file
-    }
-
-    public void parse(ReadableByteChannel inFC, ByteBuffer header, long contentSize, AbstractBoxParser abstractBoxParser) throws IOException {
-        throw new IOException("This method is not meant to be called. Use #parse() directly.");
-    }
-
-    private void parse() throws IOException {
-
-        boolean done = false;
-        while (!done) {
-            try {
-                Box box = boxParser.parseBox(byteChannel, this);
-                if (box != null) {
-                    //  System.err.println(box.getType());
-                    boxes.add(box);
-                } else {
-                    done = true;
-                }
-            } catch (EOFException e) {
-                done = true;
-            }
-        }
-    }
-
-    @DoNotParseDetail
-    public String toString() {
-        StringBuilder buffer = new StringBuilder();
-        buffer.append("IsoFile[");
-        if (boxes == null) {
-            buffer.append("unparsed");
-        } else {
-            for (int i = 0; i < boxes.size(); i++) {
-                if (i > 0) {
-                    buffer.append(";");
-                }
-                buffer.append(boxes.get(i).toString());
-            }
-        }
-        buffer.append("]");
-        return buffer.toString();
-    }
-
-    @DoNotParseDetail
-    public static byte[] fourCCtoBytes(String fourCC) {
-        byte[] result = new byte[4];
-        if (fourCC != null) {
-            for (int i = 0; i < Math.min(4, fourCC.length()); i++) {
-                result[i] = (byte) fourCC.charAt(i);
-            }
-        }
-        return result;
-    }
-
-    @DoNotParseDetail
-    public static String bytesToFourCC(byte[] type) {
-        byte[] result = new byte[]{0, 0, 0, 0};
-        if (type != null) {
-            System.arraycopy(type, 0, result, 0, Math.min(type.length, 4));
-        }
-        try {
-            return new String(result, "ISO-8859-1");
-        } catch (UnsupportedEncodingException e) {
-            throw new Error("Required character encoding is missing", e);
-        }
-    }
-
-
-    @Override
-    public long getNumOfBytesToFirstChild() {
-        return 0;
-    }
-
-    @Override
-    public long getSize() {
-        long size = 0;
-        for (Box box : boxes) {
-            size += box.getSize();
-        }
-        return size;
-    }
-
-    @Override
-    public IsoFile getIsoFile() {
-        return this;
-    }
-
-
-    /**
-     * Shortcut to get the MovieBox since it is often needed and present in
-     * nearly all ISO 14496 files (at least if they are derived from MP4 ).
-     *
-     * @return the MovieBox or <code>null</code>
-     */
-    @DoNotParseDetail
-    public MovieBox getMovieBox() {
-        for (Box box : boxes) {
-            if (box instanceof MovieBox) {
-                return (MovieBox) box;
-            }
-        }
-        return null;
-    }
-
-    public void getBox(WritableByteChannel os) throws IOException {
-        for (Box box : boxes) {
-
-            if (os instanceof FileChannel) {
-                long startPos = ((FileChannel) os).position();
-                box.getBox(os);
-                long size = ((FileChannel) os).position() - startPos;
-                assert size == box.getSize();
-            } else {
-                box.getBox(os);
-            }
-
-        }
-    }
-
-    public void close() throws IOException {
-        this.byteChannel.close();
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/IsoTypeReader.java b/android/src/main/java/com/coremedia/iso/IsoTypeReader.java
deleted file mode 100755
index a534f2a..0000000
--- a/android/src/main/java/com/coremedia/iso/IsoTypeReader.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import java.io.ByteArrayOutputStream;
-import java.nio.ByteBuffer;
-
-public final class IsoTypeReader {
-
-
-    public static long readUInt32BE(ByteBuffer bb) {
-        long ch1 = readUInt8(bb);
-        long ch2 = readUInt8(bb);
-        long ch3 = readUInt8(bb);
-        long ch4 = readUInt8(bb);
-        return ((ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0));
-
-    }
-
-
-    public static long readUInt32(ByteBuffer bb) {
-        long i = bb.getInt();
-        if (i < 0) {
-            i += 1l<<32;
-        }
-        return i;
-    }
-
-    public static int readUInt24(ByteBuffer bb) {
-        int result = 0;
-        result += readUInt16(bb) << 8;
-        result += byte2int(bb.get());
-        return result;
-    }
-
-
-    public static int readUInt16(ByteBuffer bb) {
-        int result = 0;
-        result += byte2int(bb.get()) << 8;
-        result += byte2int(bb.get());
-        return result;
-    }
-
-    public static int readUInt16BE(ByteBuffer bb) {
-        int result = 0;
-        result += byte2int(bb.get());
-        result += byte2int(bb.get()) << 8;
-        return result;
-    }
-
-    public static int readUInt8(ByteBuffer bb) {
-        return byte2int(bb.get());
-    }
-
-    public static int byte2int(byte b) {
-        return b < 0 ? b + 256 : b;
-    }
-
-
-    /**
-     * Reads a zero terminated UTF-8 string.
-     *
-     * @param byteBuffer the data source
-     * @return the string readByte
-     * @throws Error in case of an error in the underlying stream
-     */
-    public static String readString(ByteBuffer byteBuffer) {
-
-        ByteArrayOutputStream out = new ByteArrayOutputStream();
-        int read;
-        while ((read = byteBuffer.get()) != 0) {
-            out.write(read);
-        }
-        return Utf8.convert(out.toByteArray());
-    }
-
-    public static String readString(ByteBuffer byteBuffer, int length) {
-        byte[] buffer = new byte[length];
-        byteBuffer.get(buffer);
-        return Utf8.convert(buffer);
-
-    }
-
-    public static long readUInt64(ByteBuffer byteBuffer) {
-        long result = 0;
-        // thanks to Erik Nicolas for finding a bug! Cast to long is definitivly needed
-        result += readUInt32(byteBuffer) << 32;
-        if (result < 0) {
-            throw new RuntimeException("I don't know how to deal with UInt64! long is not sufficient and I don't want to use BigInt");
-        }
-        result += readUInt32(byteBuffer);
-
-        return result;
-    }
-
-    public static double readFixedPoint1616(ByteBuffer bb) {
-        byte[] bytes = new byte[4];
-        bb.get(bytes);
-
-        int result = 0;
-        result |= ((bytes[0] << 24) & 0xFF000000);
-        result |= ((bytes[1] << 16) & 0xFF0000);
-        result |= ((bytes[2] << 8) & 0xFF00);
-        result |= ((bytes[3]) & 0xFF);
-        return ((double) result) / 65536;
-
-    }
-
-    public static double readFixedPoint0230(ByteBuffer bb) {
-        byte[] bytes = new byte[4];
-        bb.get(bytes);
-
-        int result = 0;
-        result |= ((bytes[0] << 24) & 0xFF000000);
-        result |= ((bytes[1] << 16) & 0xFF0000);
-        result |= ((bytes[2] << 8) & 0xFF00);
-        result |= ((bytes[3]) & 0xFF);
-        return ((double) result) / (1 << 30);
-
-    }
-
-    public static float readFixedPoint88(ByteBuffer bb) {
-        byte[] bytes = new byte[2];
-        bb.get(bytes);
-        short result = 0;
-        result |= ((bytes[0] << 8) & 0xFF00);
-        result |= ((bytes[1]) & 0xFF);
-        return ((float) result) / 256;
-    }
-
-    public static String readIso639(ByteBuffer bb) {
-        int bits = readUInt16(bb);
-        StringBuilder result = new StringBuilder();
-        for (int i = 0; i < 3; i++) {
-            int c = (bits >> (2 - i) * 5) & 0x1f;
-            result.append((char) (c + 0x60));
-        }
-        return result.toString();
-    }
-
-    public static String read4cc(ByteBuffer bb) {
-        byte[] b = new byte[4];
-        bb.get(b);
-        return IsoFile.bytesToFourCC(b);
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/IsoTypeWriter.java b/android/src/main/java/com/coremedia/iso/IsoTypeWriter.java
deleted file mode 100755
index 7394abc..0000000
--- a/android/src/main/java/com/coremedia/iso/IsoTypeWriter.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import java.nio.ByteBuffer;
-
-public final class IsoTypeWriter {
-
-    public static void writeUInt64(ByteBuffer bb, long u) {
-        bb.putLong(u);
-    }
-
-    public static void writeUInt32(ByteBuffer bb, long u) {
-        bb.putInt((int) u);
-
-    }
-
-    public static void writeUInt32BE(ByteBuffer bb, long u) {
-        assert u >= 0 && u <= 1L << 32 : "The given long is not in the range of uint32 (" + u + ")";
-        writeUInt16BE(bb, (int) u & 0xFFFF);
-        writeUInt16BE(bb, (int) ((u >> 16) & 0xFFFF));
-
-    }
-
-
-    public static void writeUInt24(ByteBuffer bb, int i) {
-        i = i & 0xFFFFFF;
-        writeUInt16(bb, i >> 8);
-        writeUInt8(bb, i);
-
-    }
-
-
-    public static void writeUInt16(ByteBuffer bb, int i) {
-        i = i & 0xFFFF;
-        writeUInt8(bb, i >> 8);
-        writeUInt8(bb, i & 0xFF);
-    }
-
-    public static void writeUInt16BE(ByteBuffer bb, int i) {
-        i = i & 0xFFFF;
-        writeUInt8(bb, i & 0xFF);
-        writeUInt8(bb, i >> 8);
-    }
-
-    public static void writeUInt8(ByteBuffer bb, int i) {
-        i = i & 0xFF;
-        bb.put((byte) i);
-    }
-
-
-    public static void writeFixedPoint1616(ByteBuffer bb, double v) {
-        int result = (int) (v * 65536);
-        bb.put((byte) ((result & 0xFF000000) >> 24));
-        bb.put((byte) ((result & 0x00FF0000) >> 16));
-        bb.put((byte) ((result & 0x0000FF00) >> 8));
-        bb.put((byte) ((result & 0x000000FF)));
-    }
-
-    public static void writeFixedPoint0230(ByteBuffer bb, double v) {
-        int result = (int) (v * (1 << 30));
-        bb.put((byte) ((result & 0xFF000000) >> 24));
-        bb.put((byte) ((result & 0x00FF0000) >> 16));
-        bb.put((byte) ((result & 0x0000FF00) >> 8));
-        bb.put((byte) ((result & 0x000000FF)));
-    }
-
-    public static void writeFixedPont88(ByteBuffer bb, double v) {
-        short result = (short) (v * 256);
-        bb.put((byte) ((result & 0xFF00) >> 8));
-        bb.put((byte) ((result & 0x00FF)));
-    }
-
-    public static void writeIso639(ByteBuffer bb, String language) {
-        if (language.getBytes().length != 3) {
-            throw new IllegalArgumentException("\"" + language + "\" language string isn't exactly 3 characters long!");
-        }
-        int bits = 0;
-        for (int i = 0; i < 3; i++) {
-            bits += (language.getBytes()[i] - 0x60) << (2 - i) * 5;
-        }
-        writeUInt16(bb, bits);
-    }
-
-    public static void writeUtf8String(ByteBuffer bb, String string) {
-
-        bb.put(Utf8.convert(string));
-        writeUInt8(bb, 0);
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/PropertyBoxParserImpl.java b/android/src/main/java/com/coremedia/iso/PropertyBoxParserImpl.java
deleted file mode 100755
index f1bcc01..0000000
--- a/android/src/main/java/com/coremedia/iso/PropertyBoxParserImpl.java
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import com.googlecode.mp4parser.AbstractBox;
-import com.coremedia.iso.boxes.Box;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
-import java.net.URL;
-import java.util.Enumeration;
-import java.util.Properties;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A Property file based BoxFactory
- */
-public class PropertyBoxParserImpl extends AbstractBoxParser {
-    Properties mapping;
-
-    public PropertyBoxParserImpl(String... customProperties) {
-        InputStream is = new BufferedInputStream(getClass().getResourceAsStream("/isoparser-default.properties"));
-        try {
-            mapping = new Properties();
-            try {
-                mapping.load(is);
-                Enumeration<URL> enumeration = Thread.currentThread().getContextClassLoader().getResources("isoparser-custom.properties");
-
-                while (enumeration.hasMoreElements()) {
-                    URL url = enumeration.nextElement();
-                    InputStream customIS = new BufferedInputStream(url.openStream());
-                    try {
-                        mapping.load(customIS);
-                    } finally {
-                        customIS.close();
-                    }
-                }
-                for (String customProperty : customProperties) {
-                    mapping.load(new BufferedInputStream(getClass().getResourceAsStream(customProperty)));
-                }
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-        } finally {
-            try {
-                is.close();
-            } catch (IOException e) {
-                e.printStackTrace();
-                // ignore - I can't help
-            }
-        }
-    }
-
-    public PropertyBoxParserImpl(Properties mapping) {
-        this.mapping = mapping;
-    }
-
-    Pattern p = Pattern.compile("(.*)\\((.*?)\\)");
-
-    @SuppressWarnings("unchecked")
-    public Class<? extends Box> getClassForFourCc(String type, byte[] userType, String parent) {
-        FourCcToBox fourCcToBox = new FourCcToBox(type, userType, parent).invoke();
-        try {
-            return (Class<? extends Box>) Class.forName(fourCcToBox.clazzName);
-        } catch (ClassNotFoundException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    @Override
-    public Box createBox(String type, byte[] userType, String parent) {
-
-        FourCcToBox fourCcToBox = new FourCcToBox(type, userType, parent).invoke();
-        String[] param = fourCcToBox.getParam();
-        String clazzName = fourCcToBox.getClazzName();
-        try {
-            if (param[0].trim().length() == 0) {
-                param = new String[]{};
-            }
-            Class clazz = Class.forName(clazzName);
-
-            Class[] constructorArgsClazz = new Class[param.length];
-            Object[] constructorArgs = new Object[param.length];
-            for (int i = 0; i < param.length; i++) {
-
-                if ("userType".equals(param[i])) {
-                    constructorArgs[i] = userType;
-                    constructorArgsClazz[i] = byte[].class;
-                } else if ("type".equals(param[i])) {
-                    constructorArgs[i] = type;
-                    constructorArgsClazz[i] = String.class;
-                } else if ("parent".equals(param[i])) {
-                    constructorArgs[i] = parent;
-                    constructorArgsClazz[i] = String.class;
-                } else {
-                    throw new InternalError("No such param: " + param[i]);
-                }
-
-
-            }
-            Constructor<AbstractBox> constructorObject;
-            try {
-                if (param.length > 0) {
-                    constructorObject = clazz.getConstructor(constructorArgsClazz);
-                } else {
-                    constructorObject = clazz.getConstructor();
-                }
-
-                return constructorObject.newInstance(constructorArgs);
-            } catch (NoSuchMethodException e) {
-                throw new RuntimeException(e);
-            } catch (InvocationTargetException e) {
-                throw new RuntimeException(e);
-            } catch (InstantiationException e) {
-                throw new RuntimeException(e);
-            } catch (IllegalAccessException e) {
-                throw new RuntimeException(e);
-            }
-
-
-        } catch (ClassNotFoundException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    private class FourCcToBox {
-        private String type;
-        private byte[] userType;
-        private String parent;
-        private String clazzName;
-        private String[] param;
-
-        public FourCcToBox(String type, byte[] userType, String parent) {
-            this.type = type;
-            this.parent = parent;
-            this.userType = userType;
-        }
-
-        public String getClazzName() {
-            return clazzName;
-        }
-
-        public String[] getParam() {
-            return param;
-        }
-
-        public FourCcToBox invoke() {
-            String constructor;
-            if (userType != null) {
-                if (!"uuid".equals((type))) {
-                    throw new RuntimeException("we have a userType but no uuid box type. Something's wrong");
-                }
-                constructor = mapping.getProperty((parent) + "-uuid[" + Hex.encodeHex(userType).toUpperCase() + "]");
-                if (constructor == null) {
-                    constructor = mapping.getProperty("uuid[" + Hex.encodeHex(userType).toUpperCase() + "]");
-                }
-                if (constructor == null) {
-                    constructor = mapping.getProperty("uuid");
-                }
-            } else {
-                constructor = mapping.getProperty((parent) + "-" + (type));
-                if (constructor == null) {
-                    constructor = mapping.getProperty((type));
-                }
-            }
-            if (constructor == null) {
-                constructor = mapping.getProperty("default");
-            }
-            if (constructor == null) {
-                throw new RuntimeException("No box object found for " + type);
-            }
-            Matcher m = p.matcher(constructor);
-            boolean matches = m.matches();
-            if (!matches) {
-                throw new RuntimeException("Cannot work with that constructor: " + constructor);
-            }
-            clazzName = m.group(1);
-            param = m.group(2).split(",");
-            return this;
-        }
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/Utf8.java b/android/src/main/java/com/coremedia/iso/Utf8.java
deleted file mode 100755
index a30497e..0000000
--- a/android/src/main/java/com/coremedia/iso/Utf8.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.coremedia.iso;
-
-import java.io.UnsupportedEncodingException;
-
-/**
- * Converts <code>byte[]</code> -> <code>String</code> and vice versa.
- */
-public final class Utf8 {
-    public static byte[] convert(String s) {
-        try {
-            if (s != null) {
-                return s.getBytes("UTF-8");
-            } else {
-                return null;
-            }
-        } catch (UnsupportedEncodingException e) {
-            throw new Error(e);
-        }
-    }
-
-    public static String convert(byte[] b) {
-        try {
-            if (b != null) {
-                return new String(b, "UTF-8");
-            } else {
-                return null;
-            }
-        } catch (UnsupportedEncodingException e) {
-            throw new Error(e);
-        }
-    }
-
-    public static int utf8StringLengthInBytes(String utf8) {
-        try {
-            if (utf8 != null) {
-                return utf8.getBytes("UTF-8").length;
-            } else {
-                return 0;
-            }
-        } catch (UnsupportedEncodingException e) {
-            throw new RuntimeException();
-        }
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/AbstractMediaHeaderBox.java b/android/src/main/java/com/coremedia/iso/boxes/AbstractMediaHeaderBox.java
deleted file mode 100755
index cc141ae..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/AbstractMediaHeaderBox.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2011 Sebastian Annies, Hamburg, Germany
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractFullBox;
-
-/**
- * A common superclass for all MediaInformationHeaderBoxes. E.g.
- * VideoMediaHeaderBox, SoundMediaHeaderBox & HintMediaHeaderBox
- */
-public abstract class AbstractMediaHeaderBox extends AbstractFullBox {
-    protected AbstractMediaHeaderBox(String type) {
-        super(type);
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/Box.java b/android/src/main/java/com/coremedia/iso/boxes/Box.java
deleted file mode 100755
index f6ca302..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/Box.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.boxes.ContainerBox;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-
-/**
- * Defines basic interaction possibilities for any ISO box. Each box has a parent box and a type.
- */
-public interface Box {
-    ContainerBox getParent();
-
-    void setParent(ContainerBox parent);
-
-    long getSize();
-
-    /**
-     * The box's 4-cc type.
-     * @return the 4 character type of the box
-     */
-    String getType();
-
-    /**
-     * Writes the complete box - size | 4-cc | content - to the given <code>writableByteChannel</code>.
-     * @param writableByteChannel the box's sink
-     * @throws IOException in case of problems with the <code>Channel</code>
-     */
-    void getBox(WritableByteChannel writableByteChannel) throws IOException;
-
-    void parse(ReadableByteChannel readableByteChannel, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException;
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/ChunkOffsetBox.java b/android/src/main/java/com/coremedia/iso/boxes/ChunkOffsetBox.java
deleted file mode 100755
index 01f5ae4..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/ChunkOffsetBox.java
+++ /dev/null
@@ -1,21 +0,0 @@
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractFullBox;
-
-/**
- * Abstract Chunk Offset Box
- */
-public abstract class ChunkOffsetBox extends AbstractFullBox {
-
-    public ChunkOffsetBox(String type) {
-        super(type);
-    }
-
-    public abstract long[] getChunkOffsets();
-
-
-    public String toString() {
-        return this.getClass().getSimpleName() + "[entryCount=" + getChunkOffsets().length + "]";
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/CompositionTimeToSample.java b/android/src/main/java/com/coremedia/iso/boxes/CompositionTimeToSample.java
deleted file mode 100755
index 411bfe9..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/CompositionTimeToSample.java
+++ /dev/null
@@ -1,150 +0,0 @@
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * <pre>
- * aligned(8) class CompositionOffsetBox
- * extends FullBox(‘ctts’, version = 0, 0) {
- *  unsigned int(32) entry_count;
- *  int i;
- *  if (version==0) {
- *   for (i=0; i < entry_count; i++) {
- *    unsigned int(32) sample_count;
- *    unsigned int(32) sample_offset;
- *   }
- *  }
- *  else if (version == 1) {
- *   for (i=0; i < entry_count; i++) {
- *    unsigned int(32) sample_count;
- *    signed int(32) sample_offset;
- *   }
- *  }
- * }
- * </pre>
- * <p/>
- * This box provides the offset between decoding time and composition time.
- * In version 0 of this box the decoding time must be less than the composition time, and
- * the offsets are expressed as unsigned numbers such that
- * CT(n) = DT(n) + CTTS(n) where CTTS(n) is the (uncompressed) table entry for sample n.
- * <p/>
- * In version 1 of this box, the composition timeline and the decoding timeline are
- * still derived from each other, but the offsets are signed.
- * It is recommended that for the computed composition timestamps, there is
- * exactly one with the value 0 (zero).
- */
-public class CompositionTimeToSample extends AbstractFullBox {
-    public static final String TYPE = "ctts";
-
-    List<Entry> entries = Collections.emptyList();
-
-    public CompositionTimeToSample() {
-        super(TYPE);
-    }
-
-    protected long getContentSize() {
-        return 8 + 8 * entries.size();
-    }
-
-    public List<Entry> getEntries() {
-        return entries;
-    }
-
-    public void setEntries(List<Entry> entries) {
-        this.entries = entries;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        int numberOfEntries = l2i(IsoTypeReader.readUInt32(content));
-        entries = new ArrayList<Entry>(numberOfEntries);
-        for (int i = 0; i < numberOfEntries; i++) {
-            Entry e = new Entry(l2i(IsoTypeReader.readUInt32(content)), content.getInt());
-            entries.add(e);
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, entries.size());
-
-        for (Entry entry : entries) {
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getCount());
-            byteBuffer.putInt(entry.getOffset());
-        }
-
-    }
-
-
-    public static class Entry {
-        int count;
-        int offset;
-
-        public Entry(int count, int offset) {
-            this.count = count;
-            this.offset = offset;
-        }
-
-        public int getCount() {
-            return count;
-        }
-
-        public int getOffset() {
-            return offset;
-        }
-
-        public void setCount(int count) {
-            this.count = count;
-        }
-
-        public void setOffset(int offset) {
-            this.offset = offset;
-        }
-
-        @Override
-        public String toString() {
-            return "Entry{" +
-                    "count=" + count +
-                    ", offset=" + offset +
-                    '}';
-        }
-    }
-
-
-    /**
-     * Decompresses the list of entries and returns the list of composition times.
-     *
-     * @return decoding time per sample
-     */
-    public static int[] blowupCompositionTimes(List<CompositionTimeToSample.Entry> entries) {
-        long numOfSamples = 0;
-        for (CompositionTimeToSample.Entry entry : entries) {
-            numOfSamples += entry.getCount();
-        }
-        assert numOfSamples <= Integer.MAX_VALUE;
-        int[] decodingTime = new int[(int) numOfSamples];
-
-        int current = 0;
-
-
-        for (CompositionTimeToSample.Entry entry : entries) {
-            for (int i = 0; i < entry.getCount(); i++) {
-                decodingTime[current++] = entry.getOffset();
-            }
-        }
-
-        return decodingTime;
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/ContainerBox.java b/android/src/main/java/com/coremedia/iso/boxes/ContainerBox.java
deleted file mode 100755
index a016374..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/ContainerBox.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoFile;
-
-import java.util.List;
-
-/**
- * Interface for all ISO boxes that may contain other boxes.
- */
-public interface ContainerBox extends Box {
-
-    /**
-     * Gets all child boxes. May not return <code>null</code>.
-     *
-     * @return an array of boxes, empty array in case of no children.
-     */
-    List<Box> getBoxes();
-
-    /**
-     * Sets all boxes and removes all previous child boxes.
-     * @param boxes the new list of children
-     */
-    void setBoxes(List<Box> boxes);
-
-    /**
-     * Gets all child boxes of the given type. May not return <code>null</code>.
-     *
-     * @param clazz child box's type
-     * @return an array of boxes, empty array in case of no children.
-     */
-    <T extends Box> List<T> getBoxes(Class<T> clazz);
-
-    /**
-     * Gets all child boxes of the given type. May not return <code>null</code>.
-     *
-     * @param clazz     child box's type
-     * @param recursive step down the tree
-     * @return an array of boxes, empty array in case of no children.
-     */
-    <T extends Box> List<T> getBoxes(Class<T> clazz, boolean recursive);
-
-    /**
-     * Gets the parent box. May be <code>null</code> in case of the
-     * {@link com.coremedia.iso.IsoFile} itself.
-     *
-     * @return a <code>ContainerBox</code> that contains <code>this</code>
-     */
-    ContainerBox getParent();
-
-    /**
-     * Returns the number of bytes from the start of the box to start of the first child.
-     *
-     * @return offset of first child from box start
-     */
-    long getNumOfBytesToFirstChild();
-
-    IsoFile getIsoFile();
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/DataEntryUrlBox.java b/android/src/main/java/com/coremedia/iso/boxes/DataEntryUrlBox.java
deleted file mode 100755
index b58608d..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/DataEntryUrlBox.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * Only used within the DataReferenceBox. Find more information there.
- *
- * @see com.coremedia.iso.boxes.DataReferenceBox
- */
-public class DataEntryUrlBox extends AbstractFullBox {
-    public static final String TYPE = "url ";
-
-    public DataEntryUrlBox() {
-        super(TYPE);
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-    }
-
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-    }
-
-    protected long getContentSize() {
-        return 4;
-    }
-
-    public String toString() {
-        return "DataEntryUrlBox[]";
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/DataInformationBox.java b/android/src/main/java/com/coremedia/iso/boxes/DataInformationBox.java
deleted file mode 100755
index 7f058eb..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/DataInformationBox.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractContainerBox;
-
-/**
- * <code>
- * Box Type: 'dinf'<br>
- * Container: {@link com.coremedia.iso.boxes.MediaInformationBox} ('minf')<br>
- * Mandatory: Yes<br>
- * Quantity: Exactly one<br><br></code>
- * The data information box contains objects that declare the location of the media information in a track.
- */
-public class DataInformationBox extends AbstractContainerBox {
-    public static final String TYPE = "dinf";
-
-    public DataInformationBox() {
-        super(TYPE);
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/DataReferenceBox.java b/android/src/main/java/com/coremedia/iso/boxes/DataReferenceBox.java
deleted file mode 100755
index 8156d3f..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/DataReferenceBox.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.FullContainerBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * The data reference object contains a table of data references (normally URLs) that declare the location(s) of
- * the media data used within the presentation. The data reference index in the sample description ties entries in
- * this table to the samples in the track. A track may be split over several sources in this way.
- * If the flag is set indicating that the data is in the same file as this box, then no string (not even an empty one)
- * shall be supplied in the entry field.
- * The DataEntryBox within the DataReferenceBox shall be either a DataEntryUrnBox or a DataEntryUrlBox.
- *
- * @see com.coremedia.iso.boxes.DataEntryUrlBox
- * @see com.coremedia.iso.boxes.DataEntryUrnBox
- */
-public class DataReferenceBox extends FullContainerBox {
-
-    public static final String TYPE = "dref";
-
-    public DataReferenceBox() {
-        super(TYPE);
-
-    }
-
-    @Override
-    protected long getContentSize() {
-        return super.getContentSize() + 4;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        content.get(new byte[4]); // basically a skip of 4 bytes signaling the number of child boxes
-        parseChildBoxes(content);
-    }
-
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, getBoxes().size());
-        writeChildBoxes(byteBuffer);
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/FileTypeBox.java b/android/src/main/java/com/coremedia/iso/boxes/FileTypeBox.java
deleted file mode 100755
index e6eed20..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/FileTypeBox.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractBox;
-import com.googlecode.mp4parser.annotations.DoNotParseDetail;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * This box identifies the specifications to which this file complies. <br>
- * Each brand is a printable four-character code, registered with ISO, that
- * identifies a precise specification.
- */
-public class FileTypeBox extends AbstractBox {
-    public static final String TYPE = "ftyp";
-
-    private String majorBrand;
-    private long minorVersion;
-    private List<String> compatibleBrands = Collections.emptyList();
-
-    public FileTypeBox() {
-        super(TYPE);
-    }
-
-    public FileTypeBox(String majorBrand, long minorVersion, List<String> compatibleBrands) {
-        super(TYPE);
-        this.majorBrand = majorBrand;
-        this.minorVersion = minorVersion;
-        this.compatibleBrands = compatibleBrands;
-    }
-
-    protected long getContentSize() {
-        return 8 + compatibleBrands.size() * 4;
-
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        majorBrand = IsoTypeReader.read4cc(content);
-        minorVersion = IsoTypeReader.readUInt32(content);
-        int compatibleBrandsCount = content.remaining() / 4;
-        compatibleBrands = new LinkedList<String>();
-        for (int i = 0; i < compatibleBrandsCount; i++) {
-            compatibleBrands.add(IsoTypeReader.read4cc(content));
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        byteBuffer.put(IsoFile.fourCCtoBytes(majorBrand));
-        IsoTypeWriter.writeUInt32(byteBuffer, minorVersion);
-        for (String compatibleBrand : compatibleBrands) {
-            byteBuffer.put(IsoFile.fourCCtoBytes(compatibleBrand));
-        }
-
-    }
-
-    /**
-     * Gets the brand identifier.
-     *
-     * @return the brand identifier
-     */
-    public String getMajorBrand() {
-        return majorBrand;
-    }
-
-    /**
-     * Sets the major brand of the file used to determine an appropriate reader.
-     *
-     * @param majorBrand the new major brand
-     */
-    public void setMajorBrand(String majorBrand) {
-        this.majorBrand = majorBrand;
-    }
-
-    /**
-     * Sets the "informative integer for the minor version of the major brand".
-     *
-     * @param minorVersion the version number of the major brand
-     */
-    public void setMinorVersion(int minorVersion) {
-        this.minorVersion = minorVersion;
-    }
-
-    /**
-     * Gets an informative integer for the minor version of the major brand.
-     *
-     * @return an informative integer
-     * @see FileTypeBox#getMajorBrand()
-     */
-    public long getMinorVersion() {
-        return minorVersion;
-    }
-
-    /**
-     * Gets an array of 4-cc brands.
-     *
-     * @return the compatible brands
-     */
-    public List<String> getCompatibleBrands() {
-        return compatibleBrands;
-    }
-
-    public void setCompatibleBrands(List<String> compatibleBrands) {
-        this.compatibleBrands = compatibleBrands;
-    }
-
-    @DoNotParseDetail
-    public String toString() {
-        StringBuilder result = new StringBuilder();
-        result.append("FileTypeBox[");
-        result.append("majorBrand=").append(getMajorBrand());
-        result.append(";");
-        result.append("minorVersion=").append(getMinorVersion());
-        for (String compatibleBrand : compatibleBrands) {
-            result.append(";");
-            result.append("compatibleBrand=").append(compatibleBrand);
-        }
-        result.append("]");
-        return result.toString();
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/FullBox.java b/android/src/main/java/com/coremedia/iso/boxes/FullBox.java
deleted file mode 100755
index 1515d76..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/FullBox.java
+++ /dev/null
@@ -1,17 +0,0 @@
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.boxes.Box;
-
-/**
- * The <code>FullBox</code> contains all getters and setters specific
- * to a so-called full box according to the ISO/IEC 14496/12 specification.
- */
-public interface FullBox extends Box {
-    int getVersion();
-
-    void setVersion(int version);
-
-    int getFlags();
-
-    void setFlags(int flags);
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/HandlerBox.java b/android/src/main/java/com/coremedia/iso/boxes/HandlerBox.java
deleted file mode 100755
index 01dcaca..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/HandlerBox.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.Utf8;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * This box within a Media Box declares the process by which the media-data in the track is presented,
- * and thus, the nature of the media in a track.
- * This Box when present in a Meta Box, declares the structure or format of the 'meta' box contents.
- * See ISO/IEC 14496-12 for details.
- *
- * @see MetaBox
- * @see MediaBox
- */
-public class HandlerBox extends AbstractFullBox {
-    public static final String TYPE = "hdlr";
-    public static final Map<String, String> readableTypes;
-
-    static {
-        HashMap<String, String> hm = new HashMap<String, String>();
-        hm.put("odsm", "ObjectDescriptorStream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("crsm", "ClockReferenceStream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("sdsm", "SceneDescriptionStream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("m7sm", "MPEG7Stream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("ocsm", "ObjectContentInfoStream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("ipsm", "IPMP Stream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("mjsm", "MPEG-J Stream - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-        hm.put("mdir", "Apple Meta Data iTunes Reader");
-        hm.put("mp7b", "MPEG-7 binary XML");
-        hm.put("mp7t", "MPEG-7 XML");
-        hm.put("vide", "Video Track");
-        hm.put("soun", "Sound Track");
-        hm.put("hint", "Hint Track");
-        hm.put("appl", "Apple specific");
-        hm.put("meta", "Timed Metadata track - defined in ISO/IEC JTC1/SC29/WG11 - CODING OF MOVING PICTURES AND AUDIO");
-
-        readableTypes = Collections.unmodifiableMap(hm);
-
-    }
-
-    private String handlerType;
-    private String name = null;
-    private long a, b, c;
-    private boolean zeroTerm = true;
-
-    private long shouldBeZeroButAppleWritesHereSomeValue;
-
-    public HandlerBox() {
-        super(TYPE);
-    }
-
-    public String getHandlerType() {
-        return handlerType;
-    }
-
-    /**
-     * You are required to add a '\0' string termination by yourself.
-     *
-     * @param name the new human readable name
-     */
-    public void setName(String name) {
-        this.name = name;
-    }
-
-    public void setHandlerType(String handlerType) {
-        this.handlerType = handlerType;
-    }
-
-    public String getName() {
-        return name;
-    }
-
-    public String getHumanReadableTrackType() {
-        return readableTypes.get(handlerType) != null ? readableTypes.get(handlerType) : "Unknown Handler Type";
-    }
-
-    protected long getContentSize() {
-        if (zeroTerm) {
-            return 25 + Utf8.utf8StringLengthInBytes(name);
-        } else {
-            return 24 + Utf8.utf8StringLengthInBytes(name);
-        }
-
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        shouldBeZeroButAppleWritesHereSomeValue = IsoTypeReader.readUInt32(content);
-        handlerType = IsoTypeReader.read4cc(content);
-        a = IsoTypeReader.readUInt32(content);
-        b = IsoTypeReader.readUInt32(content);
-        c = IsoTypeReader.readUInt32(content);
-        if (content.remaining() > 0) {
-            name = IsoTypeReader.readString(content, content.remaining());
-            if (name.endsWith("\0")) {
-                name = name.substring(0, name.length() - 1);
-                zeroTerm = true;
-            } else {
-                zeroTerm = false;
-            }
-        } else {
-            zeroTerm = false; //No string at all, not even zero term char
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, shouldBeZeroButAppleWritesHereSomeValue);
-        byteBuffer.put(IsoFile.fourCCtoBytes(handlerType));
-        IsoTypeWriter.writeUInt32(byteBuffer, a);
-        IsoTypeWriter.writeUInt32(byteBuffer, b);
-        IsoTypeWriter.writeUInt32(byteBuffer, c);
-        if (name != null) {
-            byteBuffer.put(Utf8.convert(name));
-        }
-        if (zeroTerm) {
-            byteBuffer.put((byte) 0);
-        }
-    }
-
-    public String toString() {
-        return "HandlerBox[handlerType=" + getHandlerType() + ";name=" + getName() + "]";
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/MediaBox.java b/android/src/main/java/com/coremedia/iso/boxes/MediaBox.java
deleted file mode 100755
index fa5642c..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/MediaBox.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.googlecode.mp4parser.AbstractContainerBox;
-
-/**
- * The media declaration container contains all the objects that declare information about the media data within a
- * track.
- */
-public class MediaBox extends AbstractContainerBox {
-    public static final String TYPE = "mdia";
-
-    public MediaBox() {
-        super(TYPE);
-    }
-
-    public MediaInformationBox getMediaInformationBox() {
-        for (Box box : boxes) {
-            if (box instanceof MediaInformationBox) {
-                return (MediaInformationBox) box;
-            }
-        }
-        return null;
-    }
-
-    public MediaHeaderBox getMediaHeaderBox() {
-        for (Box box : boxes) {
-            if (box instanceof MediaHeaderBox) {
-                return (MediaHeaderBox) box;
-            }
-        }
-        return null;
-    }
-
-    public HandlerBox getHandlerBox() {
-        for (Box box : boxes) {
-            if (box instanceof HandlerBox) {
-                return (HandlerBox) box;
-            }
-        }
-        return null;
-    }
-
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/MediaHeaderBox.java b/android/src/main/java/com/coremedia/iso/boxes/MediaHeaderBox.java
deleted file mode 100755
index 2720856..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/MediaHeaderBox.java
+++ /dev/null
@@ -1,149 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-import com.googlecode.mp4parser.authoring.DateHelper;
-
-import java.nio.ByteBuffer;
-import java.util.Date;
-
-/**
- * This box defines overall information which is media-independent, and relevant to the entire presentation
- * considered as a whole.
- */
-public class MediaHeaderBox extends AbstractFullBox {
-    public static final String TYPE = "mdhd";
-
-
-    private Date creationTime;
-    private Date modificationTime;
-    private long timescale;
-    private long duration;
-    private String language;
-
-    public MediaHeaderBox() {
-        super(TYPE);
-    }
-
-    public Date getCreationTime() {
-        return creationTime;
-    }
-
-    public Date getModificationTime() {
-        return modificationTime;
-    }
-
-    public long getTimescale() {
-        return timescale;
-    }
-
-    public long getDuration() {
-        return duration;
-    }
-
-    public String getLanguage() {
-        return language;
-    }
-
-    protected long getContentSize() {
-        long contentSize = 4;
-        if (getVersion() == 1) {
-            contentSize += 8 + 8 + 4 + 8;
-        } else {
-            contentSize += 4 + 4 + 4 + 4;
-        }
-        contentSize += 2;
-        contentSize += 2;
-        return contentSize;
-
-    }
-
-    public void setCreationTime(Date creationTime) {
-        this.creationTime = creationTime;
-    }
-
-    public void setModificationTime(Date modificationTime) {
-        this.modificationTime = modificationTime;
-    }
-
-    public void setTimescale(long timescale) {
-        this.timescale = timescale;
-    }
-
-    public void setDuration(long duration) {
-        this.duration = duration;
-    }
-
-    public void setLanguage(String language) {
-        this.language = language;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        if (getVersion() == 1) {
-            creationTime = DateHelper.convert(IsoTypeReader.readUInt64(content));
-            modificationTime = DateHelper.convert(IsoTypeReader.readUInt64(content));
-            timescale = IsoTypeReader.readUInt32(content);
-            duration = IsoTypeReader.readUInt64(content);
-        } else {
-            creationTime = DateHelper.convert(IsoTypeReader.readUInt32(content));
-            modificationTime = DateHelper.convert(IsoTypeReader.readUInt32(content));
-            timescale = IsoTypeReader.readUInt32(content);
-            duration = IsoTypeReader.readUInt32(content);
-        }
-        language = IsoTypeReader.readIso639(content);
-        IsoTypeReader.readUInt16(content);
-    }
-
-
-    public String toString() {
-        StringBuilder result = new StringBuilder();
-        result.append("MediaHeaderBox[");
-        result.append("creationTime=").append(getCreationTime());
-        result.append(";");
-        result.append("modificationTime=").append(getModificationTime());
-        result.append(";");
-        result.append("timescale=").append(getTimescale());
-        result.append(";");
-        result.append("duration=").append(getDuration());
-        result.append(";");
-        result.append("language=").append(getLanguage());
-        result.append("]");
-        return result.toString();
-    }
-
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        if (getVersion() == 1) {
-            IsoTypeWriter.writeUInt64(byteBuffer, DateHelper.convert(creationTime));
-            IsoTypeWriter.writeUInt64(byteBuffer, DateHelper.convert(modificationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, timescale);
-            IsoTypeWriter.writeUInt64(byteBuffer, duration);
-        } else {
-            IsoTypeWriter.writeUInt32(byteBuffer, DateHelper.convert(creationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, DateHelper.convert(modificationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, timescale);
-            IsoTypeWriter.writeUInt32(byteBuffer, duration);
-        }
-        IsoTypeWriter.writeIso639(byteBuffer, language);
-        IsoTypeWriter.writeUInt16(byteBuffer, 0);
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/MediaInformationBox.java b/android/src/main/java/com/coremedia/iso/boxes/MediaInformationBox.java
deleted file mode 100755
index ed25051..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/MediaInformationBox.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractContainerBox;
-
-/**
- * This box contains all the objects that declare characteristic information of the media in the track.
- */
-public class MediaInformationBox extends AbstractContainerBox {
-    public static final String TYPE = "minf";
-
-    public MediaInformationBox() {
-        super(TYPE);
-    }
-
-    public SampleTableBox getSampleTableBox() {
-        for (Box box : boxes) {
-            if (box instanceof SampleTableBox) {
-                return (SampleTableBox) box;
-            }
-        }
-        return null;
-    }
-
-    public AbstractMediaHeaderBox getMediaHeaderBox() {
-        for (Box box : boxes) {
-            if (box instanceof AbstractMediaHeaderBox) {
-                return (AbstractMediaHeaderBox) box;
-            }
-        }
-        return null;
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/MovieBox.java b/android/src/main/java/com/coremedia/iso/boxes/MovieBox.java
deleted file mode 100755
index 3aff7d8..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/MovieBox.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.googlecode.mp4parser.AbstractBox;
-import com.googlecode.mp4parser.AbstractContainerBox;
-
-import java.util.List;
-
-/**
- * The metadata for a presentation is stored in the single Movie Box which occurs at the top-level of a file.
- * Normally this box is close to the beginning or end of the file, though this is not required.
- */
-public class MovieBox extends AbstractContainerBox {
-    public static final String TYPE = "moov";
-
-    public MovieBox() {
-        super(TYPE);
-    }
-
-    public int getTrackCount() {
-        return getBoxes(TrackBox.class).size();
-    }
-
-
-    /**
-     * Returns the track numbers associated with this <code>MovieBox</code>.
-     *
-     * @return the tracknumbers (IDs) of the tracks in their order of appearance in the file
-     */
-    public long[] getTrackNumbers() {
-
-        List<TrackBox> trackBoxes = this.getBoxes(TrackBox.class);
-        long[] trackNumbers = new long[trackBoxes.size()];
-        for (int trackCounter = 0; trackCounter < trackBoxes.size(); trackCounter++) {
-            AbstractBox trackBoxe = trackBoxes.get(trackCounter);
-            TrackBox trackBox = (TrackBox) trackBoxe;
-            trackNumbers[trackCounter] = trackBox.getTrackHeaderBox().getTrackId();
-        }
-        return trackNumbers;
-    }
-
-    public MovieHeaderBox getMovieHeaderBox() {
-        for (Box box : boxes) {
-            if (box instanceof MovieHeaderBox) {
-                return (MovieHeaderBox) box;
-            }
-        }
-        return null;
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/MovieHeaderBox.java b/android/src/main/java/com/coremedia/iso/boxes/MovieHeaderBox.java
deleted file mode 100755
index 604ede9..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/MovieHeaderBox.java
+++ /dev/null
@@ -1,273 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-import com.googlecode.mp4parser.authoring.DateHelper;
-import com.googlecode.mp4parser.util.Matrix;
-
-import java.nio.ByteBuffer;
-import java.util.Date;
-
-/**
- * <code>
- * Box Type: 'mvhd'<br>
- * Container: {@link MovieBox} ('moov')<br>
- * Mandatory: Yes<br>
- * Quantity: Exactly one<br><br>
- * </code>
- * This box defines overall information which is media-independent, and relevant to the entire presentation
- * considered as a whole.
- */
-public class MovieHeaderBox extends AbstractFullBox {
-    private Date creationTime;
-    private Date modificationTime;
-    private long timescale;
-    private long duration;
-    private double rate = 1.0;
-    private float volume = 1.0f;
-    private Matrix matrix = Matrix.ROTATE_0;
-    private long nextTrackId;
-
-    private int previewTime;
-    private int previewDuration;
-    private int posterTime;
-    private int selectionTime;
-    private int selectionDuration;
-    private int currentTime;
-
-
-    public static final String TYPE = "mvhd";
-
-    public MovieHeaderBox() {
-        super(TYPE);
-    }
-
-    public Date getCreationTime() {
-        return creationTime;
-    }
-
-    public Date getModificationTime() {
-        return modificationTime;
-    }
-
-    public long getTimescale() {
-        return timescale;
-    }
-
-    public long getDuration() {
-        return duration;
-    }
-
-    public double getRate() {
-        return rate;
-    }
-
-    public float getVolume() {
-        return volume;
-    }
-
-    public Matrix getMatrix() {
-        return matrix;
-    }
-
-    public long getNextTrackId() {
-        return nextTrackId;
-    }
-
-    protected long getContentSize() {
-        long contentSize = 4;
-        if (getVersion() == 1) {
-            contentSize += 28;
-        } else {
-            contentSize += 16;
-        }
-        contentSize += 80;
-        return contentSize;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        if (getVersion() == 1) {
-            creationTime = DateHelper.convert(IsoTypeReader.readUInt64(content));
-            modificationTime = DateHelper.convert(IsoTypeReader.readUInt64(content));
-            timescale = IsoTypeReader.readUInt32(content);
-            duration = IsoTypeReader.readUInt64(content);
-        } else {
-            creationTime = DateHelper.convert(IsoTypeReader.readUInt32(content));
-            modificationTime = DateHelper.convert(IsoTypeReader.readUInt32(content));
-            timescale = IsoTypeReader.readUInt32(content);
-            duration = IsoTypeReader.readUInt32(content);
-        }
-        rate = IsoTypeReader.readFixedPoint1616(content);
-        volume = IsoTypeReader.readFixedPoint88(content);
-        IsoTypeReader.readUInt16(content);
-        IsoTypeReader.readUInt32(content);
-        IsoTypeReader.readUInt32(content);
-
-        matrix = Matrix.fromByteBuffer(content);
-
-        previewTime = content.getInt();
-        previewDuration = content.getInt();
-        posterTime = content.getInt();
-        selectionTime = content.getInt();
-        selectionDuration = content.getInt();
-        currentTime = content.getInt();
-
-        nextTrackId = IsoTypeReader.readUInt32(content);
-
-    }
-
-    public String toString() {
-        StringBuilder result = new StringBuilder();
-        result.append("MovieHeaderBox[");
-        result.append("creationTime=").append(getCreationTime());
-        result.append(";");
-        result.append("modificationTime=").append(getModificationTime());
-        result.append(";");
-        result.append("timescale=").append(getTimescale());
-        result.append(";");
-        result.append("duration=").append(getDuration());
-        result.append(";");
-        result.append("rate=").append(getRate());
-        result.append(";");
-        result.append("volume=").append(getVolume());
-        result.append(";");
-        result.append("matrix=").append(matrix);
-        result.append(";");
-        result.append("nextTrackId=").append(getNextTrackId());
-        result.append("]");
-        return result.toString();
-    }
-
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        if (getVersion() == 1) {
-            IsoTypeWriter.writeUInt64(byteBuffer, DateHelper.convert(creationTime));
-            IsoTypeWriter.writeUInt64(byteBuffer, DateHelper.convert(modificationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, timescale);
-            IsoTypeWriter.writeUInt64(byteBuffer, duration);
-        } else {
-            IsoTypeWriter.writeUInt32(byteBuffer, DateHelper.convert(creationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, DateHelper.convert(modificationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, timescale);
-            IsoTypeWriter.writeUInt32(byteBuffer, duration);
-        }
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, rate);
-        IsoTypeWriter.writeFixedPont88(byteBuffer, volume);
-        IsoTypeWriter.writeUInt16(byteBuffer, 0);
-        IsoTypeWriter.writeUInt32(byteBuffer, 0);
-        IsoTypeWriter.writeUInt32(byteBuffer, 0);
-
-        matrix.getContent(byteBuffer);
-
-        byteBuffer.putInt(previewTime);
-        byteBuffer.putInt(previewDuration);
-        byteBuffer.putInt(posterTime);
-        byteBuffer.putInt(selectionTime);
-        byteBuffer.putInt(selectionDuration);
-        byteBuffer.putInt(currentTime);
-
-        IsoTypeWriter.writeUInt32(byteBuffer, nextTrackId);
-    }
-
-
-    public void setCreationTime(Date creationTime) {
-        this.creationTime = creationTime;
-    }
-
-    public void setModificationTime(Date modificationTime) {
-        this.modificationTime = modificationTime;
-    }
-
-    public void setTimescale(long timescale) {
-        this.timescale = timescale;
-    }
-
-    public void setDuration(long duration) {
-        this.duration = duration;
-    }
-
-    public void setRate(double rate) {
-        this.rate = rate;
-    }
-
-    public void setVolume(float volume) {
-        this.volume = volume;
-    }
-
-    public void setMatrix(Matrix matrix) {
-        this.matrix = matrix;
-    }
-
-    public void setNextTrackId(long nextTrackId) {
-        this.nextTrackId = nextTrackId;
-    }
-
-    public int getPreviewTime() {
-        return previewTime;
-    }
-
-    public void setPreviewTime(int previewTime) {
-        this.previewTime = previewTime;
-    }
-
-    public int getPreviewDuration() {
-        return previewDuration;
-    }
-
-    public void setPreviewDuration(int previewDuration) {
-        this.previewDuration = previewDuration;
-    }
-
-    public int getPosterTime() {
-        return posterTime;
-    }
-
-    public void setPosterTime(int posterTime) {
-        this.posterTime = posterTime;
-    }
-
-    public int getSelectionTime() {
-        return selectionTime;
-    }
-
-    public void setSelectionTime(int selectionTime) {
-        this.selectionTime = selectionTime;
-    }
-
-    public int getSelectionDuration() {
-        return selectionDuration;
-    }
-
-    public void setSelectionDuration(int selectionDuration) {
-        this.selectionDuration = selectionDuration;
-    }
-
-    public int getCurrentTime() {
-        return currentTime;
-    }
-
-    public void setCurrentTime(int currentTime) {
-        this.currentTime = currentTime;
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SampleDependencyTypeBox.java b/android/src/main/java/com/coremedia/iso/boxes/SampleDependencyTypeBox.java
deleted file mode 100755
index bb38d8c..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SampleDependencyTypeBox.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright 2009 castLabs GmbH, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * aligned(8) class SampleDependencyTypeBox
- * extends FullBox('sdtp', version = 0, 0) {
- * for (i=0; i < sample_count; i++){
- * unsigned int(2) reserved = 0;
- * unsigned int(2) sample_depends_on;
- * unsigned int(2) sample_is_depended_on;
- * unsigned int(2) sample_has_redundancy;
- * }
- * }
- */
-public class SampleDependencyTypeBox extends AbstractFullBox {
-    public static final String TYPE = "sdtp";
-
-    private List<Entry> entries = new ArrayList<Entry>();
-
-    public static class Entry {
-
-        public Entry(int value) {
-            this.value = value;
-        }
-
-        private int value;
-
-
-        public int getReserved() {
-            return (value >> 6) & 0x03;
-        }
-
-        public void setReserved(int res) {
-            value = (res & 0x03) << 6 | value & 0x3f;
-        }
-
-        public int getSampleDependsOn() {
-            return (value >> 4) & 0x03;
-        }
-
-        public void setSampleDependsOn(int sdo) {
-            value = (sdo & 0x03) << 4 | value & 0xcf;
-        }
-
-        public int getSampleIsDependentOn() {
-            return (value >> 2) & 0x03;
-        }
-
-        public void setSampleIsDependentOn(int sido) {
-            value = (sido & 0x03) << 2 | value & 0xf3;
-        }
-
-        public int getSampleHasRedundancy() {
-            return value & 0x03;
-        }
-
-        public void setSampleHasRedundancy(int shr) {
-            value = shr & 0x03 | value & 0xfc;
-        }
-
-        @Override
-        public String toString() {
-            return "Entry{" +
-                    "reserved=" + getReserved() +
-                    ", sampleDependsOn=" + getSampleDependsOn() +
-                    ", sampleIsDependentOn=" + getSampleIsDependentOn() +
-                    ", sampleHasRedundancy=" + getSampleHasRedundancy() +
-                    '}';
-        }
-    }
-
-    public SampleDependencyTypeBox() {
-        super(TYPE);
-    }
-
-    @Override
-    protected long getContentSize() {
-        return 4 + entries.size();
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        for (Entry entry : entries) {
-            IsoTypeWriter.writeUInt8(byteBuffer, entry.value);
-        }
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        while (content.remaining() > 0) {
-            entries.add(new Entry(IsoTypeReader.readUInt8(content)));
-        }
-    }
-
-    public List<Entry> getEntries() {
-        return entries;
-    }
-
-    public void setEntries(List<Entry> entries) {
-        this.entries = entries;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("SampleDependencyTypeBox");
-        sb.append("{entries=").append(entries);
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SampleDescriptionBox.java b/android/src/main/java/com/coremedia/iso/boxes/SampleDescriptionBox.java
deleted file mode 100755
index 662fa99..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SampleDescriptionBox.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.boxes.sampleentry.SampleEntry;
-import com.googlecode.mp4parser.FullContainerBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * The sample description table gives detailed information about the coding type used, and any initialization
- * information needed for that coding. <br>
- * The information stored in the sample description box after the entry-count is both track-type specific as
- * documented here, and can also have variants within a track type (e.g. different codings may use different
- * specific information after some common fields, even within a video track).<br>
- * For video tracks, a VisualSampleEntry is used; for audio tracks, an AudioSampleEntry. Hint tracks use an
- * entry format specific to their protocol, with an appropriate name. Timed Text tracks use a TextSampleEntry
- * For hint tracks, the sample description contains appropriate declarative data for the streaming protocol being
- * used, and the format of the hint track. The definition of the sample description is specific to the protocol.
- * Multiple descriptions may be used within a track.<br>
- * The 'protocol' and 'codingname' fields are registered identifiers that uniquely identify the streaming protocol or
- * compression format decoder to be used. A given protocol or codingname may have optional or required
- * extensions to the sample description (e.g. codec initialization parameters). All such extensions shall be within
- * boxes; these boxes occur after the required fields. Unrecognized boxes shall be ignored.
- * <br>
- * Defined in ISO/IEC 14496-12
- *
- * @see com.coremedia.iso.boxes.sampleentry.VisualSampleEntry
- * @see com.coremedia.iso.boxes.sampleentry.TextSampleEntry
- * @see com.coremedia.iso.boxes.sampleentry.AudioSampleEntry
- */
-public class SampleDescriptionBox extends FullContainerBox {
-    public static final String TYPE = "stsd";
-
-    public SampleDescriptionBox() {
-        super(TYPE);
-    }
-
-    @Override
-    protected long getContentSize() {
-        return super.getContentSize() + 4;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        content.get(new byte[4]);
-        parseChildBoxes(content);
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, boxes.size());
-        writeChildBoxes(byteBuffer);
-    }
-
-    public SampleEntry getSampleEntry() {
-        for (Box box : boxes) {
-            if (box instanceof SampleEntry) {
-                return (SampleEntry) box;
-            }
-        }
-        return null;
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SampleSizeBox.java b/android/src/main/java/com/coremedia/iso/boxes/SampleSizeBox.java
deleted file mode 100755
index 3bc1df0..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SampleSizeBox.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * This box containes the sample count and a table giving the size in bytes of each sample.
- * Defined in ISO/IEC 14496-12.
- */
-public class SampleSizeBox extends AbstractFullBox {
-    private long sampleSize;
-    private long[] sampleSizes = new long[0];
-    public static final String TYPE = "stsz";
-    int sampleCount;
-
-    public SampleSizeBox() {
-        super(TYPE);
-    }
-
-    /**
-     * Returns the field sample size.
-     * If sampleSize > 0 every sample has the same size.
-     * If sampleSize == 0 the samples have different size as stated in the sampleSizes field.
-     *
-     * @return the sampleSize field
-     */
-    public long getSampleSize() {
-        return sampleSize;
-    }
-
-    public void setSampleSize(long sampleSize) {
-        this.sampleSize = sampleSize;
-    }
-
-
-    public long getSampleSizeAtIndex(int index) {
-        if (sampleSize > 0) {
-            return sampleSize;
-        } else {
-            return sampleSizes[index];
-        }
-    }
-
-    public long getSampleCount() {
-        if (sampleSize > 0) {
-            return sampleCount;
-        } else {
-            return sampleSizes.length;
-        }
-
-    }
-
-    public long[] getSampleSizes() {
-        return sampleSizes;
-    }
-
-    public void setSampleSizes(long[] sampleSizes) {
-        this.sampleSizes = sampleSizes;
-    }
-
-    protected long getContentSize() {
-        return 12 + (sampleSize == 0 ? sampleSizes.length * 4 : 0);
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        sampleSize = IsoTypeReader.readUInt32(content);
-        sampleCount = l2i(IsoTypeReader.readUInt32(content));
-
-        if (sampleSize == 0) {
-            sampleSizes = new long[(int) sampleCount];
-
-            for (int i = 0; i < sampleCount; i++) {
-                sampleSizes[i] = IsoTypeReader.readUInt32(content);
-            }
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, sampleSize);
-
-        if (sampleSize == 0) {
-            IsoTypeWriter.writeUInt32(byteBuffer, sampleSizes.length);
-            for (long sampleSize1 : sampleSizes) {
-                IsoTypeWriter.writeUInt32(byteBuffer, sampleSize1);
-            }
-        } else {
-            IsoTypeWriter.writeUInt32(byteBuffer, sampleCount);
-        }
-
-    }
-
-    public String toString() {
-        return "SampleSizeBox[sampleSize=" + getSampleSize() + ";sampleCount=" + getSampleCount() + "]";
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SampleTableBox.java b/android/src/main/java/com/coremedia/iso/boxes/SampleTableBox.java
deleted file mode 100755
index 33968b3..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SampleTableBox.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractContainerBox;
-
-/**
- * The sample table contains all the time and data indexing of the media samples in a track. Using the tables
- * here, it is possible to locate samples in time, determine their type (e.g. I-frame or not), and determine their
- * size, container, and offset into that container.  <br>
- * If the track that contains the Sample Table Box references no data, then the Sample Table Box does not need
- * to contain any sub-boxes (this is not a very useful media track).                                          <br>
- * If the track that the Sample Table Box is contained in does reference data, then the following sub-boxes are
- * required: Sample Description, Sample Size, Sample To Chunk, and Chunk Offset. Further, the Sample
- * Description Box shall contain at least one entry. A Sample Description Box is required because it contains the
- * data reference index field which indicates which Data Reference Box to use to retrieve the media samples.
- * Without the Sample Description, it is not possible to determine where the media samples are stored. The Sync
- * Sample Box is optional. If the Sync Sample Box is not present, all samples are sync samples.<br>
- * Annex A provides a narrative description of random access using the structures defined in the Sample Table
- * Box.
- */
-public class SampleTableBox extends AbstractContainerBox {
-    public static final String TYPE = "stbl";
-
-    public SampleTableBox() {
-        super(TYPE);
-    }
-
-    public SampleDescriptionBox getSampleDescriptionBox() {
-        for (Box box : boxes) {
-            if (box instanceof SampleDescriptionBox) {
-                return (SampleDescriptionBox) box;
-            }
-        }
-        return null;
-    }
-
-    public SampleSizeBox getSampleSizeBox() {
-        for (Box box : boxes) {
-            if (box instanceof SampleSizeBox) {
-                return (SampleSizeBox) box;
-            }
-        }
-        return null;
-    }
-
-    public SampleToChunkBox getSampleToChunkBox() {
-        for (Box box : boxes) {
-            if (box instanceof SampleToChunkBox) {
-                return (SampleToChunkBox) box;
-            }
-        }
-        return null;
-    }
-
-    public ChunkOffsetBox getChunkOffsetBox() {
-        for (Box box : boxes) {
-            if (box instanceof ChunkOffsetBox) {
-                return (ChunkOffsetBox) box;
-            }
-        }
-        return null;
-    }
-
-    public void setChunkOffsetBox(ChunkOffsetBox b) {
-        for (int i = 0; i < boxes.size(); i++) {
-            Box box = boxes.get(i);
-            if (box instanceof ChunkOffsetBox) {
-                boxes.set(i, b);
-            }
-        }
-    }
-
-    public TimeToSampleBox getTimeToSampleBox() {
-        for (Box box : boxes) {
-            if (box instanceof TimeToSampleBox) {
-                return (TimeToSampleBox) box;
-            }
-        }
-        return null;
-    }
-
-    public SyncSampleBox getSyncSampleBox() {
-        for (Box box : boxes) {
-            if (box instanceof SyncSampleBox) {
-                return (SyncSampleBox) box;
-            }
-        }
-        return null;
-    }
-
-    public CompositionTimeToSample getCompositionTimeToSample() {
-        for (Box box : boxes) {
-            if (box instanceof CompositionTimeToSample) {
-                return (CompositionTimeToSample) box;
-            }
-        }
-        return null;
-    }
-
-    public SampleDependencyTypeBox getSampleDependencyTypeBox() {
-        for (Box box : boxes) {
-            if (box instanceof SampleDependencyTypeBox) {
-                return (SampleDependencyTypeBox) box;
-            }
-        }
-        return null;
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SampleToChunkBox.java b/android/src/main/java/com/coremedia/iso/boxes/SampleToChunkBox.java
deleted file mode 100755
index 593504d..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SampleToChunkBox.java
+++ /dev/null
@@ -1,156 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * Samples within the media data are grouped into chunks. Chunks can be of different sizes, and the
- * samples within a chunk can have different sizes. This table can be used to find the chunk that
- * contains a sample, its position, and the associated sample description. Defined in ISO/IEC 14496-12.
- */
-public class SampleToChunkBox extends AbstractFullBox {
-    List<Entry> entries = Collections.emptyList();
-
-    public static final String TYPE = "stsc";
-
-    public SampleToChunkBox() {
-        super(TYPE);
-    }
-
-    public List<Entry> getEntries() {
-        return entries;
-    }
-
-    public void setEntries(List<Entry> entries) {
-        this.entries = entries;
-    }
-
-    protected long getContentSize() {
-        return entries.size() * 12 + 8;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-
-        int entryCount = l2i(IsoTypeReader.readUInt32(content));
-        entries = new ArrayList<Entry>(entryCount);
-        for (int i = 0; i < entryCount; i++) {
-            entries.add(new Entry(
-                    IsoTypeReader.readUInt32(content),
-                    IsoTypeReader.readUInt32(content),
-                    IsoTypeReader.readUInt32(content)));
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, entries.size());
-        for (Entry entry : entries) {
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getFirstChunk());
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getSamplesPerChunk());
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getSampleDescriptionIndex());
-        }
-    }
-
-    public String toString() {
-        return "SampleToChunkBox[entryCount=" + entries.size() + "]";
-    }
-
-    /**
-     * Decompresses the list of entries and returns the number of samples per chunk for
-     * every single chunk.
-     *
-     * @param chunkCount overall number of chunks
-     * @return number of samples per chunk
-     */
-    public long[] blowup(int chunkCount) {
-        long[] numberOfSamples = new long[chunkCount];
-        int j = 0;
-        List<SampleToChunkBox.Entry> sampleToChunkEntries = new LinkedList<Entry>(entries);
-        Collections.reverse(sampleToChunkEntries);
-        Iterator<Entry> iterator = sampleToChunkEntries.iterator();
-        SampleToChunkBox.Entry currentEntry = iterator.next();
-
-        for (int i = numberOfSamples.length; i > 1; i--) {
-            numberOfSamples[i - 1] = currentEntry.getSamplesPerChunk();
-            if (i == currentEntry.getFirstChunk()) {
-                currentEntry = iterator.next();
-            }
-        }
-        numberOfSamples[0] = currentEntry.getSamplesPerChunk();
-        return numberOfSamples;
-    }
-
-    public static class Entry {
-        long firstChunk;
-        long samplesPerChunk;
-        long sampleDescriptionIndex;
-
-        public Entry(long firstChunk, long samplesPerChunk, long sampleDescriptionIndex) {
-            this.firstChunk = firstChunk;
-            this.samplesPerChunk = samplesPerChunk;
-            this.sampleDescriptionIndex = sampleDescriptionIndex;
-        }
-
-        public long getFirstChunk() {
-            return firstChunk;
-        }
-
-        public void setFirstChunk(long firstChunk) {
-            this.firstChunk = firstChunk;
-        }
-
-        public long getSamplesPerChunk() {
-            return samplesPerChunk;
-        }
-
-        public void setSamplesPerChunk(long samplesPerChunk) {
-            this.samplesPerChunk = samplesPerChunk;
-        }
-
-        public long getSampleDescriptionIndex() {
-            return sampleDescriptionIndex;
-        }
-
-        public void setSampleDescriptionIndex(long sampleDescriptionIndex) {
-            this.sampleDescriptionIndex = sampleDescriptionIndex;
-        }
-
-        @Override
-        public String toString() {
-            return "Entry{" +
-                    "firstChunk=" + firstChunk +
-                    ", samplesPerChunk=" + samplesPerChunk +
-                    ", sampleDescriptionIndex=" + sampleDescriptionIndex +
-                    '}';
-        }
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SoundMediaHeaderBox.java b/android/src/main/java/com/coremedia/iso/boxes/SoundMediaHeaderBox.java
deleted file mode 100755
index c5fb88d..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SoundMediaHeaderBox.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.nio.ByteBuffer;
-
-public class SoundMediaHeaderBox extends AbstractMediaHeaderBox {
-
-    public static final String TYPE = "smhd";
-    private float balance;
-
-    public SoundMediaHeaderBox() {
-        super(TYPE);
-    }
-
-    public float getBalance() {
-        return balance;
-    }
-
-    protected long getContentSize() {
-        return 8;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        balance = IsoTypeReader.readFixedPoint88(content);
-        IsoTypeReader.readUInt16(content);
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeFixedPont88(byteBuffer, balance);
-        IsoTypeWriter.writeUInt16(byteBuffer, 0);
-    }
-
-    public String toString() {
-        return "SoundMediaHeaderBox[balance=" + getBalance() + "]";
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/StaticChunkOffsetBox.java b/android/src/main/java/com/coremedia/iso/boxes/StaticChunkOffsetBox.java
deleted file mode 100755
index efcdd14..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/StaticChunkOffsetBox.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.nio.ByteBuffer;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * The chunk offset table gives the index of each chunk into the containing file. Defined in ISO/IEC 14496-12.
- */
-public class StaticChunkOffsetBox extends ChunkOffsetBox {
-    public static final String TYPE = "stco";
-
-    private long[] chunkOffsets = new long[0];
-
-    public StaticChunkOffsetBox() {
-        super(TYPE);
-    }
-
-    public long[] getChunkOffsets() {
-        return chunkOffsets;
-    }
-
-    protected long getContentSize() {
-        return 8 + chunkOffsets.length * 4;
-    }
-
-    public void setChunkOffsets(long[] chunkOffsets) {
-        this.chunkOffsets = chunkOffsets;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        int entryCount = l2i(IsoTypeReader.readUInt32(content));
-        chunkOffsets = new long[entryCount];
-        for (int i = 0; i < entryCount; i++) {
-            chunkOffsets[i] = IsoTypeReader.readUInt32(content);
-        }
-
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, chunkOffsets.length);
-        for (long chunkOffset : chunkOffsets) {
-            IsoTypeWriter.writeUInt32(byteBuffer, chunkOffset);
-        }
-    }
-
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/SyncSampleBox.java b/android/src/main/java/com/coremedia/iso/boxes/SyncSampleBox.java
deleted file mode 100755
index 5fc758b..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/SyncSampleBox.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * This box provides a compact marking of the random access points withinthe stream. The table is arranged in
- * strictly decreasinf order of sample number. Defined in ISO/IEC 14496-12.
- */
-public class SyncSampleBox extends AbstractFullBox {
-    public static final String TYPE = "stss";
-
-    private long[] sampleNumber;
-
-    public SyncSampleBox() {
-        super(TYPE);
-    }
-
-    /**
-     * Gives the numbers of the samples that are random access points in the stream.
-     *
-     * @return random access sample numbers.
-     */
-    public long[] getSampleNumber() {
-        return sampleNumber;
-    }
-
-    protected long getContentSize() {
-        return sampleNumber.length * 4 + 8;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        int entryCount = l2i(IsoTypeReader.readUInt32(content));
-
-        sampleNumber = new long[entryCount];
-        for (int i = 0; i < entryCount; i++) {
-            sampleNumber[i] = IsoTypeReader.readUInt32(content);
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-
-        IsoTypeWriter.writeUInt32(byteBuffer, sampleNumber.length);
-
-        for (long aSampleNumber : sampleNumber) {
-            IsoTypeWriter.writeUInt32(byteBuffer, aSampleNumber);
-        }
-
-    }
-
-    public String toString() {
-        return "SyncSampleBox[entryCount=" + sampleNumber.length + "]";
-    }
-
-    public void setSampleNumber(long[] sampleNumber) {
-        this.sampleNumber = sampleNumber;
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/TimeToSampleBox.java b/android/src/main/java/com/coremedia/iso/boxes/TimeToSampleBox.java
deleted file mode 100755
index 8f4f97e..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/TimeToSampleBox.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * This box contains a compact version of a table that allows indexing from decoding time to sample number.
- * Other tables give sample sizes and pointers, from the sample number. Each entry in the table gives the
- * number of consecutive samples with the same time delta, and the delta of those samples. By adding the
- * deltas a complete time-to-sample map may be built.<br>
- * The Decoding Time to Sample Box contains decode time delta's: <code>DT(n+1) = DT(n) + STTS(n)</code> where STTS(n)
- * is the (uncompressed) table entry for sample n.<br>
- * The sample entries are ordered by decoding time stamps; therefore the deltas are all non-negative. <br>
- * The DT axis has a zero origin; <code>DT(i) = SUM(for j=0 to i-1 of delta(j))</code>, and the sum of all
- * deltas gives the length of the media in the track (not mapped to the overall timescale, and not considering
- * any edit list).    <br>
- * The Edit List Box provides the initial CT value if it is non-empty (non-zero).
- */
-public class TimeToSampleBox extends AbstractFullBox {
-    public static final String TYPE = "stts";
-
-    List<Entry> entries = Collections.emptyList();
-
-
-    public TimeToSampleBox() {
-        super(TYPE);
-    }
-
-    protected long getContentSize() {
-        return 8 + entries.size() * 8;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        int entryCount = l2i(IsoTypeReader.readUInt32(content));
-        entries = new ArrayList<Entry>(entryCount);
-
-        for (int i = 0; i < entryCount; i++) {
-            entries.add(new Entry(IsoTypeReader.readUInt32(content), IsoTypeReader.readUInt32(content)));
-        }
-
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt32(byteBuffer, entries.size());
-        for (Entry entry : entries) {
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getCount());
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getDelta());
-        }
-    }
-
-    public List<Entry> getEntries() {
-        return entries;
-    }
-
-    public void setEntries(List<Entry> entries) {
-        this.entries = entries;
-    }
-
-    public String toString() {
-        return "TimeToSampleBox[entryCount=" + entries.size() + "]";
-    }
-
-    public static class Entry {
-        long count;
-        long delta;
-
-        public Entry(long count, long delta) {
-            this.count = count;
-            this.delta = delta;
-        }
-
-        public long getCount() {
-            return count;
-        }
-
-        public long getDelta() {
-            return delta;
-        }
-
-        public void setCount(long count) {
-            this.count = count;
-        }
-
-        public void setDelta(long delta) {
-            this.delta = delta;
-        }
-
-        @Override
-        public String toString() {
-            return "Entry{" +
-                    "count=" + count +
-                    ", delta=" + delta +
-                    '}';
-        }
-    }
-
-    /**
-     * Decompresses the list of entries and returns the list of decoding times.
-     *
-     * @return decoding time per sample
-     */
-    public static long[] blowupTimeToSamples(List<TimeToSampleBox.Entry> entries) {
-        long numOfSamples = 0;
-        for (TimeToSampleBox.Entry entry : entries) {
-            numOfSamples += entry.getCount();
-        }
-        assert numOfSamples <= Integer.MAX_VALUE;
-        long[] decodingTime = new long[(int) numOfSamples];
-
-        int current = 0;
-
-
-        for (TimeToSampleBox.Entry entry : entries) {
-            for (int i = 0; i < entry.getCount(); i++) {
-                decodingTime[current++] = entry.getDelta();
-            }
-        }
-
-        return decodingTime;
-    }
-
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/TrackBox.java b/android/src/main/java/com/coremedia/iso/boxes/TrackBox.java
deleted file mode 100755
index c2806b5..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/TrackBox.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractContainerBox;
-
-/**
- * Tracks are used for two purposes: (a) to contain media data (media tracks) and (b) to contain packetization
- * information for streaming protocols (hint tracks).  <br>
- * There shall be at least one media track within an ISO file, and all the media tracks that contributed to the hint
- * tracks shall remain in the file, even if the media data within them is not referenced by the hint tracks; after
- * deleting all hint tracks, the entire un-hinted presentation shall remain.
- */
-public class TrackBox extends AbstractContainerBox {
-    public static final String TYPE = "trak";
-
-    public TrackBox() {
-        super(TYPE);
-    }
-
-    public TrackHeaderBox getTrackHeaderBox() {
-        for (Box box : boxes) {
-            if (box instanceof TrackHeaderBox) {
-                return (TrackHeaderBox) box;
-            }
-        }
-        return null;
-    }
-
-    /**
-     * Gets the SampleTableBox at mdia/minf/stbl if existing.
-     *
-     * @return the SampleTableBox or <code>null</code>
-     */
-    public SampleTableBox getSampleTableBox() {
-        MediaBox mdia = getMediaBox();
-        if (mdia != null) {
-            MediaInformationBox minf = mdia.getMediaInformationBox();
-            if (minf != null) {
-                return minf.getSampleTableBox();
-            }
-        }
-        return null;
-
-    }
-
-
-    public MediaBox getMediaBox() {
-        for (Box box : boxes) {
-            if (box instanceof MediaBox) {
-                return (MediaBox) box;
-            }
-        }
-        return null;
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/TrackHeaderBox.java b/android/src/main/java/com/coremedia/iso/boxes/TrackHeaderBox.java
deleted file mode 100755
index f46154b..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/TrackHeaderBox.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-import com.googlecode.mp4parser.authoring.DateHelper;
-import com.googlecode.mp4parser.util.Matrix;
-
-import java.nio.ByteBuffer;
-import java.util.Date;
-
-/**
- * This box specifies the characteristics of a single track. Exactly one Track Header Box is contained in a track.<br>
- * In the absence of an edit list, the presentation of a track starts at the beginning of the overall presentation. An
- * empty edit is used to offset the start time of a track. <br>
- * The default value of the track header flags for media tracks is 7 (track_enabled, track_in_movie,
- * track_in_preview). If in a presentation all tracks have neither track_in_movie nor track_in_preview set, then all
- * tracks shall be treated as if both flags were set on all tracks. Hint tracks should have the track header flags set
- * to 0, so that they are ignored for local playback and preview.
- */
-public class TrackHeaderBox extends AbstractFullBox {
-
-    public static final String TYPE = "tkhd";
-
-    private Date creationTime;
-    private Date modificationTime;
-    private long trackId;
-    private long duration;
-    private int layer;
-    private int alternateGroup;
-    private float volume;
-    private Matrix matrix = Matrix.ROTATE_0;
-    private double width;
-    private double height;
-
-
-    public TrackHeaderBox() {
-        super(TYPE);
-
-    }
-
-    public Date getCreationTime() {
-        return creationTime;
-    }
-
-    public Date getModificationTime() {
-        return modificationTime;
-    }
-
-    public long getTrackId() {
-        return trackId;
-    }
-
-    public long getDuration() {
-        return duration;
-    }
-
-    public int getLayer() {
-        return layer;
-    }
-
-    public int getAlternateGroup() {
-        return alternateGroup;
-    }
-
-    public float getVolume() {
-        return volume;
-    }
-
-    public Matrix getMatrix() {
-        return matrix;
-    }
-
-    public double getWidth() {
-        return width;
-    }
-
-    public double getHeight() {
-        return height;
-    }
-
-    protected long getContentSize() {
-        long contentSize = 4;
-        if (getVersion() == 1) {
-            contentSize += 32;
-        } else {
-            contentSize += 20;
-        }
-        contentSize += 60;
-        return contentSize;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        if (getVersion() == 1) {
-            creationTime = DateHelper.convert(IsoTypeReader.readUInt64(content));
-            modificationTime = DateHelper.convert(IsoTypeReader.readUInt64(content));
-            trackId = IsoTypeReader.readUInt32(content);
-            IsoTypeReader.readUInt32(content);
-            duration = IsoTypeReader.readUInt64(content);
-        } else {
-            creationTime = DateHelper.convert(IsoTypeReader.readUInt32(content));
-            modificationTime = DateHelper.convert(IsoTypeReader.readUInt32(content));
-            trackId = IsoTypeReader.readUInt32(content);
-            IsoTypeReader.readUInt32(content);
-            duration = IsoTypeReader.readUInt32(content);
-        } // 196
-        IsoTypeReader.readUInt32(content);
-        IsoTypeReader.readUInt32(content);
-        layer = IsoTypeReader.readUInt16(content);    // 204
-        alternateGroup = IsoTypeReader.readUInt16(content);
-        volume = IsoTypeReader.readFixedPoint88(content);
-        IsoTypeReader.readUInt16(content);     // 212
-        matrix = Matrix.fromByteBuffer(content);
-        width = IsoTypeReader.readFixedPoint1616(content);    // 248
-        height = IsoTypeReader.readFixedPoint1616(content);
-    }
-
-    public void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        if (getVersion() == 1) {
-            IsoTypeWriter.writeUInt64(byteBuffer, DateHelper.convert(creationTime));
-            IsoTypeWriter.writeUInt64(byteBuffer, DateHelper.convert(modificationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, trackId);
-            IsoTypeWriter.writeUInt32(byteBuffer, 0);
-            IsoTypeWriter.writeUInt64(byteBuffer, duration);
-        } else {
-            IsoTypeWriter.writeUInt32(byteBuffer, DateHelper.convert(creationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, DateHelper.convert(modificationTime));
-            IsoTypeWriter.writeUInt32(byteBuffer, trackId);
-            IsoTypeWriter.writeUInt32(byteBuffer, 0);
-            IsoTypeWriter.writeUInt32(byteBuffer, duration);
-        } // 196
-        IsoTypeWriter.writeUInt32(byteBuffer, 0);
-        IsoTypeWriter.writeUInt32(byteBuffer, 0);
-        IsoTypeWriter.writeUInt16(byteBuffer, layer);
-        IsoTypeWriter.writeUInt16(byteBuffer, alternateGroup);
-        IsoTypeWriter.writeFixedPont88(byteBuffer, volume);
-        IsoTypeWriter.writeUInt16(byteBuffer, 0);
-        matrix.getContent(byteBuffer);
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, width);
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, height);
-    }
-
-    public String toString() {
-        StringBuilder result = new StringBuilder();
-        result.append("TrackHeaderBox[");
-        result.append("creationTime=").append(getCreationTime());
-        result.append(";");
-        result.append("modificationTime=").append(getModificationTime());
-        result.append(";");
-        result.append("trackId=").append(getTrackId());
-        result.append(";");
-        result.append("duration=").append(getDuration());
-        result.append(";");
-        result.append("layer=").append(getLayer());
-        result.append(";");
-        result.append("alternateGroup=").append(getAlternateGroup());
-        result.append(";");
-        result.append("volume=").append(getVolume());
-        result.append(";");
-        result.append("matrix=").append(matrix);
-        result.append(";");
-        result.append("width=").append(getWidth());
-        result.append(";");
-        result.append("height=").append(getHeight());
-        result.append("]");
-        return result.toString();
-    }
-
-    public void setCreationTime(Date creationTime) {
-        this.creationTime = creationTime;
-    }
-
-    public void setModificationTime(Date modificationTime) {
-        this.modificationTime = modificationTime;
-    }
-
-    public void setTrackId(long trackId) {
-        this.trackId = trackId;
-    }
-
-    public void setDuration(long duration) {
-        this.duration = duration;
-    }
-
-    public void setLayer(int layer) {
-        this.layer = layer;
-    }
-
-    public void setAlternateGroup(int alternateGroup) {
-        this.alternateGroup = alternateGroup;
-    }
-
-    public void setVolume(float volume) {
-        this.volume = volume;
-    }
-
-    public void setMatrix(Matrix matrix) {
-        this.matrix = matrix;
-    }
-
-    public void setWidth(double width) {
-        this.width = width;
-    }
-
-    public void setHeight(double height) {
-        this.height = height;
-    }
-
-
-    public boolean isEnabled() {
-        return (getFlags() & 1) > 0;
-    }
-
-    public boolean isInMovie() {
-        return (getFlags() & 2) > 0;
-    }
-
-    public boolean isInPreview() {
-        return (getFlags() & 4) > 0;
-    }
-
-    public boolean isInPoster() {
-        return (getFlags() & 8) > 0;
-    }
-
-    public void setEnabled(boolean enabled) {
-        if (enabled) {
-            setFlags(getFlags() | 1);
-        } else {
-            setFlags(getFlags() & ~1);
-        }
-    }
-
-    public void setInMovie(boolean inMovie) {
-        if (inMovie) {
-            setFlags(getFlags() | 2);
-        } else {
-            setFlags(getFlags() & ~2);
-        }
-    }
-
-    public void setInPreview(boolean inPreview) {
-        if (inPreview) {
-            setFlags(getFlags() | 4);
-        } else {
-            setFlags(getFlags() & ~4);
-        }
-    }
-
-    public void setInPoster(boolean inPoster) {
-        if (inPoster) {
-            setFlags(getFlags() | 8);
-        } else {
-            setFlags(getFlags() & ~8);
-        }
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/UserBox.java b/android/src/main/java/com/coremedia/iso/boxes/UserBox.java
deleted file mode 100755
index db0e741..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/UserBox.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.googlecode.mp4parser.AbstractBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * A user specifc box. See ISO/IEC 14496-12 for details.
- */
-public class UserBox extends AbstractBox {
-    byte[] data;
-    public static final String TYPE = "uuid";
-
-    public UserBox(byte[] userType) {
-        super(TYPE, userType);
-    }
-
-
-    protected long getContentSize() {
-        return data.length;
-    }
-
-    public String toString() {
-        return "UserBox[type=" + (getType()) +
-                ";userType=" + new String(getUserType()) +
-                ";contentLength=" + data.length + "]";
-    }
-
-
-    public byte[] getData() {
-        return data;
-    }
-
-    public void setData(byte[] data) {
-        this.data = data;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        data = new byte[content.remaining()];
-        content.get(data);
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        byteBuffer.put(data);
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/VideoMediaHeaderBox.java b/android/src/main/java/com/coremedia/iso/boxes/VideoMediaHeaderBox.java
deleted file mode 100755
index 421a67d..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/VideoMediaHeaderBox.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.nio.ByteBuffer;
-
-/**
- * The video media header contains general presentation information, independent of the coding, for video
- * media. Note that the flags field has the value 1.
- */
-public class VideoMediaHeaderBox extends AbstractMediaHeaderBox {
-    private int graphicsmode = 0;
-    private int[] opcolor = new int[]{0, 0, 0};
-    public static final String TYPE = "vmhd";
-
-    public VideoMediaHeaderBox() {
-        super(TYPE);
-        setFlags(1); // 1 is default.
-    }
-
-    public int getGraphicsmode() {
-        return graphicsmode;
-    }
-
-    public int[] getOpcolor() {
-        return opcolor;
-    }
-
-    protected long getContentSize() {
-        return 12;
-    }
-
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        graphicsmode = IsoTypeReader.readUInt16(content);
-        opcolor = new int[3];
-        for (int i = 0; i < 3; i++) {
-            opcolor[i] = IsoTypeReader.readUInt16(content);
-        }
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        IsoTypeWriter.writeUInt16(byteBuffer, graphicsmode);
-        for (int anOpcolor : opcolor) {
-            IsoTypeWriter.writeUInt16(byteBuffer, anOpcolor);
-        }
-    }
-
-    public String toString() {
-        return "VideoMediaHeaderBox[graphicsmode=" + getGraphicsmode() + ";opcolor0=" + getOpcolor()[0] + ";opcolor1=" + getOpcolor()[1] + ";opcolor2=" + getOpcolor()[2] + "]";
-    }
-
-    public void setOpcolor(int[] opcolor) {
-        this.opcolor = opcolor;
-    }
-
-    public void setGraphicsmode(int graphicsmode) {
-        this.graphicsmode = graphicsmode;
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/h264/AvcConfigurationBox.java b/android/src/main/java/com/coremedia/iso/boxes/h264/AvcConfigurationBox.java
deleted file mode 100755
index 52f3695..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/h264/AvcConfigurationBox.java
+++ /dev/null
@@ -1,378 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes.h264;
-
-import com.coremedia.iso.Hex;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractBox;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.BitReaderBuffer;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.BitWriterBuffer;
-import com.googlecode.mp4parser.h264.model.PictureParameterSet;
-import com.googlecode.mp4parser.h264.model.SeqParameterSet;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Defined in ISO/IEC 14496-15:2004.
- */
-public final class AvcConfigurationBox extends AbstractBox {
-    public static final String TYPE = "avcC";
-
-    public AVCDecoderConfigurationRecord avcDecoderConfigurationRecord = new AVCDecoderConfigurationRecord();
-
-
-    public AvcConfigurationBox() {
-        super(TYPE);
-    }
-
-    public int getConfigurationVersion() {
-        return avcDecoderConfigurationRecord.configurationVersion;
-    }
-
-    public int getAvcProfileIndication() {
-        return avcDecoderConfigurationRecord.avcProfileIndication;
-    }
-
-    public int getProfileCompatibility() {
-        return avcDecoderConfigurationRecord.profileCompatibility;
-    }
-
-    public int getAvcLevelIndication() {
-        return avcDecoderConfigurationRecord.avcLevelIndication;
-    }
-
-    public int getLengthSizeMinusOne() {
-        return avcDecoderConfigurationRecord.lengthSizeMinusOne;
-    }
-
-    public List<byte[]> getSequenceParameterSets() {
-        return Collections.unmodifiableList(avcDecoderConfigurationRecord.sequenceParameterSets);
-    }
-
-    public List<byte[]> getPictureParameterSets() {
-        return Collections.unmodifiableList(avcDecoderConfigurationRecord.pictureParameterSets);
-    }
-
-    public void setConfigurationVersion(int configurationVersion) {
-        this.avcDecoderConfigurationRecord.configurationVersion = configurationVersion;
-    }
-
-    public void setAvcProfileIndication(int avcProfileIndication) {
-        this.avcDecoderConfigurationRecord.avcProfileIndication = avcProfileIndication;
-    }
-
-    public void setProfileCompatibility(int profileCompatibility) {
-        this.avcDecoderConfigurationRecord.profileCompatibility = profileCompatibility;
-    }
-
-    public void setAvcLevelIndication(int avcLevelIndication) {
-        this.avcDecoderConfigurationRecord.avcLevelIndication = avcLevelIndication;
-    }
-
-    public void setLengthSizeMinusOne(int lengthSizeMinusOne) {
-        this.avcDecoderConfigurationRecord.lengthSizeMinusOne = lengthSizeMinusOne;
-    }
-
-    public void setSequenceParameterSets(List<byte[]> sequenceParameterSets) {
-        this.avcDecoderConfigurationRecord.sequenceParameterSets = sequenceParameterSets;
-    }
-
-    public void setPictureParameterSets(List<byte[]> pictureParameterSets) {
-        this.avcDecoderConfigurationRecord.pictureParameterSets = pictureParameterSets;
-    }
-
-    public int getChromaFormat() {
-        return avcDecoderConfigurationRecord.chromaFormat;
-    }
-
-    public void setChromaFormat(int chromaFormat) {
-        this.avcDecoderConfigurationRecord.chromaFormat = chromaFormat;
-    }
-
-    public int getBitDepthLumaMinus8() {
-        return avcDecoderConfigurationRecord.bitDepthLumaMinus8;
-    }
-
-    public void setBitDepthLumaMinus8(int bitDepthLumaMinus8) {
-        this.avcDecoderConfigurationRecord.bitDepthLumaMinus8 = bitDepthLumaMinus8;
-    }
-
-    public int getBitDepthChromaMinus8() {
-        return avcDecoderConfigurationRecord.bitDepthChromaMinus8;
-    }
-
-    public void setBitDepthChromaMinus8(int bitDepthChromaMinus8) {
-        this.avcDecoderConfigurationRecord.bitDepthChromaMinus8 = bitDepthChromaMinus8;
-    }
-
-    public List<byte[]> getSequenceParameterSetExts() {
-        return avcDecoderConfigurationRecord.sequenceParameterSetExts;
-    }
-
-    public void setSequenceParameterSetExts(List<byte[]> sequenceParameterSetExts) {
-        this.avcDecoderConfigurationRecord.sequenceParameterSetExts = sequenceParameterSetExts;
-    }
-
-    public boolean hasExts() {
-        return avcDecoderConfigurationRecord.hasExts;
-    }
-
-    public void setHasExts(boolean hasExts) {
-        this.avcDecoderConfigurationRecord.hasExts = hasExts;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        avcDecoderConfigurationRecord = new AVCDecoderConfigurationRecord(content);
-    }
-
-
-    @Override
-    public long getContentSize() {
-        return avcDecoderConfigurationRecord.getContentSize();
-    }
-
-
-    @Override
-    public void getContent(ByteBuffer byteBuffer) {
-        avcDecoderConfigurationRecord.getContent(byteBuffer);
-    }
-
-    // just to display sps in isoviewer no practical use
-    public String[] getSPS() {
-        return avcDecoderConfigurationRecord.getSPS();
-    }
-
-    public String[] getPPS() {
-        return avcDecoderConfigurationRecord.getPPS();
-    }
-
-    public List<String> getSequenceParameterSetsAsStrings() {
-        return avcDecoderConfigurationRecord.getSequenceParameterSetsAsStrings();
-    }
-
-    public List<String> getSequenceParameterSetExtsAsStrings() {
-        return avcDecoderConfigurationRecord.getSequenceParameterSetExtsAsStrings();
-    }
-
-    public List<String> getPictureParameterSetsAsStrings() {
-        return avcDecoderConfigurationRecord.getPictureParameterSetsAsStrings();
-    }
-
-    public AVCDecoderConfigurationRecord getavcDecoderConfigurationRecord() {
-        return avcDecoderConfigurationRecord;
-    }
-
-
-    public static class AVCDecoderConfigurationRecord {
-        public int configurationVersion;
-        public int avcProfileIndication;
-        public int profileCompatibility;
-        public int avcLevelIndication;
-        public int lengthSizeMinusOne;
-        public List<byte[]> sequenceParameterSets = new ArrayList<byte[]>();
-        public List<byte[]> pictureParameterSets = new ArrayList<byte[]>();
-
-        public boolean hasExts = true;
-        public int chromaFormat = 1;
-        public int bitDepthLumaMinus8 = 0;
-        public int bitDepthChromaMinus8 = 0;
-        public List<byte[]> sequenceParameterSetExts = new ArrayList<byte[]>();
-
-        /**
-         * Just for non-spec-conform encoders
-         */
-        public int lengthSizeMinusOnePaddingBits = 60;
-        public int numberOfSequenceParameterSetsPaddingBits = 7;
-        public int chromaFormatPaddingBits = 31;
-        public int bitDepthLumaMinus8PaddingBits = 31;
-        public int bitDepthChromaMinus8PaddingBits = 31;
-
-        public AVCDecoderConfigurationRecord() {
-        }
-
-        public AVCDecoderConfigurationRecord(ByteBuffer content) {
-            configurationVersion = IsoTypeReader.readUInt8(content);
-            avcProfileIndication = IsoTypeReader.readUInt8(content);
-            profileCompatibility = IsoTypeReader.readUInt8(content);
-            avcLevelIndication = IsoTypeReader.readUInt8(content);
-            BitReaderBuffer brb = new BitReaderBuffer(content);
-            lengthSizeMinusOnePaddingBits = brb.readBits(6);
-            lengthSizeMinusOne = brb.readBits(2);
-            numberOfSequenceParameterSetsPaddingBits = brb.readBits(3);
-            int numberOfSeuqenceParameterSets = brb.readBits(5);
-            for (int i = 0; i < numberOfSeuqenceParameterSets; i++) {
-                int sequenceParameterSetLength = IsoTypeReader.readUInt16(content);
-
-                byte[] sequenceParameterSetNALUnit = new byte[sequenceParameterSetLength];
-                content.get(sequenceParameterSetNALUnit);
-                sequenceParameterSets.add(sequenceParameterSetNALUnit);
-            }
-            long numberOfPictureParameterSets = IsoTypeReader.readUInt8(content);
-            for (int i = 0; i < numberOfPictureParameterSets; i++) {
-                int pictureParameterSetLength = IsoTypeReader.readUInt16(content);
-                byte[] pictureParameterSetNALUnit = new byte[pictureParameterSetLength];
-                content.get(pictureParameterSetNALUnit);
-                pictureParameterSets.add(pictureParameterSetNALUnit);
-            }
-            if (content.remaining() < 4) {
-                hasExts = false;
-            }
-            if (hasExts && (avcProfileIndication == 100 || avcProfileIndication == 110 || avcProfileIndication == 122 || avcProfileIndication == 144)) {
-                // actually only some bits are interesting so masking with & x would be good but not all Mp4 creating tools set the reserved bits to 1.
-                // So we need to store all bits
-                brb = new BitReaderBuffer(content);
-                chromaFormatPaddingBits = brb.readBits(6);
-                chromaFormat = brb.readBits(2);
-                bitDepthLumaMinus8PaddingBits = brb.readBits(5);
-                bitDepthLumaMinus8 = brb.readBits(3);
-                bitDepthChromaMinus8PaddingBits = brb.readBits(5);
-                bitDepthChromaMinus8 = brb.readBits(3);
-                long numOfSequenceParameterSetExt = IsoTypeReader.readUInt8(content);
-                for (int i = 0; i < numOfSequenceParameterSetExt; i++) {
-                    int sequenceParameterSetExtLength = IsoTypeReader.readUInt16(content);
-                    byte[] sequenceParameterSetExtNALUnit = new byte[sequenceParameterSetExtLength];
-                    content.get(sequenceParameterSetExtNALUnit);
-                    sequenceParameterSetExts.add(sequenceParameterSetExtNALUnit);
-                }
-            } else {
-                chromaFormat = -1;
-                bitDepthLumaMinus8 = -1;
-                bitDepthChromaMinus8 = -1;
-            }
-        }
-
-        public void getContent(ByteBuffer byteBuffer) {
-            IsoTypeWriter.writeUInt8(byteBuffer, configurationVersion);
-            IsoTypeWriter.writeUInt8(byteBuffer, avcProfileIndication);
-            IsoTypeWriter.writeUInt8(byteBuffer, profileCompatibility);
-            IsoTypeWriter.writeUInt8(byteBuffer, avcLevelIndication);
-            BitWriterBuffer bwb = new BitWriterBuffer(byteBuffer);
-            bwb.writeBits(lengthSizeMinusOnePaddingBits, 6);
-            bwb.writeBits(lengthSizeMinusOne, 2);
-            bwb.writeBits(numberOfSequenceParameterSetsPaddingBits, 3);
-            bwb.writeBits(pictureParameterSets.size(), 5);
-            for (byte[] sequenceParameterSetNALUnit : sequenceParameterSets) {
-                IsoTypeWriter.writeUInt16(byteBuffer, sequenceParameterSetNALUnit.length);
-                byteBuffer.put(sequenceParameterSetNALUnit);
-            }
-            IsoTypeWriter.writeUInt8(byteBuffer, pictureParameterSets.size());
-            for (byte[] pictureParameterSetNALUnit : pictureParameterSets) {
-                IsoTypeWriter.writeUInt16(byteBuffer, pictureParameterSetNALUnit.length);
-                byteBuffer.put(pictureParameterSetNALUnit);
-            }
-            if (hasExts && (avcProfileIndication == 100 || avcProfileIndication == 110 || avcProfileIndication == 122 || avcProfileIndication == 144)) {
-
-                bwb = new BitWriterBuffer(byteBuffer);
-                bwb.writeBits(chromaFormatPaddingBits, 6);
-                bwb.writeBits(chromaFormat, 2);
-                bwb.writeBits(bitDepthLumaMinus8PaddingBits, 5);
-                bwb.writeBits(bitDepthLumaMinus8, 3);
-                bwb.writeBits(bitDepthChromaMinus8PaddingBits, 5);
-                bwb.writeBits(bitDepthChromaMinus8, 3);
-                for (byte[] sequenceParameterSetExtNALUnit : sequenceParameterSetExts) {
-                    IsoTypeWriter.writeUInt16(byteBuffer, sequenceParameterSetExtNALUnit.length);
-                    byteBuffer.put(sequenceParameterSetExtNALUnit);
-                }
-            }
-        }
-
-        public long getContentSize() {
-            long size = 5;
-            size += 1; // sequenceParamsetLength
-            for (byte[] sequenceParameterSetNALUnit : sequenceParameterSets) {
-                size += 2; //lengthSizeMinusOne field
-                size += sequenceParameterSetNALUnit.length;
-            }
-            size += 1; // pictureParamsetLength
-            for (byte[] pictureParameterSetNALUnit : pictureParameterSets) {
-                size += 2; //lengthSizeMinusOne field
-                size += pictureParameterSetNALUnit.length;
-            }
-            if (hasExts && (avcProfileIndication == 100 || avcProfileIndication == 110 || avcProfileIndication == 122 || avcProfileIndication == 144)) {
-                size += 4;
-                for (byte[] sequenceParameterSetExtNALUnit : sequenceParameterSetExts) {
-                    size += 2;
-                    size += sequenceParameterSetExtNALUnit.length;
-                }
-            }
-
-            return size;
-        }
-
-        public String[] getPPS() {
-            ArrayList<String> l = new ArrayList<String>();
-            for (byte[] pictureParameterSet : pictureParameterSets) {
-                String details = "not parsable";
-                try {
-                    details = PictureParameterSet.read(pictureParameterSet).toString();
-                } catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
-
-                l.add(details);
-            }
-            return l.toArray(new String[l.size()]);
-        }
-
-        public String[] getSPS() {
-            ArrayList<String> l = new ArrayList<String>();
-            for (byte[] sequenceParameterSet : sequenceParameterSets) {
-                String detail = "not parsable";
-                try {
-                    detail = SeqParameterSet.read(new ByteArrayInputStream(sequenceParameterSet)).toString();
-                } catch (IOException e) {
-
-                }
-                l.add(detail);
-            }
-            return l.toArray(new String[l.size()]);
-        }
-
-        public List<String> getSequenceParameterSetsAsStrings() {
-            List <String> result = new ArrayList<String>(sequenceParameterSets.size());
-            for (byte[] parameterSet : sequenceParameterSets) {
-                result.add(Hex.encodeHex(parameterSet));
-            }
-            return result;
-        }
-
-        public List<String> getSequenceParameterSetExtsAsStrings() {
-            List <String> result = new ArrayList<String>(sequenceParameterSetExts.size());
-            for (byte[] parameterSet : sequenceParameterSetExts) {
-                result.add(Hex.encodeHex(parameterSet));
-            }
-            return result;
-        }
-
-        public List<String> getPictureParameterSetsAsStrings() {
-            List <String> result = new ArrayList<String>(pictureParameterSets.size());
-            for (byte[] parameterSet : pictureParameterSets) {
-                result.add(Hex.encodeHex(parameterSet));
-            }
-            return result;
-        }
-
-    }
-}
-
diff --git a/android/src/main/java/com/coremedia/iso/boxes/sampleentry/AudioSampleEntry.java b/android/src/main/java/com/coremedia/iso/boxes/sampleentry/AudioSampleEntry.java
deleted file mode 100755
index 69aeb79..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/sampleentry/AudioSampleEntry.java
+++ /dev/null
@@ -1,278 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes.sampleentry;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * Contains basic information about the audio samples in this track. Format-specific information
- * is appened as boxes after the data described in ISO/IEC 14496-12 chapter 8.16.2.
- */
-public class AudioSampleEntry extends SampleEntry implements ContainerBox {
-
-    public static final String TYPE1 = "samr";
-    public static final String TYPE2 = "sawb";
-    public static final String TYPE3 = "mp4a";
-    public static final String TYPE4 = "drms";
-    public static final String TYPE5 = "alac";
-    public static final String TYPE7 = "owma";
-    public static final String TYPE8 = "ac-3"; /* ETSI TS 102 366 1.2.1 Annex F */
-    public static final String TYPE9 = "ec-3"; /* ETSI TS 102 366 1.2.1 Annex F */
-    public static final String TYPE10 = "mlpa";
-    public static final String TYPE11 = "dtsl";
-    public static final String TYPE12 = "dtsh";
-    public static final String TYPE13 = "dtse";
-
-    /**
-     * Identifier for an encrypted audio track.
-     *
-     * @see com.coremedia.iso.boxes.ProtectionSchemeInformationBox
-     */
-    public static final String TYPE_ENCRYPTED = "enca";
-
-    private int channelCount;
-    private int sampleSize;
-    private long sampleRate;
-    private int soundVersion;
-    private int compressionId;
-    private int packetSize;
-    private long samplesPerPacket;
-    private long bytesPerPacket;
-    private long bytesPerFrame;
-    private long bytesPerSample;
-
-    private int reserved1;
-    private long reserved2;
-    private byte[] soundVersion2Data;
-    private BoxParser boxParser;
-
-    public AudioSampleEntry(String type) {
-        super(type);
-    }
-
-    public int getChannelCount() {
-        return channelCount;
-    }
-
-    public int getSampleSize() {
-        return sampleSize;
-    }
-
-    public long getSampleRate() {
-        return sampleRate;
-    }
-
-    public int getSoundVersion() {
-        return soundVersion;
-    }
-
-    public int getCompressionId() {
-        return compressionId;
-    }
-
-    public int getPacketSize() {
-        return packetSize;
-    }
-
-    public long getSamplesPerPacket() {
-        return samplesPerPacket;
-    }
-
-    public long getBytesPerPacket() {
-        return bytesPerPacket;
-    }
-
-    public long getBytesPerFrame() {
-        return bytesPerFrame;
-    }
-
-    public long getBytesPerSample() {
-        return bytesPerSample;
-    }
-
-    public byte[] getSoundVersion2Data() {
-        return soundVersion2Data;
-    }
-
-    public int getReserved1() {
-        return reserved1;
-    }
-
-    public long getReserved2() {
-        return reserved2;
-    }
-
-    public void setChannelCount(int channelCount) {
-        this.channelCount = channelCount;
-    }
-
-    public void setSampleSize(int sampleSize) {
-        this.sampleSize = sampleSize;
-    }
-
-    public void setSampleRate(long sampleRate) {
-        this.sampleRate = sampleRate;
-    }
-
-    public void setSoundVersion(int soundVersion) {
-        this.soundVersion = soundVersion;
-    }
-
-    public void setCompressionId(int compressionId) {
-        this.compressionId = compressionId;
-    }
-
-    public void setPacketSize(int packetSize) {
-        this.packetSize = packetSize;
-    }
-
-    public void setSamplesPerPacket(long samplesPerPacket) {
-        this.samplesPerPacket = samplesPerPacket;
-    }
-
-    public void setBytesPerPacket(long bytesPerPacket) {
-        this.bytesPerPacket = bytesPerPacket;
-    }
-
-    public void setBytesPerFrame(long bytesPerFrame) {
-        this.bytesPerFrame = bytesPerFrame;
-    }
-
-    public void setBytesPerSample(long bytesPerSample) {
-        this.bytesPerSample = bytesPerSample;
-    }
-
-    public void setReserved1(int reserved1) {
-        this.reserved1 = reserved1;
-    }
-
-    public void setReserved2(long reserved2) {
-        this.reserved2 = reserved2;
-    }
-
-    public void setSoundVersion2Data(byte[] soundVersion2Data) {
-        this.soundVersion2Data = soundVersion2Data;
-    }
-
-    public void setBoxParser(BoxParser boxParser) {
-        this.boxParser = boxParser;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        _parseReservedAndDataReferenceIndex(content);    //parses the six reserved bytes and dataReferenceIndex
-        // 8 bytes already parsed
-        //reserved bits - used by qt
-        soundVersion = IsoTypeReader.readUInt16(content);
-
-        //reserved
-        reserved1 = IsoTypeReader.readUInt16(content);
-        reserved2 = IsoTypeReader.readUInt32(content);
-
-        channelCount = IsoTypeReader.readUInt16(content);
-        sampleSize = IsoTypeReader.readUInt16(content);
-        //reserved bits - used by qt
-        compressionId = IsoTypeReader.readUInt16(content);
-        //reserved bits - used by qt
-        packetSize = IsoTypeReader.readUInt16(content);
-        //sampleRate = in.readFixedPoint1616();
-        sampleRate = IsoTypeReader.readUInt32(content);
-        if (!type.equals("mlpa")) {
-            sampleRate = sampleRate >>> 16;
-        }
-
-        //more qt stuff - see http://mp4v2.googlecode.com/svn-history/r388/trunk/src/atom_sound.cpp
-        if (soundVersion > 0) {
-            samplesPerPacket = IsoTypeReader.readUInt32(content);
-            bytesPerPacket = IsoTypeReader.readUInt32(content);
-            bytesPerFrame = IsoTypeReader.readUInt32(content);
-            bytesPerSample = IsoTypeReader.readUInt32(content);
-        }
-        if (soundVersion == 2) {
-
-            soundVersion2Data = new byte[20];
-            content.get(20);
-        }
-        _parseChildBoxes(content);
-
-    }
-
-
-    @Override
-    protected long getContentSize() {
-        long contentSize = 28;
-        contentSize += soundVersion > 0 ? 16 : 0;
-        contentSize += soundVersion == 2 ? 20 : 0;
-        for (Box boxe : boxes) {
-            contentSize += boxe.getSize();
-        }
-        return contentSize;
-    }
-
-    @Override
-    public String toString() {
-        return "AudioSampleEntry{" +
-                "bytesPerSample=" + bytesPerSample +
-                ", bytesPerFrame=" + bytesPerFrame +
-                ", bytesPerPacket=" + bytesPerPacket +
-                ", samplesPerPacket=" + samplesPerPacket +
-                ", packetSize=" + packetSize +
-                ", compressionId=" + compressionId +
-                ", soundVersion=" + soundVersion +
-                ", sampleRate=" + sampleRate +
-                ", sampleSize=" + sampleSize +
-                ", channelCount=" + channelCount +
-                ", boxes=" + getBoxes() +
-                '}';
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        _writeReservedAndDataReferenceIndex(byteBuffer);
-        IsoTypeWriter.writeUInt16(byteBuffer, soundVersion);
-        IsoTypeWriter.writeUInt16(byteBuffer, reserved1);
-        IsoTypeWriter.writeUInt32(byteBuffer, reserved2);
-        IsoTypeWriter.writeUInt16(byteBuffer, channelCount);
-        IsoTypeWriter.writeUInt16(byteBuffer, sampleSize);
-        IsoTypeWriter.writeUInt16(byteBuffer, compressionId);
-        IsoTypeWriter.writeUInt16(byteBuffer, packetSize);
-        //isos.writeFixedPont1616(getSampleRate());
-        if (type.equals("mlpa")) {
-            IsoTypeWriter.writeUInt32(byteBuffer, getSampleRate());
-        } else {
-            IsoTypeWriter.writeUInt32(byteBuffer, getSampleRate() << 16);
-        }
-
-        if (soundVersion > 0) {
-            IsoTypeWriter.writeUInt32(byteBuffer, samplesPerPacket);
-            IsoTypeWriter.writeUInt32(byteBuffer, bytesPerPacket);
-            IsoTypeWriter.writeUInt32(byteBuffer, bytesPerFrame);
-            IsoTypeWriter.writeUInt32(byteBuffer, bytesPerSample);
-        }
-
-        if (soundVersion == 2) {
-            byteBuffer.put(soundVersion2Data);
-        }
-        _writeChildBoxes(byteBuffer);
-    }
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/sampleentry/SampleEntry.java b/android/src/main/java/com/coremedia/iso/boxes/sampleentry/SampleEntry.java
deleted file mode 100755
index f9eb071..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/sampleentry/SampleEntry.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes.sampleentry;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractBox;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.googlecode.mp4parser.util.ByteBufferByteChannel;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.Channels;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * Abstract base class for all sample entries.
- *
- * @see com.coremedia.iso.boxes.sampleentry.AudioSampleEntry
- * @see com.coremedia.iso.boxes.sampleentry.VisualSampleEntry
- * @see com.coremedia.iso.boxes.sampleentry.TextSampleEntry
- */
-public abstract class SampleEntry extends AbstractBox implements ContainerBox {
-
-
-    private int dataReferenceIndex = 1;
-    protected List<Box> boxes = new LinkedList<Box>();
-    private BoxParser boxParser;
-
-
-    protected SampleEntry(String type) {
-        super(type);
-    }
-
-    public void setType(String type) {
-        this.type = type;
-    }
-
-    public int getDataReferenceIndex() {
-        return dataReferenceIndex;
-    }
-
-    public void setDataReferenceIndex(int dataReferenceIndex) {
-        this.dataReferenceIndex = dataReferenceIndex;
-    }
-
-    public void setBoxes(List<Box> boxes) {
-        this.boxes = new LinkedList<Box>(boxes);
-    }
-
-    public void addBox(Box b) {
-        b.setParent(this);
-        boxes.add(b);
-    }
-
-    public boolean removeBox(Box b) {
-        b.setParent(this);
-        return boxes.remove(b);
-    }
-
-    public List<Box> getBoxes() {
-        return boxes;
-    }
-
-    @SuppressWarnings("unchecked")
-    public <T extends Box> List<T> getBoxes(Class<T> clazz, boolean recursive) {
-        List<T> boxesToBeReturned = new ArrayList<T>(2);
-        for (Box boxe : boxes) { //clazz.isInstance(boxe) / clazz == boxe.getClass()?
-            if (clazz == boxe.getClass()) {
-                boxesToBeReturned.add((T) boxe);
-            }
-
-            if (recursive && boxe instanceof ContainerBox) {
-                boxesToBeReturned.addAll(((ContainerBox) boxe).getBoxes(clazz, recursive));
-            }
-        }
-        // Optimize here! Spare object creation work on arrays directly! System.arrayCopy
-        return boxesToBeReturned;
-        //return (T[]) boxesToBeReturned.toArray();
-    }
-
-    @SuppressWarnings("unchecked")
-    public <T extends Box> List<T> getBoxes(Class<T> clazz) {
-        return getBoxes(clazz, false);
-    }
-
-    @Override
-    public void parse(ReadableByteChannel readableByteChannel, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException {
-        this.boxParser = boxParser;
-        super.parse(readableByteChannel, header, contentSize, boxParser);
-
-    }
-
-
-    public void _parseReservedAndDataReferenceIndex(ByteBuffer content) {
-        content.get(new byte[6]); // ignore 6 reserved bytes;
-        dataReferenceIndex = IsoTypeReader.readUInt16(content);
-    }
-
-    public void _parseChildBoxes(ByteBuffer content) {
-        while (content.remaining() > 8) {
-            try {
-                boxes.add(boxParser.parseBox(new ByteBufferByteChannel(content), this));
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-
-        }
-        setDeadBytes(content.slice());
-    }
-
-    public void _writeReservedAndDataReferenceIndex(ByteBuffer bb) {
-        bb.put(new byte[6]);
-        IsoTypeWriter.writeUInt16(bb, dataReferenceIndex);
-    }
-
-    public void _writeChildBoxes(ByteBuffer bb) {
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        WritableByteChannel wbc = Channels.newChannel(baos);
-        try {
-            for (Box box : boxes) {
-                box.getBox(wbc);
-            }
-            wbc.close();
-        } catch (IOException e) {
-            throw new RuntimeException("Cannot happen. Everything should be in memory and therefore no exceptions.");
-        }
-        bb.put(baos.toByteArray());
-    }
-
-    public long getNumOfBytesToFirstChild() {
-        long sizeOfChildren = 0;
-        for (Box box : boxes) {
-            sizeOfChildren += box.getSize();
-        }
-        return getSize() - sizeOfChildren;
-    }
-
-}
diff --git a/android/src/main/java/com/coremedia/iso/boxes/sampleentry/VisualSampleEntry.java b/android/src/main/java/com/coremedia/iso/boxes/sampleentry/VisualSampleEntry.java
deleted file mode 100755
index a732019..0000000
--- a/android/src/main/java/com/coremedia/iso/boxes/sampleentry/VisualSampleEntry.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.coremedia.iso.boxes.sampleentry;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.Utf8;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * Contains information common to all visual tracks.
- * <code>
- * <pre>
- * class VisualSampleEntry(codingname) extends SampleEntry (codingname){
- * unsigned int(16) pre_defined = 0;
- * const unsigned int(16) reserved = 0;
- * unsigned int(32)[3] pre_defined = 0;
- * unsigned int(16) width;
- * unsigned int(16) height;
- * template unsigned int(32) horizresolution = 0x00480000; // 72 dpi
- * template unsigned int(32) vertresolution = 0x00480000; // 72 dpi
- * const unsigned int(32) reserved = 0;
- * template unsigned int(16) frame_count = 1;
- * string[32] compressorname;
- * template unsigned int(16) depth = 0x0018;
- * int(16) pre_defined = -1;
- * }<br>
- * </pre>
- * </code>
- * <p/>
- * Format-specific informationis appened as boxes after the data described in ISO/IEC 14496-12 chapter 8.16.2.
- */
-public class VisualSampleEntry extends SampleEntry implements ContainerBox {
-    public static final String TYPE1 = "mp4v";
-    public static final String TYPE2 = "s263";
-    public static final String TYPE3 = "avc1";
-
-
-    /**
-     * Identifier for an encrypted video track.
-     *
-     * @see com.coremedia.iso.boxes.ProtectionSchemeInformationBox
-     */
-    public static final String TYPE_ENCRYPTED = "encv";
-
-
-    private int width;
-    private int height;
-    private double horizresolution = 72;
-    private double vertresolution = 72;
-    private int frameCount = 1;
-    private String compressorname;
-    private int depth = 24;
-
-    private long[] predefined = new long[3];
-
-    public VisualSampleEntry(String type) {
-        super(type);
-    }
-
-    public int getWidth() {
-        return width;
-    }
-
-    public int getHeight() {
-        return height;
-    }
-
-    public double getHorizresolution() {
-        return horizresolution;
-    }
-
-    public double getVertresolution() {
-        return vertresolution;
-    }
-
-    public int getFrameCount() {
-        return frameCount;
-    }
-
-    public String getCompressorname() {
-        return compressorname;
-    }
-
-    public int getDepth() {
-        return depth;
-    }
-
-    public void setCompressorname(String compressorname) {
-        this.compressorname = compressorname;
-    }
-
-    public void setWidth(int width) {
-        this.width = width;
-    }
-
-    public void setHeight(int height) {
-        this.height = height;
-    }
-
-    public void setHorizresolution(double horizresolution) {
-        this.horizresolution = horizresolution;
-    }
-
-    public void setVertresolution(double vertresolution) {
-        this.vertresolution = vertresolution;
-    }
-
-    public void setFrameCount(int frameCount) {
-        this.frameCount = frameCount;
-    }
-
-    public void setDepth(int depth) {
-        this.depth = depth;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        _parseReservedAndDataReferenceIndex(content);
-        long tmp = IsoTypeReader.readUInt16(content);
-        assert 0 == tmp : "reserved byte not 0";
-        tmp = IsoTypeReader.readUInt16(content);
-        assert 0 == tmp : "reserved byte not 0";
-        predefined[0] = IsoTypeReader.readUInt32(content);     // should be zero
-        predefined[1] = IsoTypeReader.readUInt32(content);     // should be zero
-        predefined[2] = IsoTypeReader.readUInt32(content);     // should be zero
-        width = IsoTypeReader.readUInt16(content);
-        height = IsoTypeReader.readUInt16(content);
-        horizresolution = IsoTypeReader.readFixedPoint1616(content);
-        vertresolution = IsoTypeReader.readFixedPoint1616(content);
-        tmp = IsoTypeReader.readUInt32(content);
-        assert 0 == tmp : "reserved byte not 0";
-        frameCount = IsoTypeReader.readUInt16(content);
-        int compressornameDisplayAbleData = IsoTypeReader.readUInt8(content);
-        if (compressornameDisplayAbleData > 31) {
-            System.out.println("invalid compressor name displayable data: " + compressornameDisplayAbleData);
-            compressornameDisplayAbleData = 31;
-        }
-        byte[] bytes = new byte[compressornameDisplayAbleData];
-        content.get(bytes);
-        compressorname = Utf8.convert(bytes);
-        if (compressornameDisplayAbleData < 31) {
-            byte[] zeros = new byte[31 - compressornameDisplayAbleData];
-            content.get(zeros);
-            //assert Arrays.equals(zeros, new byte[zeros.length]) : "The compressor name length was not filled up with zeros";
-        }
-        depth = IsoTypeReader.readUInt16(content);
-        tmp = IsoTypeReader.readUInt16(content);
-        assert 0xFFFF == tmp;
-
-        _parseChildBoxes(content);
-
-    }
-
-
-    protected long getContentSize() {
-        long contentSize = 78;
-        for (Box boxe : boxes) {
-            contentSize += boxe.getSize();
-        }
-        return contentSize;
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        _writeReservedAndDataReferenceIndex(byteBuffer);
-        IsoTypeWriter.writeUInt16(byteBuffer, 0);
-        IsoTypeWriter.writeUInt16(byteBuffer, 0);
-        IsoTypeWriter.writeUInt32(byteBuffer, predefined[0]);
-        IsoTypeWriter.writeUInt32(byteBuffer, predefined[1]);
-        IsoTypeWriter.writeUInt32(byteBuffer, predefined[2]);
-
-        IsoTypeWriter.writeUInt16(byteBuffer, getWidth());
-        IsoTypeWriter.writeUInt16(byteBuffer, getHeight());
-
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, getHorizresolution());
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, getVertresolution());
-
-
-        IsoTypeWriter.writeUInt32(byteBuffer, 0);
-        IsoTypeWriter.writeUInt16(byteBuffer, getFrameCount());
-        IsoTypeWriter.writeUInt8(byteBuffer, Utf8.utf8StringLengthInBytes(getCompressorname()));
-        byteBuffer.put(Utf8.convert(getCompressorname()));
-        int a = Utf8.utf8StringLengthInBytes(getCompressorname());
-        while (a < 31) {
-            a++;
-            byteBuffer.put((byte) 0);
-        }
-        IsoTypeWriter.writeUInt16(byteBuffer, getDepth());
-        IsoTypeWriter.writeUInt16(byteBuffer, 0xFFFF);
-
-        _writeChildBoxes(byteBuffer);
-
-    }
-
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/Crypto.java b/android/src/main/java/com/github/faucamp/simplertmp/Crypto.java
deleted file mode 100755
index cd2eed5..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/Crypto.java
+++ /dev/null
@@ -1,73 +0,0 @@
-package com.github.faucamp.simplertmp;
-
-import android.util.Log;
-
-import java.security.InvalidKeyException;
-import java.security.NoSuchAlgorithmException;
-import javax.crypto.Mac;
-import javax.crypto.spec.SecretKeySpec;
-
-/**
- * Some helper utilities for SHA256, mostly (used during handshake)
- * This is separated in order to be more easily replaced on platforms that
- * do not have the javax.crypto.* and/or java.security.* packages
- * 
- * This implementation is directly inspired by the RTMPHandshake class of the
- * Red5  Open Source Flash Server project
- * 
- * @author francois
- */
-public class Crypto {
-
-    private static final String TAG = "Crypto";
-
-    private Mac hmacSHA256;
-
-    public Crypto() {
-        try {
-            hmacSHA256 = Mac.getInstance("HmacSHA256");
-        } catch (SecurityException e) {
-            Log.e(TAG, "Security exception when getting HMAC", e);
-        } catch (NoSuchAlgorithmException e) {
-            Log.e(TAG, "HMAC SHA256 does not exist");
-        }
-    }
-
-    /**
-     * Calculates an HMAC SHA256 hash using a default key length.
-     *
-     *
-     * @param input
-     * @param key
-     * @return hmac hashed bytes
-     */
-    public byte[] calculateHmacSHA256(byte[] input, byte[] key) {
-        byte[] output = null;
-        try {
-            hmacSHA256.init(new SecretKeySpec(key, "HmacSHA256"));
-            output = hmacSHA256.doFinal(input);
-        } catch (InvalidKeyException e) {
-            Log.e(TAG, "Invalid key", e);
-        }
-        return output;
-    }
-
-    /**
-     * Calculates an HMAC SHA256 hash using a set key length.
-     *
-     * @param input
-     * @param key
-     * @param length
-     * @return hmac hashed bytes
-     */
-    public byte[] calculateHmacSHA256(byte[] input, byte[] key, int length) {
-        byte[] output = null;
-        try {
-            hmacSHA256.init(new SecretKeySpec(key, 0, length, "HmacSHA256"));
-            output = hmacSHA256.doFinal(input);
-        } catch (InvalidKeyException e) {
-            Log.e(TAG, "Invalid key", e);
-        }
-        return output;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/DefaultRtmpPublisher.java b/android/src/main/java/com/github/faucamp/simplertmp/DefaultRtmpPublisher.java
deleted file mode 100755
index 8acc774..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/DefaultRtmpPublisher.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.github.faucamp.simplertmp;
-
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.github.faucamp.simplertmp.io.RtmpConnection;
-
-/**
- * Srs implementation of an RTMP publisher
- * 
- * @author francois, leoma
- */
-public class DefaultRtmpPublisher implements RtmpPublisher {
-
-    private RtmpConnection rtmpConnection;
-
-    public DefaultRtmpPublisher(RtmpHandler handler) {
-        rtmpConnection = new RtmpConnection(handler);
-    }
-
-    @Override
-    public boolean connect(String url) {
-        return rtmpConnection.connect(url);
-    }
-
-    @Override
-    public boolean publish(String publishType) {
-        return rtmpConnection.publish(publishType);
-    }
-
-    @Override
-    public void close() {
-        rtmpConnection.close();
-    }
-
-    @Override
-    public void publishVideoData(byte[] data, int size, int dts) {
-        rtmpConnection.publishVideoData(data, size, dts);
-    }
-
-    @Override
-    public void publishAudioData(byte[] data, int size, int dts) {
-        rtmpConnection.publishAudioData(data, size, dts);
-    }
-
-    @Override
-    public AtomicInteger getVideoFrameCacheNumber() {
-        return rtmpConnection.getVideoFrameCacheNumber();
-    }
-
-    @Override
-    public final String getServerIpAddr() {
-        return rtmpConnection.getServerIpAddr();
-    }
-
-    @Override
-    public final int getServerPid() {
-        return rtmpConnection.getServerPid();
-    }
-
-    @Override
-    public final int getServerId() {
-        return rtmpConnection.getServerId();
-    }
-
-    @Override
-    public void setVideoResolution(int width, int height) {
-        rtmpConnection.setVideoResolution(width, height);
-    }
-
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/RtmpHandler.java b/android/src/main/java/com/github/faucamp/simplertmp/RtmpHandler.java
deleted file mode 100755
index b077e0d..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/RtmpHandler.java
+++ /dev/null
@@ -1,169 +0,0 @@
-package com.github.faucamp.simplertmp;
-
-import android.os.Handler;
-import android.os.Message;
-
-import java.io.IOException;
-import java.lang.ref.WeakReference;
-import java.net.SocketException;
-
-/**
- * Created by leo.ma on 2016/11/3.
- */
-
-public class RtmpHandler extends Handler {
-
-    private static final int MSG_RTMP_CONNECTING = 0;
-    private static final int MSG_RTMP_CONNECTED = 1;
-    private static final int MSG_RTMP_VIDEO_STREAMING = 2;
-    private static final int MSG_RTMP_AUDIO_STREAMING = 3;
-    private static final int MSG_RTMP_STOPPED = 4;
-    private static final int MSG_RTMP_DISCONNECTED = 5;
-    private static final int MSG_RTMP_VIDEO_FPS_CHANGED = 6;
-    private static final int MSG_RTMP_VIDEO_BITRATE_CHANGED = 7;
-    private static final int MSG_RTMP_AUDIO_BITRATE_CHANGED = 8;
-
-    private static final int MSG_RTMP_SOCKET_EXCEPTION = 9;
-    private static final int MSG_RTMP_IO_EXCEPTION = 10;
-    private static final int MSG_RTMP_ILLEGAL_ARGUMENT_EXCEPTION = 11;
-    private static final int MSG_RTMP_ILLEGAL_STATE_EXCEPTION = 12;
-
-    private WeakReference<RtmpListener> mWeakListener;
-
-    public RtmpHandler(RtmpListener listener) {
-        mWeakListener = new WeakReference<>(listener);
-    }
-
-    public void notifyRtmpConnecting(String msg) {
-        obtainMessage(MSG_RTMP_CONNECTING, msg).sendToTarget();
-    }
-
-    public void notifyRtmpConnected(String msg) {
-        obtainMessage(MSG_RTMP_CONNECTED, msg).sendToTarget();
-    }
-
-    public void notifyRtmpVideoStreaming() {
-        sendEmptyMessage(MSG_RTMP_VIDEO_STREAMING);
-    }
-
-    public void notifyRtmpAudioStreaming() {
-        sendEmptyMessage(MSG_RTMP_AUDIO_STREAMING);
-    }
-
-    public void notifyRtmpStopped() {
-        sendEmptyMessage(MSG_RTMP_STOPPED);
-    }
-
-    public void notifyRtmpDisconnected() {
-        sendEmptyMessage(MSG_RTMP_DISCONNECTED);
-    }
-
-    public void notifyRtmpVideoFpsChanged(double fps) {
-        obtainMessage(MSG_RTMP_VIDEO_FPS_CHANGED, fps).sendToTarget();
-    }
-
-    public void notifyRtmpVideoBitrateChanged(double bitrate) {
-        obtainMessage(MSG_RTMP_VIDEO_BITRATE_CHANGED, bitrate).sendToTarget();
-    }
-
-    public void notifyRtmpAudioBitrateChanged(double bitrate) {
-        obtainMessage(MSG_RTMP_AUDIO_BITRATE_CHANGED, bitrate).sendToTarget();
-    }
-
-    public void notifyRtmpSocketException(SocketException e) {
-        obtainMessage(MSG_RTMP_SOCKET_EXCEPTION, e).sendToTarget();
-    }
-
-    public void notifyRtmpIOException(IOException e) {
-        obtainMessage(MSG_RTMP_IO_EXCEPTION, e).sendToTarget();
-    }
-
-    public void notifyRtmpIllegalArgumentException(IllegalArgumentException e) {
-        obtainMessage(MSG_RTMP_ILLEGAL_ARGUMENT_EXCEPTION, e).sendToTarget();
-    }
-
-    public void notifyRtmpIllegalStateException(IllegalStateException e) {
-        obtainMessage(MSG_RTMP_ILLEGAL_STATE_EXCEPTION, e).sendToTarget();
-    }
-
-    @Override  // runs on UI thread
-    public void handleMessage(Message msg) {
-        RtmpListener listener = mWeakListener.get();
-        if (listener == null) {
-            return;
-        }
-
-        switch (msg.what) {
-            case MSG_RTMP_CONNECTING:
-                listener.onRtmpConnecting((String) msg.obj);
-                break;
-            case MSG_RTMP_CONNECTED:
-                listener.onRtmpConnected((String) msg.obj);
-                break;
-            case MSG_RTMP_VIDEO_STREAMING:
-                listener.onRtmpVideoStreaming();
-                break;
-            case MSG_RTMP_AUDIO_STREAMING:
-                listener.onRtmpAudioStreaming();
-                break;
-            case MSG_RTMP_STOPPED:
-                listener.onRtmpStopped();
-                break;
-            case MSG_RTMP_DISCONNECTED:
-                listener.onRtmpDisconnected();
-                break;
-            case MSG_RTMP_VIDEO_FPS_CHANGED:
-                listener.onRtmpVideoFpsChanged((double) msg.obj);
-                break;
-            case MSG_RTMP_VIDEO_BITRATE_CHANGED:
-                listener.onRtmpVideoBitrateChanged((double) msg.obj);
-                break;
-            case MSG_RTMP_AUDIO_BITRATE_CHANGED:
-                listener.onRtmpAudioBitrateChanged((double) msg.obj);
-                break;
-            case MSG_RTMP_SOCKET_EXCEPTION:
-                listener.onRtmpSocketException((SocketException) msg.obj);
-                break;
-            case MSG_RTMP_IO_EXCEPTION:
-                listener.onRtmpIOException((IOException) msg.obj);
-                break;
-            case MSG_RTMP_ILLEGAL_ARGUMENT_EXCEPTION:
-                listener.onRtmpIllegalArgumentException((IllegalArgumentException) msg.obj);
-                break;
-            case MSG_RTMP_ILLEGAL_STATE_EXCEPTION:
-                listener.onRtmpIllegalStateException((IllegalStateException) msg.obj);
-                break;
-            default:
-                throw new RuntimeException("unknown msg " + msg.what);
-        }
-    }
-
-    public interface RtmpListener {
-        
-        void onRtmpConnecting(String msg);
-
-        void onRtmpConnected(String msg);
-
-        void onRtmpVideoStreaming();
-
-        void onRtmpAudioStreaming();
-
-        void onRtmpStopped();
-
-        void onRtmpDisconnected();
-
-        void onRtmpVideoFpsChanged(double fps);
-
-        void onRtmpVideoBitrateChanged(double bitrate);
-
-        void onRtmpAudioBitrateChanged(double bitrate);
-
-        void onRtmpSocketException(SocketException e);
-
-        void onRtmpIOException(IOException e);
-
-        void onRtmpIllegalArgumentException(IllegalArgumentException e);
-
-        void onRtmpIllegalStateException(IllegalStateException e);
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/RtmpPublisher.java b/android/src/main/java/com/github/faucamp/simplertmp/RtmpPublisher.java
deleted file mode 100755
index e667f97..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/RtmpPublisher.java
+++ /dev/null
@@ -1,82 +0,0 @@
-package com.github.faucamp.simplertmp;
-
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Simple RTMP publisher, using vanilla Java networking (no NIO)
- * This was created primarily to address a NIO bug in Android 2.2 when
- * used with Apache Mina, but also to provide an easy-to-use way to access
- * RTMP streams
- * 
- * @author francois, leo
- */
-public interface RtmpPublisher {
-    /**
-     * Issues an RTMP "connect" command and wait for the response.
-     *
-     * @param url specify the RTMP url
-     * @return If succeeded return true else return false
-     */
-    boolean connect(String url);
-    
-    /**
-     * Issues an RTMP "publish" command and write the media content stream packets (audio and video). 
-     * 
-     * @param publishType specify the way to publish raw RTMP packets among "live", "record" and "append"
-     * @return If succeeded return true else return false
-     * @throws IllegalStateException if the client is not connected to a RTMP server
-     */
-    boolean publish(String publishType);
-     
-    /**
-     * Stop and close the current RTMP streaming client.
-     */
-    void close();
-
-    /**
-     * publish a video content packet to server
-     *
-     * @param data video stream byte array
-     * @param size video stream byte size (not the whole length of byte array)
-     * @param dts video stream decoding timestamp
-     */
-    void publishVideoData(byte[] data, int size, int dts);
-
-    /**
-     * publish an audio content packet to server
-     *
-     * @param data audio stream byte array
-     * @param size audio stream byte size (not the whole length of byte array)
-     * @param dts audio stream decoding timestamp
-     */
-    void publishAudioData(byte[] data, int size, int dts);
-
-    /**
-     * obtain video frame number cached in publisher
-     */
-    AtomicInteger getVideoFrameCacheNumber();
-
-    /**
-     * obtain the IP address of the peer if any
-     */
-    String getServerIpAddr();
-
-    /**
-     * obtain the PID of the peer if any
-     */
-    int getServerPid();
-
-    /**
-     * obtain the ID of the peer if any
-     */
-    int getServerId();
-
-    /**
-     * set video resolution
-     *
-     * @param width video width
-     * @param height video height
-     */
-    void setVideoResolution(int width, int height);
-
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/Util.java b/android/src/main/java/com/github/faucamp/simplertmp/Util.java
deleted file mode 100755
index 6be2e79..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/Util.java
+++ /dev/null
@@ -1,138 +0,0 @@
-package com.github.faucamp.simplertmp;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * Misc utility method
- * @author francois
- */
-public class Util {
-
-    private static final String HEXES = "0123456789ABCDEF";
-
-    public static void writeUnsignedInt32(OutputStream out, int value) throws IOException {
-        out.write((byte) (value >>> 24));
-        out.write((byte) (value >>> 16));
-        out.write((byte) (value >>> 8));
-        out.write((byte) value);
-    }
-
-    public static int readUnsignedInt32(InputStream in) throws IOException {
-        return ((in.read() & 0xff) << 24) | ((in.read() & 0xff) << 16) | ((in.read() & 0xff) << 8) | (in.read() & 0xff);
-    }
-
-    public static int readUnsignedInt24(InputStream in) throws IOException {
-        return ((in.read() & 0xff) << 16) | ((in.read() & 0xff) << 8) | (in.read() & 0xff);
-    }
-
-    public static int readUnsignedInt16(InputStream in) throws IOException {
-        return ((in.read() & 0xff) << 8) | (in.read() & 0xff);
-    }
-
-    public static void writeUnsignedInt24(OutputStream out, int value) throws IOException {
-        out.write((byte) (value >>> 16));
-        out.write((byte) (value >>> 8));
-        out.write((byte) value);
-    }
-
-    public static void writeUnsignedInt16(OutputStream out, int value) throws IOException {
-        out.write((byte) (value >>> 8));
-        out.write((byte) value);
-    }
-
-    public static int toUnsignedInt32(byte[] bytes) {
-        return (((int) bytes[0] & 0xff) << 24) | (((int)bytes[1] & 0xff) << 16) | (((int)bytes[2] & 0xff) << 8) | ((int)bytes[3] & 0xff);
-    }
-
-    public static int toUnsignedInt32LittleEndian(byte[] bytes) {
-        return ((bytes[3] & 0xff) << 24) | ((bytes[2] & 0xff) << 16) | ((bytes[1] & 0xff) << 8) | (bytes[0] & 0xff);
-    }
-    
-    public static void writeUnsignedInt32LittleEndian(OutputStream out, int value) throws IOException {        
-        out.write((byte) value);
-        out.write((byte) (value >>> 8));
-        out.write((byte) (value >>> 16));
-        out.write((byte) (value >>> 24));
-    }
-    
-    public static int toUnsignedInt24(byte[] bytes) {
-        return ((bytes[1] & 0xff) << 16) | ((bytes[2] & 0xff) << 8) | (bytes[3] & 0xff);
-    }
-
-    public static int toUnsignedInt16(byte[] bytes) {
-        return ((bytes[2] & 0xff) << 8) | (bytes[3] & 0xff);
-    }
-
-    public static String toHexString(byte[] raw) {
-        if (raw == null) {
-            return null;
-        }
-        final StringBuilder hex = new StringBuilder(2 * raw.length);
-        for (final byte b : raw) {
-            hex.append(HEXES.charAt((b & 0xF0) >> 4)).append(HEXES.charAt((b & 0x0F)));
-        }
-        return hex.toString();
-    }
-    
-    public static String toHexString(byte b) {
-        return new StringBuilder().append(HEXES.charAt((b & 0xF0) >> 4)).append(HEXES.charAt((b & 0x0F))).toString();        
-    }
-
-    /** 
-     * Reads bytes from the specified inputstream into the specified target buffer until it is filled up     
-     */
-    public static void readBytesUntilFull(InputStream in, byte[] targetBuffer) throws IOException {
-        int totalBytesRead = 0;
-        int read;
-        final int targetBytes = targetBuffer.length;
-        do {
-            read = in.read(targetBuffer, totalBytesRead, (targetBytes - totalBytesRead));
-            if (read != -1) {
-                totalBytesRead += read;
-            } else {
-                throw new IOException("Unexpected EOF reached before read buffer was filled");
-            }
-        } while (totalBytesRead < targetBytes);
-    }    
-
-    public static byte[] toByteArray(double d) {
-        long l = Double.doubleToRawLongBits(d);
-        return new byte[]{
-                    (byte) ((l >> 56) & 0xff),
-                    (byte) ((l >> 48) & 0xff),
-                    (byte) ((l >> 40) & 0xff),
-                    (byte) ((l >> 32) & 0xff),
-                    (byte) ((l >> 24) & 0xff),
-                    (byte) ((l >> 16) & 0xff),
-                    (byte) ((l >> 8) & 0xff),
-                    (byte) (l & 0xff),};
-    }
-
-    public static byte[] unsignedInt32ToByteArray(int value) throws IOException {
-        return new byte[]{
-                    (byte) (value >>> 24),
-                    (byte) (value >>> 16),
-                    (byte) (value >>> 8),
-                    (byte) value};
-    }
-
-    public static double readDouble(InputStream in) throws IOException {
-        long bits = ((long) (in.read() & 0xff) << 56) | ((long) (in.read() & 0xff) << 48) | ((long) (in.read() & 0xff) << 40) | ((long) (in.read() & 0xff) << 32) | ((in.read() & 0xff) << 24) | ((in.read() & 0xff) << 16) | ((in.read() & 0xff) << 8) | (in.read() & 0xff);
-        return Double.longBitsToDouble(bits);
-    }       
-
-    public static void writeDouble(OutputStream out, double d) throws IOException {
-        long l = Double.doubleToRawLongBits(d);
-        out.write(new byte[]{
-                    (byte) ((l >> 56) & 0xff),
-                    (byte) ((l >> 48) & 0xff),
-                    (byte) ((l >> 40) & 0xff),
-                    (byte) ((l >> 32) & 0xff),
-                    (byte) ((l >> 24) & 0xff),
-                    (byte) ((l >> 16) & 0xff),
-                    (byte) ((l >> 8) & 0xff),
-                    (byte) (l & 0xff)});
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfArray.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfArray.java
deleted file mode 100755
index eef63e5..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfArray.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.github.faucamp.simplertmp.Util;
-
-/**
- * AMF Array
- * 
- * @author francois
- */
-public class AmfArray implements AmfData {
-
-    private List<AmfData> items;
-    private int size = -1;
-
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        throw new UnsupportedOperationException("Not supported yet.");
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)
-        int length = Util.readUnsignedInt32(in);
-        size = 5; // 1 + 4
-        items = new ArrayList<AmfData>(length);
-        for (int i = 0; i < length; i++) {
-            AmfData dataItem = AmfDecoder.readFrom(in);
-            size += dataItem.getSize();
-            items.add(dataItem);
-        }
-    }
-
-    @Override
-    public int getSize() {
-        if (size == -1) {
-            size = 5; // 1 + 4
-            if (items != null) {
-                for (AmfData dataItem : items) {
-                    size += dataItem.getSize();
-                }
-            }
-        }
-        return size;
-    }
-
-    /** @return the amount of items in this the array */
-    public int getLength() {
-        return items != null ? items.size() : 0;
-    }
-
-    public List<AmfData> getItems() {
-        if (items == null) {
-            items = new ArrayList<AmfData>();
-        }
-        return items;
-    }
-
-    public void addItem(AmfData dataItem) {
-        getItems().add(this);
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfBoolean.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfBoolean.java
deleted file mode 100755
index 4fbbdfe..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfBoolean.java
+++ /dev/null
@@ -1,50 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- *
- * @author francois
- */
-public class AmfBoolean implements AmfData {
-    
-    private boolean value;
-    
-    public boolean isValue() {
-        return value;
-    }
-    
-    public void setValue(boolean value) {
-        this.value = value;
-    }
-    
-    public AmfBoolean(boolean value) {
-        this.value = value;
-    }
-    
-    public AmfBoolean() {
-    }
-    
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        out.write(AmfType.BOOLEAN.getValue());
-        out.write(value ? 0x01 : 0x00);
-    }
-    
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        value = (in.read() == 0x01) ? true : false;
-    }
-    
-    public static boolean readBooleanFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)
-        return (in.read() == 0x01) ? true : false;
-    }
-    
-    @Override
-    public int getSize() {
-        return 2;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfData.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfData.java
deleted file mode 100755
index 766d4c0..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfData.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * Base AMF data object. All other AMF data type instances derive from this
- * (including AmfObject)
- * 
- * @author francois
- */
-public interface AmfData {
-       
-    /** 
-     * Write/Serialize this AMF data intance (Object/string/integer etc) to
-     * the specified OutputStream
-     */
-    void writeTo(OutputStream out) throws IOException;
-    
-    /**
-     * Read and parse bytes from the specified input stream to populate this
-     * AMFData instance (deserialize)
-     * 
-     * @return the amount of bytes read
-     */
-    void readFrom(InputStream in) throws IOException;
-    
-    /** @return the amount of bytes required for this object */
-    int getSize();
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfDecoder.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfDecoder.java
deleted file mode 100755
index 6405b38..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfDecoder.java
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-/**
- *
- * @author francois
- */
-public class AmfDecoder {
-
-    public static AmfData readFrom(InputStream in) throws IOException {
-
-        byte amfTypeByte = (byte) in.read();
-        AmfType amfType = AmfType.valueOf(amfTypeByte);
-
-        AmfData amfData;
-        switch (amfType) {
-            case NUMBER:
-                amfData = new AmfNumber();
-                break;
-            case BOOLEAN:
-                amfData = new AmfBoolean();
-                break;
-            case STRING:
-                amfData = new AmfString();
-                break;
-            case OBJECT:
-                amfData = new AmfObject();
-                break;
-            case NULL:
-                return new AmfNull();
-            case UNDEFINED:
-                return new AmfUndefined();
-            case MAP:
-                amfData = new AmfMap();
-                break;
-            case ARRAY:
-                amfData = new AmfArray();
-                break;
-            default:
-                throw new IOException("Unknown/unimplemented AMF data type: " + amfType);
-        }
-
-        amfData.readFrom(in);
-        return amfData;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfMap.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfMap.java
deleted file mode 100755
index b0eedad..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfMap.java
+++ /dev/null
@@ -1,53 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Map;
-
-import com.github.faucamp.simplertmp.Util;
-
-/**
- * AMF map; that is, an "object"-like structure of key/value pairs, but with 
- * an array-like size indicator at the start (which is seemingly always 0)
- * 
- * @author francois
- */
-public class AmfMap extends AmfObject {
-
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        // Begin the map/object/array/whatever exactly this is
-        out.write(AmfType.MAP.getValue());
-
-        // Write the "array size"
-        Util.writeUnsignedInt32(out, properties.size());
-
-        // Write key/value pairs in this object        
-        for (Map.Entry<String, AmfData> entry : properties.entrySet()) {
-            // The key must be a STRING type, and thus the "type-definition" byte is implied (not included in message)
-            AmfString.writeStringTo(out, entry.getKey(), true);
-            entry.getValue().writeTo(out);
-        }
-
-        // End the object        
-        out.write(OBJECT_END_MARKER);
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)
-        int length = Util.readUnsignedInt32(in); // Seems this is always 0
-        super.readFrom(in);
-        size += 4; // Add the bytes read for parsing the array size (length)
-    }
-
-    @Override
-    public int getSize() {
-        if (size == -1) {
-            size = super.getSize();
-            size += 4; // array length bytes
-        }
-        return size;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfNull.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfNull.java
deleted file mode 100755
index f25249f..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfNull.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- *
- * @author francois
- */
-public class AmfNull implements AmfData {
-
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        out.write(AmfType.NULL.getValue());
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)    
-    }
-    
-    public static void writeNullTo(OutputStream out) throws IOException {
-        out.write(AmfType.NULL.getValue());
-    }
-
-    @Override
-    public int getSize() {
-        return 1;
-    }    
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfNumber.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfNumber.java
deleted file mode 100755
index 79dccf2..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfNumber.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.Util;
-
-/**
- * AMF0 Number data type
- * 
- * @author francois
- */
-public class AmfNumber implements AmfData {
-
-    double value;
-    /** Size of an AMF number, in bytes (including type bit) */
-    public static final int SIZE = 9;
-
-    public AmfNumber(double value) {
-        this.value = value;
-    }
-
-    public AmfNumber() {
-    }
-
-    public double getValue() {
-        return value;
-    }
-
-    public void setValue(double value) {
-        this.value = value;
-    }    
-    
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        out.write(AmfType.NUMBER.getValue());
-        Util.writeDouble(out, value);
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)
-        value = Util.readDouble(in);
-    }
-    
-    public static double readNumberFrom(InputStream in) throws IOException {
-       // Skip data type byte
-       in.read();       
-       return Util.readDouble(in);
-    }
-    
-    public static void writeNumberTo(OutputStream out, double number) throws IOException {
-       out.write(AmfType.NUMBER.getValue());
-       Util.writeDouble(out, number);
-    } 
-    
-    @Override
-    public int getSize() {
-        return SIZE;
-    }
-    
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfObject.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfObject.java
deleted file mode 100755
index cdd3f57..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfObject.java
+++ /dev/null
@@ -1,109 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-/**
- * AMF object
- * 
- * @author francois
- */
-public class AmfObject implements AmfData {
-
-    protected Map<String, AmfData> properties = new LinkedHashMap<String, AmfData>();
-    protected int size = -1;
-    /** Byte sequence that marks the end of an AMF object */
-    protected static final byte[] OBJECT_END_MARKER = new byte[]{0x00, 0x00, 0x09};
-
-    public AmfObject() {
-    }
-
-    public AmfData getProperty(String key) {
-        return properties.get(key);
-    }
-
-    public void setProperty(String key, AmfData value) {
-        properties.put(key, value);
-    }
-
-    public void setProperty(String key, boolean value) {
-        properties.put(key, new AmfBoolean(value));
-    }
-
-    public void setProperty(String key, String value) {
-        properties.put(key, new AmfString(value, false));
-    }
-
-    public void setProperty(String key, int value) {
-        properties.put(key, new AmfNumber(value));
-    }
-
-    public void setProperty(String key, double value) {
-        properties.put(key, new AmfNumber(value));
-    }
-
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        // Begin the object
-        out.write(AmfType.OBJECT.getValue());
-
-        // Write key/value pairs in this object        
-        for (Map.Entry<String, AmfData> entry : properties.entrySet()) {
-            // The key must be a STRING type, and thus the "type-definition" byte is implied (not included in message)
-            AmfString.writeStringTo(out, entry.getKey(), true);
-            entry.getValue().writeTo(out);
-        }
-
-        // End the object        
-        out.write(OBJECT_END_MARKER);
-
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)       
-        size = 1;
-        InputStream markInputStream = in.markSupported() ? in : new BufferedInputStream(in);
-
-        while (true) {
-            // Look for the 3-byte object end marker [0x00 0x00 0x09]
-            markInputStream.mark(3);
-            byte[] endMarker = new byte[3];
-            markInputStream.read(endMarker);
-
-            if (endMarker[0] == OBJECT_END_MARKER[0] && endMarker[1] == OBJECT_END_MARKER[1] && endMarker[2] == OBJECT_END_MARKER[2]) {
-                // End marker found
-                size += 3;
-                return;
-            } else {
-                // End marker not found; reset the stream to the marked position and read an AMF property
-                markInputStream.reset();
-                // Read the property key...
-                String key = AmfString.readStringFrom(in, true);
-                size += AmfString.sizeOf(key, true);
-                // ...and the property value
-                AmfData value = AmfDecoder.readFrom(markInputStream);
-                size += value.getSize();
-                properties.put(key, value);
-            }
-        }
-    }
-
-    @Override
-    public int getSize() {
-        if (size == -1) {
-            size = 1; // object marker                
-            for (Map.Entry<String, AmfData> entry : properties.entrySet()) {
-                size += AmfString.sizeOf(entry.getKey(), true);
-                size += entry.getValue().getSize();
-            }
-            size += 3; // end of object marker
-
-        }
-        return size;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfString.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfString.java
deleted file mode 100755
index 927d791..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfString.java
+++ /dev/null
@@ -1,130 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
-import java.lang.String;
-
-import android.util.Log;
-
-import com.github.faucamp.simplertmp.Util;
-
-/**
- *
- * @author francois
- */
-public class AmfString implements AmfData {
-
-    private static final String TAG = "AmfString";
-	
-    private String value;
-    private boolean key;
-    private int size = -1;
-
-    public AmfString() {
-    }
-
-    public AmfString(String value, boolean isKey) {
-        this.value = value;
-        this.key = isKey;
-    }
-
-    public AmfString(String value) {
-        this(value, false);
-    }
-
-    public AmfString(boolean isKey) {
-        this.key = isKey;
-    }
-
-    public String getValue() {
-        return value;
-    }
-
-    public void setValue(String value) {
-        this.value = value;
-    }
-
-    public boolean isKey() {
-        return key;
-    }
-
-    public void setKey(boolean key) {
-        this.key = key;
-    }
-
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        // Strings are ASCII encoded
-        byte[] byteValue = this.value.getBytes("ASCII");
-        // Write the STRING data type definition (except if this String is used as a key)
-        if (!key) {
-            out.write(AmfType.STRING.getValue());
-        }
-        // Write 2 bytes indicating string length
-        Util.writeUnsignedInt16(out, byteValue.length);
-        // Write string
-        out.write(byteValue);
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)        
-        int length = Util.readUnsignedInt16(in);
-        size = 3 + length; // 1 + 2 + length
-        // Read string value
-        byte[] byteValue = new byte[length];
-        Util.readBytesUntilFull(in, byteValue);
-        value = new String(byteValue, "ASCII");
-    }
-
-    public static String readStringFrom(InputStream in, boolean isKey) throws IOException {
-        if (!isKey) {
-            // Read past the data type byte
-            in.read();
-        }
-        int length = Util.readUnsignedInt16(in);
-        // Read string value
-        byte[] byteValue = new byte[length];
-        Util.readBytesUntilFull(in, byteValue);
-        return new String(byteValue, "ASCII");
-    }
-
-    public static void writeStringTo(OutputStream out, String string, boolean isKey) throws IOException {
-        // Strings are ASCII encoded
-        byte[] byteValue = string.getBytes("ASCII");
-        // Write the STRING data type definition (except if this String is used as a key)
-        if (!isKey) {
-            out.write(AmfType.STRING.getValue());
-        }
-        // Write 2 bytes indicating string length
-        Util.writeUnsignedInt16(out, byteValue.length);
-        // Write string
-        out.write(byteValue);
-    }
-
-    @Override
-    public int getSize() {
-        if (size == -1) {
-            try {
-                size = (isKey() ? 0 : 1) + 2 + value.getBytes("ASCII").length;
-            } catch (UnsupportedEncodingException ex) {
-                Log.e(TAG, "AmfString.getSize(): caught exception", ex);
-                throw new RuntimeException(ex);
-            }
-        }
-        return size;
-    }
-
-    /** @return the byte size of the resulting AMF string of the specified value */
-    public static int sizeOf(String string, boolean isKey) {
-        try {
-            int size = (isKey ? 0 : 1) + 2 + string.getBytes("ASCII").length;
-            return size;
-        } catch (UnsupportedEncodingException ex) {
-            Log.e(TAG, "AmfString.SizeOf(): caught exception", ex);
-            throw new RuntimeException(ex);
-        }
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfType.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfType.java
deleted file mode 100755
index f40ff6a..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfType.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package com.github.faucamp.simplertmp.amf;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * AMF0 data type enum 
- * 
- * @author francois
- */
-public enum AmfType {
-
-    /** Number (encoded as IEEE 64-bit double precision floating point number) */
-    NUMBER(0x00),
-    /** Boolean (Encoded as a single byte of value 0x00 or 0x01) */
-    BOOLEAN(0x01),
-    /** String (ASCII encoded) */
-    STRING(0x02),
-    /** Object - set of key/value pairs */
-    OBJECT(0x03),
-    NULL(0x05),
-    UNDEFINED(0x06),
-    MAP(0x08),
-    ARRAY(0x0A);
-    private byte value;
-    private static final Map<Byte, AmfType> quickLookupMap = new HashMap<Byte, AmfType>();
-
-    static {
-        for (AmfType amfType : AmfType.values()) {
-            quickLookupMap.put(amfType.getValue(), amfType);
-        }
-    }
-
-    private AmfType(int intValue) {
-        this.value = (byte) intValue;
-    }
-
-    public byte getValue() {
-        return value;
-    }
-
-    public static AmfType valueOf(byte amfTypeByte) {
-        return quickLookupMap.get(amfTypeByte);
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfUndefined.java b/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfUndefined.java
deleted file mode 100755
index 2d38559..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/amf/AmfUndefined.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
-package com.github.faucamp.simplertmp.amf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- *
- * @author leoma
- */
-public class AmfUndefined implements AmfData {
-
-    @Override
-    public void writeTo(OutputStream out) throws IOException {
-        out.write(AmfType.UNDEFINED.getValue());
-    }
-
-    @Override
-    public void readFrom(InputStream in) throws IOException {
-        // Skip data type byte (we assume it's already read)    
-    }
-    
-    public static void writeUndefinedTo(OutputStream out) throws IOException {
-        out.write(AmfType.UNDEFINED.getValue());
-    }
-
-    @Override
-    public int getSize() {
-        return 1;
-    }    
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/io/ChunkStreamInfo.java b/android/src/main/java/com/github/faucamp/simplertmp/io/ChunkStreamInfo.java
deleted file mode 100755
index 5448b7d..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/io/ChunkStreamInfo.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package com.github.faucamp.simplertmp.io;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.packets.RtmpHeader;
-
-/**
- * Chunk stream channel information
- * 
- * @author francois, leo
- */
-public class ChunkStreamInfo {
-
-    public static final byte RTMP_CID_PROTOCOL_CONTROL = 0x02;
-    public static final byte RTMP_CID_OVER_CONNECTION = 0x03;
-    public static final byte RTMP_CID_OVER_CONNECTION2 = 0x04;
-    public static final byte RTMP_CID_OVER_STREAM = 0x05;
-    public static final byte RTMP_CID_VIDEO = 0x06;
-    public static final byte RTMP_CID_AUDIO = 0x07;
-    private RtmpHeader prevHeaderRx;
-    private RtmpHeader prevHeaderTx;
-    private static long sessionBeginTimestamp;
-    private long realLastTimestamp = System.nanoTime() / 1000000;  // Do not use wall time!
-    private ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 128);
-
-    /** @return the previous header that was received on this channel, or <code>null</code> if no previous header was received */
-    public RtmpHeader prevHeaderRx() {
-        return prevHeaderRx;
-    }
-
-    /** Sets the previous header that was received on this channel, or <code>null</code> if no previous header was sent */
-    public void setPrevHeaderRx(RtmpHeader previousHeader) {
-        this.prevHeaderRx = previousHeader;
-    }
-
-    /** @return the previous header that was transmitted on this channel */
-    public RtmpHeader getPrevHeaderTx() {
-        return prevHeaderTx;
-    }
-
-    public boolean canReusePrevHeaderTx(RtmpHeader.MessageType forMessageType) {
-        return (prevHeaderTx != null && prevHeaderTx.getMessageType() == forMessageType);
-    }
-
-    /** Sets the previous header that was transmitted on this channel */
-    public void setPrevHeaderTx(RtmpHeader prevHeaderTx) {
-        this.prevHeaderTx = prevHeaderTx;
-    }
-
-    /** Sets the session beginning timestamp for all chunks */
-    public static void markSessionTimestampTx() {
-        sessionBeginTimestamp = System.nanoTime() / 1000000;
-    }
-
-    /** Utility method for calculating & synchronizing transmitted timestamps */
-    public long markAbsoluteTimestampTx() {
-        return System.nanoTime() / 1000000 - sessionBeginTimestamp;
-    }
-
-    /** Utility method for calculating & synchronizing transmitted timestamp deltas */
-    public long markDeltaTimestampTx() {
-        long currentTimestamp = System.nanoTime() / 1000000;
-        long diffTimestamp = currentTimestamp - realLastTimestamp;
-        realLastTimestamp = currentTimestamp;
-        return diffTimestamp;
-    }
-
-    /** @return <code>true</code> if all packet data has been stored, or <code>false</code> if not */
-    public boolean storePacketChunk(InputStream in, int chunkSize) throws IOException {
-        final int remainingBytes = prevHeaderRx.getPacketLength() - baos.size();
-        byte[] chunk = new byte[Math.min(remainingBytes, chunkSize)];
-        Util.readBytesUntilFull(in, chunk);
-        baos.write(chunk);
-        return (baos.size() == prevHeaderRx.getPacketLength());
-    }
-
-    public ByteArrayInputStream getStoredPacketInputStream() {
-        ByteArrayInputStream bis = new ByteArrayInputStream(baos.toByteArray());
-        baos.reset();
-        return bis;
-    }
-    
-    /** Clears all currently-stored packet chunks (used when an ABORT packet is received) */
-    public void clearStoredChunks() {
-        baos.reset();
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpConnection.java b/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpConnection.java
deleted file mode 100755
index 4525011..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpConnection.java
+++ /dev/null
@@ -1,673 +0,0 @@
-package com.github.faucamp.simplertmp.io;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.SocketAddress;
-import java.net.SocketException;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import android.util.Log;
-
-import com.github.faucamp.simplertmp.RtmpHandler;
-import com.github.faucamp.simplertmp.RtmpPublisher;
-import com.github.faucamp.simplertmp.amf.AmfMap;
-import com.github.faucamp.simplertmp.amf.AmfNull;
-import com.github.faucamp.simplertmp.amf.AmfNumber;
-import com.github.faucamp.simplertmp.amf.AmfObject;
-import com.github.faucamp.simplertmp.amf.AmfString;
-import com.github.faucamp.simplertmp.packets.Abort;
-import com.github.faucamp.simplertmp.packets.Data;
-import com.github.faucamp.simplertmp.packets.Handshake;
-import com.github.faucamp.simplertmp.packets.Command;
-import com.github.faucamp.simplertmp.packets.Audio;
-import com.github.faucamp.simplertmp.packets.SetPeerBandwidth;
-import com.github.faucamp.simplertmp.packets.Video;
-import com.github.faucamp.simplertmp.packets.UserControl;
-import com.github.faucamp.simplertmp.packets.RtmpPacket;
-import com.github.faucamp.simplertmp.packets.WindowAckSize;
-
-/**
- * Main RTMP connection implementation class
- * 
- * @author francois, leoma
- */
-public class RtmpConnection implements RtmpPublisher {
-
-    private static final String TAG = "RtmpConnection";
-    private static final Pattern rtmpUrlPattern = Pattern.compile("^rtmp://([^/:]+)(:(\\d+))*/([^/]+)(/(.*))*$");
-
-    private RtmpHandler mHandler;
-    private int port;
-    private String host;
-    private String appName;
-    private String streamName;
-    private String publishType;
-    private String swfUrl;
-    private String tcUrl;
-    private String pageUrl;
-    private Socket socket;
-    private String srsServerInfo = "";
-    private String socketExceptionCause = "";
-    private RtmpSessionInfo rtmpSessionInfo;
-    private RtmpDecoder rtmpDecoder;
-    private BufferedInputStream inputStream;
-    private BufferedOutputStream outputStream;
-    private Thread rxPacketHandler;
-    private volatile boolean connected = false;
-    private volatile boolean publishPermitted = false;
-    private final Object connectingLock = new Object();
-    private final Object publishLock = new Object();
-    private AtomicInteger videoFrameCacheNumber = new AtomicInteger(0);
-    private int currentStreamId = 0;
-    private int transactionIdCounter = 0;
-    private AmfString serverIpAddr;
-    private AmfNumber serverPid;
-    private AmfNumber serverId;
-    private int videoWidth;
-    private int videoHeight;
-    private int videoFrameCount;
-    private int videoDataLength;
-    private int audioFrameCount;
-    private int audioDataLength;
-    private long videoLastTimeMillis;
-    private long audioLastTimeMillis;
-
-    public RtmpConnection(RtmpHandler handler) {
-        mHandler = handler;
-    }
-
-    private void handshake(InputStream in, OutputStream out) throws IOException {
-        Handshake handshake = new Handshake();
-        handshake.writeC0(out);
-        handshake.writeC1(out); // Write C1 without waiting for S0
-        out.flush();
-        handshake.readS0(in);
-        handshake.readS1(in);
-        handshake.writeC2(out);
-        handshake.readS2(in);
-    }
-
-    @Override
-    public boolean connect(String url) {
-        Matcher matcher = rtmpUrlPattern.matcher(url);
-        if (matcher.matches()) {
-            tcUrl = url.substring(0, url.lastIndexOf('/'));
-            swfUrl = "";
-            pageUrl = "";
-            host = matcher.group(1);
-            String portStr = matcher.group(3);
-            port = portStr != null ? Integer.parseInt(portStr) : 1935;
-            appName = matcher.group(4);
-            streamName = matcher.group(6);
-        } else {
-            mHandler.notifyRtmpIllegalArgumentException(new IllegalArgumentException(
-                "Invalid RTMP URL. Must be in format: rtmp://host[:port]/application/streamName"));
-            return false;
-        }
-
-        if (streamName == null || appName == null) {
-            mHandler.notifyRtmpIllegalArgumentException(new IllegalArgumentException(
-                "Invalid RTMP URL. Must be in format: rtmp://host[:port]/application/streamName"));
-            return false;
-        }
-
-        // socket connection
-        Log.d(TAG, "connect() called. Host: " + host + ", port: " + port + ", appName: " + appName + ", publishPath: " + streamName);
-        rtmpSessionInfo = new RtmpSessionInfo();
-        rtmpDecoder = new RtmpDecoder(rtmpSessionInfo);
-        socket = new Socket();
-        SocketAddress socketAddress = new InetSocketAddress(host, port);
-        try {
-            socket.connect(socketAddress, 3000);
-            inputStream = new BufferedInputStream(socket.getInputStream());
-            outputStream = new BufferedOutputStream(socket.getOutputStream());
-            Log.d(TAG, "connect(): socket connection established, doing handhake...");
-            handshake(inputStream, outputStream);
-            Log.d(TAG, "connect(): handshake done");
-        } catch (IOException e) {
-            e.printStackTrace();
-            mHandler.notifyRtmpIOException(e);
-            return false;
-        }
-
-        // Start the "main" handling thread
-        rxPacketHandler = new Thread(new Runnable() {
-
-            @Override
-            public void run() {
-                try {
-                    Log.d(TAG, "starting main rx handler loop");
-                    handleRxPacketLoop();
-                } catch (IOException ex) {
-                    Logger.getLogger(RtmpConnection.class.getName()).log(Level.SEVERE, null, ex);
-                }
-            }
-        });
-        rxPacketHandler.start();
-
-        return rtmpConnect();
-    }
-
-    private boolean rtmpConnect() {
-        if (connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Already connected to RTMP server"));
-            return false;
-        }
-
-        // Mark session timestamp of all chunk stream information on connection.
-        ChunkStreamInfo.markSessionTimestampTx();
-
-        Log.d(TAG, "rtmpConnect(): Building 'connect' invoke packet");
-        ChunkStreamInfo chunkStreamInfo = rtmpSessionInfo.getChunkStreamInfo(ChunkStreamInfo.RTMP_CID_OVER_CONNECTION);
-        Command invoke = new Command("connect", ++transactionIdCounter, chunkStreamInfo);
-        invoke.getHeader().setMessageStreamId(0);
-        AmfObject args = new AmfObject();
-        args.setProperty("app", appName);
-        args.setProperty("flashVer", "LNX 11,2,202,233"); // Flash player OS: Linux, version: 11.2.202.233
-        args.setProperty("swfUrl", swfUrl);
-        args.setProperty("tcUrl", tcUrl);
-        args.setProperty("fpad", false);
-        args.setProperty("capabilities", 239);
-        args.setProperty("audioCodecs", 3575);
-        args.setProperty("videoCodecs", 252);
-        args.setProperty("videoFunction", 1);
-        args.setProperty("pageUrl", pageUrl);
-        args.setProperty("objectEncoding", 0);
-        invoke.addData(args);
-        sendRtmpPacket(invoke);
-        mHandler.notifyRtmpConnecting("Connecting");
-
-        synchronized (connectingLock) {
-            try {
-                connectingLock.wait(5000);
-            } catch (InterruptedException ex) {
-                // do nothing
-            }
-        }
-        if (!connected) {
-            shutdown();
-        }
-        return connected;
-    }
-
-    @Override
-    public boolean publish(String type) {
-        if (type == null) {
-            mHandler.notifyRtmpIllegalArgumentException(new IllegalArgumentException("No publish type specified"));
-            return false;
-        }
-        publishType = type;
-        return createStream();
-    }
-
-    private boolean createStream() {
-        if (!connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not connected to RTMP server"));
-            return false;
-        }
-        if (currentStreamId != 0) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Current stream object has existed"));
-            return false;
-        }
-
-        Log.d(TAG, "createStream(): Sending releaseStream command...");
-        // transactionId == 2
-        Command releaseStream = new Command("releaseStream", ++transactionIdCounter);
-        releaseStream.getHeader().setChunkStreamId(ChunkStreamInfo.RTMP_CID_OVER_STREAM);
-        releaseStream.addData(new AmfNull());  // command object: null for "createStream"
-        releaseStream.addData(streamName);  // command object: null for "releaseStream"
-        sendRtmpPacket(releaseStream);
-
-        Log.d(TAG, "createStream(): Sending FCPublish command...");
-        // transactionId == 3
-        Command FCPublish = new Command("FCPublish", ++transactionIdCounter);
-        FCPublish.getHeader().setChunkStreamId(ChunkStreamInfo.RTMP_CID_OVER_STREAM);
-        FCPublish.addData(new AmfNull());  // command object: null for "FCPublish"
-        FCPublish.addData(streamName);
-        sendRtmpPacket(FCPublish);
-
-        Log.d(TAG, "createStream(): Sending createStream command...");
-        ChunkStreamInfo chunkStreamInfo = rtmpSessionInfo.getChunkStreamInfo(ChunkStreamInfo.RTMP_CID_OVER_CONNECTION);
-        // transactionId == 4
-        Command createStream = new Command("createStream", ++transactionIdCounter, chunkStreamInfo);
-        createStream.addData(new AmfNull());  // command object: null for "createStream"
-        sendRtmpPacket(createStream);
-
-        // Waiting for "NetStream.Publish.Start" response.
-        synchronized (publishLock) {
-            try {
-                publishLock.wait(5000);
-            } catch (InterruptedException ex) {
-                // do nothing
-            }
-        }
-        if (publishPermitted) {
-            mHandler.notifyRtmpConnected("Connected" + srsServerInfo);
-        } else {
-            shutdown();
-        }
-        return publishPermitted;
-    }
-
-    private void fmlePublish() {
-        if (!connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not connected to RTMP server"));
-            return;
-        }
-        if (currentStreamId == 0) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("No current stream object exists"));
-            return;
-        }
-
-        Log.d(TAG, "fmlePublish(): Sending publish command...");
-        // transactionId == 0
-        Command publish = new Command("publish", 0);
-        publish.getHeader().setChunkStreamId(ChunkStreamInfo.RTMP_CID_OVER_STREAM);
-        publish.getHeader().setMessageStreamId(currentStreamId);
-        publish.addData(new AmfNull());  // command object: null for "publish"
-        publish.addData(streamName);
-        publish.addData(publishType);
-        sendRtmpPacket(publish);
-    }
-
-    private void onMetaData() {
-        if (!connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not connected to RTMP server"));
-            return;
-        }
-        if (currentStreamId == 0) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("No current stream object exists"));
-            return;
-        }
-
-        Log.d(TAG, "onMetaData(): Sending empty onMetaData...");
-        Data metadata = new Data("@setDataFrame");
-        metadata.getHeader().setMessageStreamId(currentStreamId);
-        metadata.addData("onMetaData");
-        AmfMap ecmaArray = new AmfMap();
-        ecmaArray.setProperty("duration", 0);
-        ecmaArray.setProperty("width", videoWidth);
-        ecmaArray.setProperty("height", videoHeight);
-        ecmaArray.setProperty("videodatarate", 0);
-        ecmaArray.setProperty("framerate", 0);
-        ecmaArray.setProperty("audiodatarate", 0);
-        ecmaArray.setProperty("audiosamplerate", 44100);
-        ecmaArray.setProperty("audiosamplesize", 16);
-        ecmaArray.setProperty("stereo", true);
-        ecmaArray.setProperty("filesize", 0);
-        metadata.addData(ecmaArray);
-        sendRtmpPacket(metadata);
-    }
-
-    @Override
-    public void close() {
-        if (socket != null) {
-            closeStream();
-        }
-        shutdown();
-    }
-
-    private void closeStream() {
-        if (!connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not connected to RTMP server"));
-            return;
-        }
-        if (currentStreamId == 0) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("No current stream object exists"));
-            return;
-        }
-        if (!publishPermitted) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not get _result(Netstream.Publish.Start)"));
-            return;
-        }
-        Log.d(TAG, "closeStream(): setting current stream ID to 0");
-        Command closeStream = new Command("closeStream", 0);
-        closeStream.getHeader().setChunkStreamId(ChunkStreamInfo.RTMP_CID_OVER_STREAM);
-        closeStream.getHeader().setMessageStreamId(currentStreamId);
-        closeStream.addData(new AmfNull());
-        sendRtmpPacket(closeStream);
-        mHandler.notifyRtmpStopped();
-    }
-
-    private void shutdown() {
-        if (socket != null) {
-            try {
-                // It will raise EOFException in handleRxPacketThread
-                socket.shutdownInput();
-                // It will raise SocketException in sendRtmpPacket
-                socket.shutdownOutput();
-            } catch (IOException ioe) {
-                ioe.printStackTrace();
-            }
-
-            // shutdown rxPacketHandler
-            if (rxPacketHandler != null) {
-                rxPacketHandler.interrupt();
-                try {
-                    rxPacketHandler.join();
-                } catch (InterruptedException ie) {
-                    rxPacketHandler.interrupt();
-                }
-                rxPacketHandler = null;
-            }
-
-            // shutdown socket as well as its input and output stream
-            try {
-                socket.close();
-                Log.d(TAG, "socket closed");
-            } catch (IOException ex) {
-                Log.e(TAG, "shutdown(): failed to close socket", ex);
-            }
-
-            mHandler.notifyRtmpDisconnected();
-        }
-
-        reset();
-    }
-
-    private void reset() {
-        connected = false;
-        publishPermitted = false;
-        tcUrl = null;
-        swfUrl = null;
-        pageUrl = null;
-        appName = null;
-        streamName = null;
-        publishType = null;
-        currentStreamId = 0;
-        transactionIdCounter = 0;
-        videoFrameCacheNumber.set(0);
-        socketExceptionCause = "";
-        serverIpAddr = null;
-        serverPid = null;
-        serverId = null;
-        socket = null;
-        rtmpSessionInfo = null;
-        rtmpDecoder = null;
-    }
-
-    @Override
-    public void publishAudioData(byte[] data, int size, int dts) {
-        if (data == null || data.length == 0 || dts < 0) {
-            mHandler.notifyRtmpIllegalArgumentException(new IllegalArgumentException("Invalid Audio Data"));
-            return;
-        }
-        if (!connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not connected to RTMP server"));
-            return;
-        }
-        if (currentStreamId == 0) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("No current stream object exists"));
-            return;
-        }
-        if (!publishPermitted) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not get _result(Netstream.Publish.Start)"));
-            return;
-        }
-        Audio audio = new Audio();
-        audio.setData(data, size);
-        audio.getHeader().setAbsoluteTimestamp(dts);
-        audio.getHeader().setMessageStreamId(currentStreamId);
-        sendRtmpPacket(audio);
-        calcAudioBitrate(audio.getHeader().getPacketLength());
-        mHandler.notifyRtmpAudioStreaming();
-    }
-
-    @Override
-    public void publishVideoData(byte[] data, int size, int dts) {
-        if (data == null || data.length == 0 || dts < 0) {
-            mHandler.notifyRtmpIllegalArgumentException(new IllegalArgumentException("Invalid Video Data"));
-            return;
-        }
-        if (!connected) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not connected to RTMP server"));
-            return;
-        }
-        if (currentStreamId == 0) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("No current stream object exists"));
-            return;
-        }
-        if (!publishPermitted) {
-            mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Not get _result(Netstream.Publish.Start)"));
-            return;
-        }
-        Video video = new Video();
-        video.setData(data, size);
-        video.getHeader().setAbsoluteTimestamp(dts);
-        video.getHeader().setMessageStreamId(currentStreamId);
-        sendRtmpPacket(video);
-        videoFrameCacheNumber.decrementAndGet();
-        calcVideoFpsAndBitrate(video.getHeader().getPacketLength());
-        mHandler.notifyRtmpVideoStreaming();
-    }
-
-    private void calcVideoFpsAndBitrate(int length) {
-        videoDataLength += length;
-        if (videoFrameCount == 0) {
-            videoLastTimeMillis = System.nanoTime() / 1000000;
-            videoFrameCount++;
-        } else {
-            if (++videoFrameCount >= 48) {
-                long diffTimeMillis = System.nanoTime() / 1000000 - videoLastTimeMillis;
-                mHandler.notifyRtmpVideoFpsChanged((double) videoFrameCount * 1000 / diffTimeMillis);
-                mHandler.notifyRtmpVideoBitrateChanged((double) videoDataLength * 8 * 1000 / diffTimeMillis);
-                videoFrameCount = 0;
-                videoDataLength = 0;
-            }
-        }
-    }
-
-    private void calcAudioBitrate(int length) {
-        audioDataLength += length;
-        if (audioFrameCount == 0) {
-            audioLastTimeMillis = System.nanoTime() / 1000000;
-            audioFrameCount++;
-        } else {
-            if (++audioFrameCount >= 48) {
-                long diffTimeMillis = System.nanoTime() / 1000000 - audioLastTimeMillis;
-                mHandler.notifyRtmpAudioBitrateChanged((double) audioDataLength * 8 * 1000 / diffTimeMillis);
-                audioFrameCount = 0;
-                audioDataLength = 0;
-            }
-        }
-    }
-
-    private void sendRtmpPacket(RtmpPacket rtmpPacket) {
-        try {
-            ChunkStreamInfo chunkStreamInfo = rtmpSessionInfo.getChunkStreamInfo(rtmpPacket.getHeader().getChunkStreamId());
-            chunkStreamInfo.setPrevHeaderTx(rtmpPacket.getHeader());
-            if (!(rtmpPacket instanceof Video || rtmpPacket instanceof Audio)) {
-                rtmpPacket.getHeader().setAbsoluteTimestamp((int) chunkStreamInfo.markAbsoluteTimestampTx());
-            }
-            rtmpPacket.writeTo(outputStream, rtmpSessionInfo.getTxChunkSize(), chunkStreamInfo);
-            Log.d(TAG, "wrote packet: " + rtmpPacket + ", size: " + rtmpPacket.getHeader().getPacketLength());
-            if (rtmpPacket instanceof Command) {
-                rtmpSessionInfo.addInvokedCommand(((Command) rtmpPacket).getTransactionId(), ((Command) rtmpPacket).getCommandName());
-            }
-            outputStream.flush();
-        } catch (SocketException se) {
-            // Since there are still remaining AV frame in the cache, we set a flag to guarantee the
-            // socket exception only issue one time.
-            if (!socketExceptionCause.contentEquals(se.getMessage())) {
-                socketExceptionCause = se.getMessage();
-                Log.e(TAG, "Caught SocketException during write loop, shutting down: " + se.getMessage());
-                mHandler.notifyRtmpSocketException(se);
-            }
-        } catch (IOException ioe) {
-            Log.e(TAG, "Caught IOException during write loop, shutting down: " + ioe.getMessage());
-            mHandler.notifyRtmpIOException(ioe);
-        }
-    }
-
-    private void handleRxPacketLoop() throws IOException {
-        // Handle all queued received RTMP packets
-        while (!Thread.interrupted()) {
-            try {
-                // It will be blocked when no data in input stream buffer
-                RtmpPacket rtmpPacket = rtmpDecoder.readPacket(inputStream);
-                if (rtmpPacket != null) {
-                    //Log.d(TAG, "handleRxPacketLoop(): RTMP rx packet message type: " + rtmpPacket.getHeader().getMessageType());
-                    switch (rtmpPacket.getHeader().getMessageType()) {
-                        case ABORT:
-                            rtmpSessionInfo.getChunkStreamInfo(((Abort) rtmpPacket).getChunkStreamId()).clearStoredChunks();
-                            break;
-                        case USER_CONTROL_MESSAGE:
-                            UserControl user = (UserControl) rtmpPacket;
-                            switch (user.getType()) {
-                                case STREAM_BEGIN:
-                                    if (currentStreamId != user.getFirstEventData()) {
-                                        mHandler.notifyRtmpIllegalStateException(new IllegalStateException("Current stream ID error!"));
-                                    }
-                                    break;
-                                case PING_REQUEST:
-                                    ChunkStreamInfo channelInfo = rtmpSessionInfo.getChunkStreamInfo(ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL);
-                                    Log.d(TAG, "handleRxPacketLoop(): Sending PONG reply..");
-                                    UserControl pong = new UserControl(user, channelInfo);
-                                    sendRtmpPacket(pong);
-                                    break;
-                                case STREAM_EOF:
-                                    Log.i(TAG, "handleRxPacketLoop(): Stream EOF reached, closing RTMP writer...");
-                                    break;
-                                default:
-                                    // Ignore...
-                                    break;
-                            }
-                            break;
-                        case WINDOW_ACKNOWLEDGEMENT_SIZE:
-                            WindowAckSize windowAckSize = (WindowAckSize) rtmpPacket;
-                            int size = windowAckSize.getAcknowledgementWindowSize();
-                            Log.d(TAG, "handleRxPacketLoop(): Setting acknowledgement window size: " + size);
-                            rtmpSessionInfo.setAcknowledgmentWindowSize(size);
-                            break;
-                        case SET_PEER_BANDWIDTH:
-                            SetPeerBandwidth bw = (SetPeerBandwidth) rtmpPacket;
-                            rtmpSessionInfo.setAcknowledgmentWindowSize(bw.getAcknowledgementWindowSize());
-                            int acknowledgementWindowsize = rtmpSessionInfo.getAcknowledgementWindowSize();
-                            ChunkStreamInfo chunkStreamInfo = rtmpSessionInfo.getChunkStreamInfo(ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL);
-                            Log.d(TAG, "handleRxPacketLoop(): Send acknowledgement window size: " + acknowledgementWindowsize);
-                            sendRtmpPacket(new WindowAckSize(acknowledgementWindowsize, chunkStreamInfo));
-                            // Set socket option
-                            socket.setSendBufferSize(acknowledgementWindowsize);
-                            break;
-                        case COMMAND_AMF0:
-                            handleRxInvoke((Command) rtmpPacket);
-                            break;
-                        default:
-                            Log.w(TAG, "handleRxPacketLoop(): Not handling unimplemented/unknown packet of type: " + rtmpPacket.getHeader().getMessageType());
-                            break;
-                    }
-                }
-            } catch (EOFException eof) {
-                Thread.currentThread().interrupt();
-            } catch (SocketException se) {
-                Log.e(TAG, "Caught SocketException while reading/decoding packet, shutting down: " + se.getMessage());
-                mHandler.notifyRtmpSocketException(se);
-            } catch (IOException ioe) {
-                Log.e(TAG, "Caught exception while reading/decoding packet, shutting down: " + ioe.getMessage());
-                mHandler.notifyRtmpIOException(ioe);
-            }
-        }
-    }
-
-    private void handleRxInvoke(Command invoke) throws IOException {
-        String commandName = invoke.getCommandName();
-
-        if (commandName.equals("_result")) {
-            // This is the result of one of the methods invoked by us
-            String method = rtmpSessionInfo.takeInvokedCommand(invoke.getTransactionId());
-
-            Log.d(TAG, "handleRxInvoke: Got result for invoked method: " + method);
-            if ("connect".equals(method)) {
-                // Capture server ip/pid/id information if any
-                srsServerInfo = onSrsServerInfo(invoke);
-                // We can now send createStream commands
-                connected = true;
-                synchronized (connectingLock) {
-                    connectingLock.notifyAll();
-                }
-            } else if ("createStream".contains(method)) {
-                // Get stream id
-                currentStreamId = (int) ((AmfNumber) invoke.getData().get(1)).getValue();
-                Log.d(TAG, "handleRxInvoke(): Stream ID to publish: " + currentStreamId);
-                if (streamName != null && publishType != null) {
-                    fmlePublish();
-                }
-            } else if ("releaseStream".contains(method)) {
-                Log.d(TAG, "handleRxInvoke(): 'releaseStream'");
-            } else if ("FCPublish".contains(method)) {
-                Log.d(TAG, "handleRxInvoke(): 'FCPublish'");
-            } else {
-                Log.w(TAG, "handleRxInvoke(): '_result' message received for unknown method: " + method);
-            }
-        } else if (commandName.equals("onBWDone")) {
-            Log.d(TAG, "handleRxInvoke(): 'onBWDone'");
-        } else if (commandName.equals("onFCPublish")) {
-            Log.d(TAG, "handleRxInvoke(): 'onFCPublish'");
-        } else if (commandName.equals("onStatus")) {
-            String code = ((AmfString) ((AmfObject) invoke.getData().get(1)).getProperty("code")).getValue();
-            Log.d(TAG, "handleRxInvoke(): onStatus " + code);
-            if (code.equals("NetStream.Publish.Start")) {
-                onMetaData();
-                // We can now publish AV data
-                publishPermitted = true;
-                synchronized (publishLock) {
-                    publishLock.notifyAll();
-                }
-            }
-        } else {
-            Log.e(TAG, "handleRxInvoke(): Unknown/unhandled server invoke: " + invoke);
-        }
-    }
-
-    private String onSrsServerInfo(Command invoke) {
-        // SRS server special information
-        AmfObject objData = (AmfObject) invoke.getData().get(1);
-        if ((objData).getProperty("data") instanceof AmfObject) {
-            objData = ((AmfObject) objData.getProperty("data"));
-            serverIpAddr = (AmfString) objData.getProperty("srs_server_ip");
-            serverPid = (AmfNumber) objData.getProperty("srs_pid");
-            serverId = (AmfNumber) objData.getProperty("srs_id");
-        }
-        String info = "";
-        info += serverIpAddr == null ? "" : " ip: " + serverIpAddr.getValue();
-        info += serverPid == null ? "" : " pid: " + (int) serverPid.getValue();
-        info += serverId == null ? "" : " id: " + (int) serverId.getValue();
-        return info;
-    }
-
-    @Override
-    public AtomicInteger getVideoFrameCacheNumber() {
-        return videoFrameCacheNumber;
-    }
-
-    @Override
-    public final String getServerIpAddr() {
-        return serverIpAddr == null ? null : serverIpAddr.getValue();
-    }
-
-    @Override
-    public final int getServerPid() {
-        return serverPid == null ? 0 : (int) serverPid.getValue();
-    }
-
-    @Override
-    public final int getServerId() {
-        return serverId == null ? 0 : (int) serverId.getValue();
-    }
-
-    @Override
-    public void setVideoResolution(int width, int height) {
-        videoWidth = width;
-        videoHeight = height;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpDecoder.java b/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpDecoder.java
deleted file mode 100755
index 720bcd3..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpDecoder.java
+++ /dev/null
@@ -1,95 +0,0 @@
-package com.github.faucamp.simplertmp.io;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import android.util.Log;
-
-import com.github.faucamp.simplertmp.packets.Abort;
-import com.github.faucamp.simplertmp.packets.Audio;
-import com.github.faucamp.simplertmp.packets.Command;
-import com.github.faucamp.simplertmp.packets.Data;
-import com.github.faucamp.simplertmp.packets.RtmpHeader;
-import com.github.faucamp.simplertmp.packets.RtmpPacket;
-import com.github.faucamp.simplertmp.packets.SetChunkSize;
-import com.github.faucamp.simplertmp.packets.SetPeerBandwidth;
-import com.github.faucamp.simplertmp.packets.UserControl;
-import com.github.faucamp.simplertmp.packets.Video;
-import com.github.faucamp.simplertmp.packets.WindowAckSize;
-import com.github.faucamp.simplertmp.packets.Acknowledgement;
-
-/**
- * @author francois
- */
-public class RtmpDecoder {
-
-    private static final String TAG = "RtmpDecoder";
-
-    private RtmpSessionInfo rtmpSessionInfo;
-
-    public RtmpDecoder(RtmpSessionInfo rtmpSessionInfo) {
-        this.rtmpSessionInfo = rtmpSessionInfo;
-    }
-
-    public RtmpPacket readPacket(InputStream in) throws IOException {
-
-        RtmpHeader header = RtmpHeader.readHeader(in, rtmpSessionInfo);
-        // Log.d(TAG, "readPacket(): header.messageType: " + header.getMessageType());
-
-        ChunkStreamInfo chunkStreamInfo = rtmpSessionInfo.getChunkStreamInfo(header.getChunkStreamId());
-        chunkStreamInfo.setPrevHeaderRx(header);
-
-        if (header.getPacketLength() > rtmpSessionInfo.getRxChunkSize()) {
-            // If the packet consists of more than one chunk,
-            // store the chunks in the chunk stream until everything is read
-            if (!chunkStreamInfo.storePacketChunk(in, rtmpSessionInfo.getRxChunkSize())) {
-                // return null because of incomplete packet
-                return null;
-            } else {
-                // stored chunks complete packet, get the input stream of the chunk stream
-                in = chunkStreamInfo.getStoredPacketInputStream();
-            }
-        }
-
-        RtmpPacket rtmpPacket;
-        switch (header.getMessageType()) {
-            case SET_CHUNK_SIZE:
-                SetChunkSize setChunkSize = new SetChunkSize(header);
-                setChunkSize.readBody(in);
-                Log.d(TAG, "readPacket(): Setting chunk size to: " + setChunkSize.getChunkSize());
-                rtmpSessionInfo.setRxChunkSize(setChunkSize.getChunkSize());
-                return null;
-            case ABORT:
-                rtmpPacket = new Abort(header);
-                break;
-            case USER_CONTROL_MESSAGE:
-                rtmpPacket = new UserControl(header);
-                break;
-            case WINDOW_ACKNOWLEDGEMENT_SIZE:
-                rtmpPacket = new WindowAckSize(header);
-                break;
-            case SET_PEER_BANDWIDTH:
-                rtmpPacket = new SetPeerBandwidth(header);
-                break;
-            case AUDIO:
-                rtmpPacket = new Audio(header);
-                break;
-            case VIDEO:
-                rtmpPacket = new Video(header);
-                break;
-            case COMMAND_AMF0:
-                rtmpPacket = new Command(header);
-                break;
-            case DATA_AMF0:
-                rtmpPacket = new Data(header);
-                break;
-            case ACKNOWLEDGEMENT:
-                rtmpPacket = new Acknowledgement(header);
-                break;
-            default:
-                throw new IOException("No packet body implementation for message type: " + header.getMessageType());
-        }                
-        rtmpPacket.readBody(in);                        
-        return rtmpPacket;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpSessionInfo.java b/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpSessionInfo.java
deleted file mode 100755
index e9d03bb..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/io/RtmpSessionInfo.java
+++ /dev/null
@@ -1,82 +0,0 @@
-package com.github.faucamp.simplertmp.io;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-import com.github.faucamp.simplertmp.packets.RtmpPacket;
-
-/**
- *
- * @author francois
- */
-public class RtmpSessionInfo {
-
-    /** The (total) number of bytes read for this window (resets to 0 if the agreed-upon RTMP window acknowledgement size is reached) */
-    private int windowBytesRead;
-    /** The window acknowledgement size for this RTMP session, in bytes; default to max to avoid unnecessary "Acknowledgment" messages from being sent */
-    private int acknowledgementWindowSize = Integer.MAX_VALUE;
-    /** Used internally to store the total number of bytes read (used when sending Acknowledgement messages) */
-    private int totalBytesRead = 0;
-    
-    /** Default chunk size is 128 bytes */
-    private int rxChunkSize = 128;
-    private int txChunkSize = 128;
-    private Map<Integer, ChunkStreamInfo> chunkChannels = new HashMap<Integer, ChunkStreamInfo>();
-    private Map<Integer, String> invokedMethods = new ConcurrentHashMap<Integer, String>();
-
-    public ChunkStreamInfo getChunkStreamInfo(int chunkStreamId) {
-        ChunkStreamInfo chunkStreamInfo = chunkChannels.get(chunkStreamId);
-        if (chunkStreamInfo == null) {
-            chunkStreamInfo = new ChunkStreamInfo();
-            chunkChannels.put(chunkStreamId, chunkStreamInfo);
-        }
-        return chunkStreamInfo;
-    }
-
-    public String takeInvokedCommand(int transactionId) {
-        return invokedMethods.remove(transactionId);
-    }
-
-    public String addInvokedCommand(int transactionId, String commandName) {
-        return invokedMethods.put(transactionId, commandName);
-    }
-
-    public int getRxChunkSize() {
-        return rxChunkSize;
-    }
-
-    public void setRxChunkSize(int chunkSize) {
-        this.rxChunkSize = chunkSize;
-    }
-
-    public int getTxChunkSize() {
-        return txChunkSize;
-    }
-
-    public void setTxChunkSize(int chunkSize) {
-        this.txChunkSize = chunkSize;
-    }
-
-    public int getAcknowledgementWindowSize() {
-        return acknowledgementWindowSize;
-    }
-
-    public void setAcknowledgmentWindowSize(int acknowledgementWindowSize) {
-        this.acknowledgementWindowSize = acknowledgementWindowSize;
-    }
-
-    /**
-     * Add the specified amount of bytes to the total number of bytes read for this RTMP window;     
-     * @param numBytes the number of bytes to add
-     * @return <code>true</code> if an "acknowledgement" packet should be sent, <code>false</code> otherwise
-     */
-    public final void addToWindowBytesRead(final int numBytes, final RtmpPacket packet) throws WindowAckRequired {
-        windowBytesRead += numBytes;
-        totalBytesRead += numBytes;
-        if (windowBytesRead >= acknowledgementWindowSize) {            
-            windowBytesRead -= acknowledgementWindowSize;                       
-            throw new WindowAckRequired(totalBytesRead, packet);
-        }
-    }       
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/io/WindowAckRequired.java b/android/src/main/java/com/github/faucamp/simplertmp/io/WindowAckRequired.java
deleted file mode 100755
index 6046474..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/io/WindowAckRequired.java
+++ /dev/null
@@ -1,40 +0,0 @@
-package com.github.faucamp.simplertmp.io;
-
-import com.github.faucamp.simplertmp.packets.RtmpPacket;
-
-/**
- * Thrown by RTMP read thread when an Acknowledgement packet needs to be sent
- * to acknowledge the RTMP window size. It contains the RTMP packet that was 
- * read when this event occurred (if any).
- * 
- * @author francois
- */
-public class WindowAckRequired extends Exception {
-
-    private RtmpPacket rtmpPacket;
-    private int bytesRead;
-
-    /**
-     * Used when the window acknowledgement size was reached, whilst fully reading
-     * an RTMP packet or not. If a packet is present, it should still be handled as if it was returned
-     * by the RTMP decoder.
-     * 
-     * @param bytesReadThusFar The (total) number of bytes received so far
-     * @param rtmpPacket The packet that was read (and thus should be handled), can be <code>null</code>
-     */
-    public WindowAckRequired(int bytesReadThusFar, RtmpPacket rtmpPacket) {
-        this.rtmpPacket = rtmpPacket;
-        this.bytesRead = bytesReadThusFar;
-    }
-
-    /**
-     * @return The RTMP packet that should be handled, or <code>null</code> if no full packet is available
-     */
-    public RtmpPacket getRtmpPacket() {
-        return rtmpPacket;
-    }   
-
-    public int getBytesRead() {
-        return bytesRead;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Abort.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Abort.java
deleted file mode 100755
index 0f44f1d..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Abort.java
+++ /dev/null
@@ -1,58 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * A "Abort" RTMP control message, received on chunk stream ID 2 (control channel)
- * 
- * @author francois
- */
-public class Abort extends RtmpPacket {
-
-    private int chunkStreamId;
-    
-    public Abort(RtmpHeader header) {
-        super(header);
-    }
-
-    public Abort(int chunkStreamId) {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_1_RELATIVE_LARGE, ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL, RtmpHeader.MessageType.SET_CHUNK_SIZE));
-        this.chunkStreamId = chunkStreamId;
-    }
-
-    /** @return the ID of the chunk stream to be aborted */
-    public int getChunkStreamId() {
-        return chunkStreamId;
-    }
-
-    /** Sets the ID of the chunk stream to be aborted */
-    public void setChunkStreamId(int chunkStreamId) {
-        this.chunkStreamId = chunkStreamId;
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        // Value is received in the 4 bytes of the body
-        chunkStreamId = Util.readUnsignedInt32(in);
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        Util.writeUnsignedInt32(out, chunkStreamId);
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Acknowledgement.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Acknowledgement.java
deleted file mode 100755
index 1d0ef90..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Acknowledgement.java
+++ /dev/null
@@ -1,73 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * (Window) Acknowledgement
- * 
- * The client or the server sends the acknowledgment to the peer after
- * receiving bytes equal to the window size. The window size is the
- * maximum number of bytes that the sender sends without receiving
- * acknowledgment from the receiver. The server sends the window size to
- * the client after application connects. This message specifies the
- * sequence number, which is the number of the bytes received so far.
- * 
- * @author francois
- */
-public class Acknowledgement extends RtmpPacket {
-
-    private int sequenceNumber;
-
-    public Acknowledgement(RtmpHeader header) {
-        super(header);
-    }
-
-    public Acknowledgement(int numBytesReadThusFar) {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL, RtmpHeader.MessageType.ACKNOWLEDGEMENT));
-        this.sequenceNumber = numBytesReadThusFar;
-    }
-
-    public int getAcknowledgementWindowSize() {
-        return sequenceNumber;
-    }
-
-    /** @return the sequence number, which is the number of the bytes received so far */
-    public int getSequenceNumber() {
-        return sequenceNumber;
-    }
-
-    /** Sets the sequence number, which is the number of the bytes received so far */
-    public void setSequenceNumber(int numBytesRead) {
-        this.sequenceNumber = numBytesRead;
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        sequenceNumber = Util.readUnsignedInt32(in);
-    }
-
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        Util.writeUnsignedInt32(out, sequenceNumber);
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP Acknowledgment (sequence number: " + sequenceNumber + ")";
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Audio.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Audio.java
deleted file mode 100755
index 0d7ace6..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Audio.java
+++ /dev/null
@@ -1,24 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * Audio data packet
- *  
- * @author francois
- */
-public class Audio extends ContentData {
-
-    public Audio(RtmpHeader header) {
-        super(header);
-    }
-
-    public Audio() {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_AUDIO, RtmpHeader.MessageType.AUDIO));
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP Audio";
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Command.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Command.java
deleted file mode 100755
index 74496f2..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Command.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.amf.AmfNumber;
-import com.github.faucamp.simplertmp.amf.AmfString;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * Encapsulates an command/"invoke" RTMP packet
- * 
- * Invoke/command packet structure (AMF encoded):
- * (String) <commmand name>
- * (Number) <Transaction ID>
- * (Mixed) <Argument> ex. Null, String, Object: {key1:value1, key2:value2 ... }
- * 
- * @author francois
- */
-public class Command extends VariableBodyRtmpPacket {
-
-    private static final String TAG = "Command";
-
-    private String commandName;
-    private int transactionId;    
-
-    public Command(RtmpHeader header) {
-        super(header);
-    }
-
-    public Command(String commandName, int transactionId, ChunkStreamInfo channelInfo) {
-        super(new RtmpHeader((channelInfo.canReusePrevHeaderTx(RtmpHeader.MessageType.COMMAND_AMF0) ? RtmpHeader.ChunkType.TYPE_1_RELATIVE_LARGE : RtmpHeader.ChunkType.TYPE_0_FULL), ChunkStreamInfo.RTMP_CID_OVER_CONNECTION, RtmpHeader.MessageType.COMMAND_AMF0));
-        this.commandName = commandName;
-        this.transactionId = transactionId;
-    }
-    
-    public Command(String commandName, int transactionId) {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_OVER_CONNECTION, RtmpHeader.MessageType.COMMAND_AMF0));
-        this.commandName = commandName;
-        this.transactionId = transactionId;
-    }
-
-    public String getCommandName() {
-        return commandName;
-    }
-
-    public void setCommandName(String commandName) {
-        this.commandName = commandName;
-    }
-
-    public int getTransactionId() {
-        return transactionId;
-    }
-
-    public void setTransactionId(int transactionId) {
-        this.transactionId = transactionId;
-    }    
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        // The command name and transaction ID are always present (AMF string followed by number)
-        commandName = AmfString.readStringFrom(in, false);
-        transactionId = (int) AmfNumber.readNumberFrom(in);        
-        int bytesRead = AmfString.sizeOf(commandName, false) + AmfNumber.SIZE;
-        readVariableData(in, bytesRead);
-    }
-
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        AmfString.writeStringTo(out, commandName, false);
-        AmfNumber.writeNumberTo(out, transactionId);
-        // Write body data
-        writeVariableData(out);
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP Command (command: " + commandName + ", transaction ID: " + transactionId + ")";
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/ContentData.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/ContentData.java
deleted file mode 100755
index 20e85f3..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/ContentData.java
+++ /dev/null
@@ -1,58 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import android.support.annotation.Nullable;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.Util;
-
-/**
- * Content (audio/video) data packet base
- *  
- * @author francois
- */
-public abstract class ContentData extends RtmpPacket {
-
-    protected byte[] data;
-    protected int size;
-
-    public ContentData(RtmpHeader header) {
-        super(header);
-    }
-
-    public byte[] getData() {
-        return data;
-    }
-
-    public void setData(byte[] data, int size) {
-        this.data = data;
-        this.size = size;
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        data = new byte[this.header.getPacketLength()];
-        Util.readBytesUntilFull(in, data);
-    }
-
-    /**
-     * Method is public for content (audio/video)
-     * Write this packet body without chunking;
-     * useful for dumping audio/video streams
-     */
-    @Override
-    public void writeBody(OutputStream out) throws IOException {
-    }
-
-    @Override
-    public byte[] array() {
-        return data;
-    }
-
-    @Override
-    public int size() {
-        return size;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Data.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Data.java
deleted file mode 100755
index 6935a2a..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Data.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.amf.AmfString;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * AMF Data packet
- * 
- * Also known as NOTIFY in some RTMP implementations.
- * 
- * The client or the server sends this message to send Metadata or any user data
- * to the peer. Metadata includes details about the data (audio, video etc.) 
- * like creation time, duration, theme and so on.
- * 
- * @author francois
- */
-public class Data extends VariableBodyRtmpPacket {
-
-    private String type;
-
-    public Data(RtmpHeader header) {
-        super(header);
-    }
-
-    public Data(String type) {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_OVER_CONNECTION, RtmpHeader.MessageType.DATA_AMF0));
-        this.type = type;
-    }
-
-    public String getType() {
-        return type;
-    }
-
-    public void setType(String type) {
-        this.type = type;
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        // Read notification type
-        type = AmfString.readStringFrom(in, false);
-        int bytesRead = AmfString.sizeOf(type, false);
-        // Read data body
-        readVariableData(in, bytesRead);
-    }
-
-    /** 
-     * This method is public for Data to make it easy to dump its contents to 
-     * another output stream
-     */
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        AmfString.writeStringTo(out, type, false);
-        writeVariableData(out);
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Handshake.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Handshake.java
deleted file mode 100755
index 2a7b50f..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Handshake.java
+++ /dev/null
@@ -1,223 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Random;
-
-import android.util.Log;
-
-import com.github.faucamp.simplertmp.Crypto;
-import com.github.faucamp.simplertmp.Util;
-
-/**
- * Handles the RTMP handshake song 'n dance
- * 
- * Thanks to http://thompsonng.blogspot.com/2010/11/rtmp-part-10-handshake.html for some very useful information on
- * the the hidden "features" of the RTMP handshake
- * 
- * @author francois
- */
-public final class Handshake {
-    private static final String TAG = "Handshake";
-    /** S1 as sent by the server */
-    private byte[] s1;
-    private static final int PROTOCOL_VERSION = 0x03;
-    private static final int HANDSHAKE_SIZE = 1536;
-    private static final int SHA256_DIGEST_SIZE = 32;
-    
-    private static final int DIGEST_OFFSET_INDICATOR_POS = 772; // should either be byte 772 or byte 8
-    
-    private static final byte[] GENUINE_FP_KEY = {
-        (byte) 0x47, (byte) 0x65, (byte) 0x6E, (byte) 0x75, (byte) 0x69, (byte) 0x6E, (byte) 0x65, (byte) 0x20,
-        (byte) 0x41, (byte) 0x64, (byte) 0x6F, (byte) 0x62, (byte) 0x65, (byte) 0x20, (byte) 0x46, (byte) 0x6C,
-        (byte) 0x61, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x50, (byte) 0x6C, (byte) 0x61, (byte) 0x79,
-        (byte) 0x65, (byte) 0x72, (byte) 0x20, (byte) 0x30, (byte) 0x30, (byte) 0x31, // Genuine Adobe Flash Player 001
-        (byte) 0xF0, (byte) 0xEE, (byte) 0xC2, (byte) 0x4A, (byte) 0x80, (byte) 0x68, (byte) 0xBE, (byte) 0xE8,
-        (byte) 0x2E, (byte) 0x00, (byte) 0xD0, (byte) 0xD1, (byte) 0x02, (byte) 0x9E, (byte) 0x7E, (byte) 0x57,
-        (byte) 0x6E, (byte) 0xEC, (byte) 0x5D, (byte) 0x2D, (byte) 0x29, (byte) 0x80, (byte) 0x6F, (byte) 0xAB,
-        (byte) 0x93, (byte) 0xB8, (byte) 0xE6, (byte) 0x36, (byte) 0xCF, (byte) 0xEB, (byte) 0x31, (byte) 0xAE};
-
-    /** Generates and writes the first handshake packet (C0) */
-    public final void writeC0(OutputStream out) throws IOException {
-        Log.d(TAG, "writeC0");
-        out.write(PROTOCOL_VERSION);
-    }
-
-    public final void readS0(InputStream in) throws IOException {
-        Log.d(TAG, "readS0");
-        byte s0 = (byte) in.read();
-        if (s0 != PROTOCOL_VERSION) {
-            if (s0 == -1) {
-                throw new IOException("InputStream closed");
-            } else {
-                throw new IOException("Invalid RTMP protocol version; expected " + PROTOCOL_VERSION + ", got " + s0);
-            }
-        }
-    }
-
-    /** Generates and writes the second handshake packet (C1) */
-    public final void writeC1(OutputStream out) throws IOException {
-        Log.d(TAG, "writeC1");
-//        Util.writeUnsignedInt32(out, (int) (System.currentTimeMillis() / 1000)); // Bytes 0 - 3 bytes: current epoch (timestamp)
-        //out.write(new byte[]{0x09, 0x00, 0x7c, 0x02}); // Bytes 4 - 7: Flash player version: 9.0.124.2
-
-//        out.write(new byte[]{(byte) 0x80, 0x00, 0x07, 0x02}); // Bytes 4 - 7: Flash player version: 11.2.202.233
-
-
-        Log.d(TAG, "writeC1(): Calculating digest offset");
-        Random random = new Random();
-        // Since we are faking a real Flash Player handshake, include a digest in C1        
-        // Choose digest offset point (scheme 1; that is, offset is indicated by bytes 772 - 775 (4 bytes) )
-        final int digestOffset = random.nextInt(HANDSHAKE_SIZE - DIGEST_OFFSET_INDICATOR_POS - 4 - 8 - SHA256_DIGEST_SIZE); //random.nextInt(DIGEST_OFFSET_INDICATOR_POS - SHA256_DIGEST_SIZE);
-
-        final int absoluteDigestOffset = ((digestOffset % 728) + DIGEST_OFFSET_INDICATOR_POS + 4);
-        Log.d(TAG, "writeC1(): (real value of) digestOffset: " + digestOffset);
-        
-        
-        Log.d(TAG, "writeC1(): recalculated digestOffset: " + absoluteDigestOffset);
-
-        int remaining = digestOffset;
-        final byte[] digestOffsetBytes = new byte[4];
-        for (int i = 3; i >= 0; i--) {            
-            if (remaining > 255) {
-                digestOffsetBytes[i] = (byte)255;
-                remaining -= 255;
-            } else {
-                digestOffsetBytes[i] = (byte)remaining;
-                remaining -= remaining;
-            }
-        }
-        
-        
-        
-        
-        // Calculate the offset value that will be written
-        //inal byte[] digestOffsetBytes = Util.unsignedInt32ToByteArray(digestOffset);// //((digestOffset - DIGEST_OFFSET_INDICATOR_POS) % 728)); // Thanks to librtmp for the mod 728                
-        Log.d(TAG, "writeC1(): digestOffsetBytes: " + Util.toHexString(digestOffsetBytes));  //Util.unsignedInt32ToByteArray((digestOffset % 728))));
-
-        // Create random bytes up to the digest offset point
-        byte[] partBeforeDigest = new byte[absoluteDigestOffset];
-        Log.d(TAG, "partBeforeDigest(): size: " + partBeforeDigest.length);
-        random.nextBytes(partBeforeDigest);
-
-        Log.d(TAG, "writeC1(): Writing timestamp and Flash Player version");
-        byte[] timeStamp = Util.unsignedInt32ToByteArray((int) (System.currentTimeMillis() / 1000));
-        System.arraycopy(timeStamp, 0, partBeforeDigest, 0, 4); // Bytes 0 - 3 bytes: current epoch timestamp
-        System.arraycopy(new byte[]{(byte) 0x80, 0x00, 0x07, 0x02}, 0, partBeforeDigest, 4, 4); // Bytes 4 - 7: Flash player version: 11.2.202.233
-
-        // Create random bytes for the part after the digest
-        byte[] partAfterDigest = new byte[HANDSHAKE_SIZE - absoluteDigestOffset - SHA256_DIGEST_SIZE]; // subtract 8 because of initial 8 bytes already written
-        Log.d(TAG, "partAfterDigest(): size: " + partAfterDigest.length);
-        random.nextBytes(partAfterDigest);
-
-
-        // Set the offset byte
-//        if (digestOffset > 772) {                      
-            Log.d(TAG, "copying digest offset bytes in partBeforeDigest");
-            System.arraycopy(digestOffsetBytes, 0, partBeforeDigest, 772, 4);          
-//        } else {
-        // Implied offset of partAfterDigest is digestOffset + 32
-///        Log.d(TAG, "copying digest offset bytes in partAfterDigest");
-///        Log.d(TAG, " writing to location: " + (DIGEST_OFFSET_INDICATOR_POS - digestOffset - SHA256_DIGEST_SIZE - 8));
-//        System.arraycopy(digestOffsetBytes, 0, partAfterDigest, (DIGEST_OFFSET_INDICATOR_POS - digestOffset - SHA256_DIGEST_SIZE - 8), 4);
-//        }
-
-        Log.d(TAG, "writeC1(): Calculating digest");
-        byte[] tempBuffer = new byte[HANDSHAKE_SIZE - SHA256_DIGEST_SIZE];
-        System.arraycopy(partBeforeDigest, 0, tempBuffer, 0, partBeforeDigest.length);
-        System.arraycopy(partAfterDigest, 0, tempBuffer, partBeforeDigest.length, partAfterDigest.length);
-
-        Crypto crypto = new Crypto();
-        byte[] digest = crypto.calculateHmacSHA256(tempBuffer, GENUINE_FP_KEY, 30);
-
-        // Now write the packet
-        Log.d(TAG, "writeC1(): writing C1 packet");
-        out.write(partBeforeDigest);
-        out.write(digest);
-        out.write(partAfterDigest);
-    }
-
-    public final void readS1(InputStream in) throws IOException {
-        // S1 == 1536 bytes. We do not bother with checking the content of it
-        Log.d(TAG, "readS1");
-        s1 = new byte[HANDSHAKE_SIZE];
-
-        // Read server time (4 bytes)
-        int totalBytesRead = 0;
-        int read;
-        do {
-            read = in.read(s1, totalBytesRead, (HANDSHAKE_SIZE - totalBytesRead));
-            if (read != -1) {
-                totalBytesRead += read;
-            }
-        } while (totalBytesRead < HANDSHAKE_SIZE);
-
-        if (totalBytesRead != HANDSHAKE_SIZE) {
-            throw new IOException("Unexpected EOF while reading S1, expected " + HANDSHAKE_SIZE + " bytes, but only read " + totalBytesRead + " bytes");
-        } else {
-            Log.d(TAG, "readS1(): S1 total bytes read OK");
-        }
-    }
-
-    /** Generates and writes the third handshake packet (C2) */
-    public final void writeC2(OutputStream out) throws IOException {
-        Log.d(TAG, "readC2");
-        // C2 is an echo of S1
-        if (s1 == null) {
-            throw new IllegalStateException("C2 cannot be written without S1 being read first");
-        }
-        out.write(s1);
-    }
-
-    public final void readS2(InputStream in) throws IOException {
-        // S2 should be an echo of C1, but we are not too strict
-        Log.d(TAG, "readS2");
-        byte[] sr_serverTime = new byte[4];
-        byte[] s2_serverVersion = new byte[4];
-        byte[] s2_rest = new byte[HANDSHAKE_SIZE - 8]; // subtract 4+4 bytes for time and version
-
-        // Read server time (4 bytes)
-        int totalBytesRead = 0;
-        int read;
-        do {
-            read = in.read(sr_serverTime, totalBytesRead, (4 - totalBytesRead));
-            if (read == -1) {
-                // End of stream reached - should not have happened at this point
-                throw new IOException("Unexpected EOF while reading S2 bytes 0-3");
-            } else {
-                totalBytesRead += read;
-            }
-        } while (totalBytesRead < 4);
-
-        // Read server version (4 bytes)
-        totalBytesRead = 0;
-        do {
-            read = in.read(s2_serverVersion, totalBytesRead, (4 - totalBytesRead));
-            if (read == -1) {
-                // End of stream reached - should not have happened at this point
-                throw new IOException("Unexpected EOF while reading S2 bytes 4-7");
-            } else {
-                totalBytesRead += read;
-            }
-        } while (totalBytesRead < 4);
-
-        // Read 1528 bytes (to make up S1 total size of 1536 bytes)
-        final int remainingBytes = HANDSHAKE_SIZE - 8;
-        totalBytesRead = 0;
-        do {
-            read = in.read(s2_rest, totalBytesRead, (remainingBytes - totalBytesRead));
-            if (read != -1) {
-                totalBytesRead += read;
-            }
-        } while (totalBytesRead < remainingBytes && read != -1);
-
-        if (totalBytesRead != remainingBytes) {
-            throw new IOException("Unexpected EOF while reading remainder of S2, expected " + remainingBytes + " bytes, but only read " + totalBytesRead + " bytes");
-        } else {
-            Log.d(TAG, "readS2(): S2 total bytes read OK");
-        }
-
-        // Technically we should check that S2 == C1, but for now this is ignored
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/RtmpHeader.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/RtmpHeader.java
deleted file mode 100755
index 1ed61e9..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/RtmpHeader.java
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.HashMap;
-import java.util.Map;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-import com.github.faucamp.simplertmp.io.RtmpSessionInfo;
-
-/**
- *
- * @author francois, leoma
- */
-public class RtmpHeader {
-
-    private static final String TAG = "RtmpHeader";
-    /**
-     * RTMP packet/message type definitions.
-     * Note: docstrings are adapted from the official Adobe RTMP spec:
-     * http://www.adobe.com/devnet/rtmp/
-     */
-    public enum MessageType {
-
-        /**
-         * Protocol control message 1
-         * Set Chunk Size, is used to notify the peer a new maximum chunk size to use.
-         */
-        SET_CHUNK_SIZE(0x01),
-        /** 
-         * Protocol control message 2
-         * Abort Message, is used to notify the peer if it is waiting for chunks
-         * to complete a message, then to discard the partially received message
-         * over a chunk stream and abort processing of that message.
-         */
-        ABORT(0x02),
-        /** 
-         * Protocol control message 3
-         * The client or the server sends the acknowledgment to the peer after
-         * receiving bytes equal to the window size. The window size is the
-         * maximum number of bytes that the sender sends without receiving
-         * acknowledgment from the receiver.
-         */
-        ACKNOWLEDGEMENT(0x03),
-        /**
-         * Protocol control message 4
-         * The client or the server sends this message to notify the peer about
-         * the user control events. This message carries Event type and Event
-         * data.
-         * Also known as a PING message in some RTMP implementations.
-         */
-        USER_CONTROL_MESSAGE(0x04),
-        /**
-         * Protocol control message 5
-         * The client or the server sends this message to inform the peer which
-         * window size to use when sending acknowledgment.
-         * Also known as ServerBW ("server bandwidth") in some RTMP implementations.
-         */
-        WINDOW_ACKNOWLEDGEMENT_SIZE(0x05),
-        /**
-         * Protocol control message 6
-         * The client or the server sends this message to update the output
-         * bandwidth of the peer. The output bandwidth value is the same as the
-         * window size for the peer.
-         * Also known as ClientBW ("client bandwidth") in some RTMP implementations.
-         */
-        SET_PEER_BANDWIDTH(0x06),
-        /**
-         * RTMP audio packet (0x08)
-         * The client or the server sends this message to send audio data to the peer.
-         */
-        AUDIO(0x08),
-        /**
-         * RTMP video packet (0x09)
-         * The client or the server sends this message to send video data to the peer.         
-         */
-        VIDEO(0x09),
-        /**
-         * RTMP message type 0x0F
-         * The client or the server sends this message to send Metadata or any
-         * user data to the peer. Metadata includes details about the data (audio, video etc.) 
-         * like creation time, duration, theme and so on.
-         * This is the AMF3-encoded version.
-         */
-        DATA_AMF3(0x0F),
-        /**
-         * RTMP message type 0x10 
-         * A shared object is a Flash object (a collection of name value pairs)
-         * that are in synchronization across multiple clients, instances, and
-         * so on. 
-         * This is the AMF3 version: kMsgContainerEx=16 for AMF3.
-         */
-        SHARED_OBJECT_AMF3(0x10),
-        /**
-         * RTMP message type 0x11
-         * Command messages carry the AMF-encoded commands between the client
-         * and the server.
-         * A command message consists of command name, transaction ID, and command object that
-         * contains related parameters.
-         * This is the AMF3-encoded version.
-         */
-        COMMAND_AMF3(0x11),
-        /**
-         * RTMP message type 0x12
-         * The client or the server sends this message to send Metadata or any
-         * user data to the peer. Metadata includes details about the data (audio, video etc.) 
-         * like creation time, duration, theme and so on.
-         * This is the AMF0-encoded version.
-         */
-        DATA_AMF0(0x12),
-        /**
-         * RTMP message type 0x14
-         * Command messages carry the AMF-encoded commands between the client
-         * and the server.
-         * A command message consists of command name, transaction ID, and command object that
-         * contains related parameters.
-         * This is the common AMF0 version, also known as INVOKE in some RTMP implementations.
-         */
-        COMMAND_AMF0(0x14),
-        /**
-         * RTMP message type 0x13 
-         * A shared object is a Flash object (a collection of name value pairs)
-         * that are in synchronization across multiple clients, instances, and
-         * so on. 
-         * This is the AMF0 version: kMsgContainer=19 for AMF0.         
-         */
-        SHARED_OBJECT_AMF0(0x13),
-        /**
-         * RTMP message type 0x16
-         * An aggregate message is a single message that contains a list of sub-messages.
-         */
-        AGGREGATE_MESSAGE(0x16);
-        private byte value;
-        private static final Map<Byte, MessageType> quickLookupMap = new HashMap<Byte, MessageType>();
-
-        static {
-            for (MessageType messageTypId : MessageType.values()) {
-                quickLookupMap.put(messageTypId.getValue(), messageTypId);
-            }
-        }
-
-        MessageType(int value) {
-            this.value = (byte) value;
-        }
-
-        /** Returns the value of this chunk type */
-        public byte getValue() {
-            return value;
-        }
-
-        public static MessageType valueOf(byte messageTypeId) {
-            if (quickLookupMap.containsKey(messageTypeId)) {
-                return quickLookupMap.get(messageTypeId);
-            } else {
-                throw new IllegalArgumentException("Unknown message type byte: " + Util.toHexString(messageTypeId));
-            }
-        }
-    }
-
-    public enum ChunkType {
-
-        /** Full 12-byte RTMP chunk header */
-        TYPE_0_FULL(0x00),
-        /** Relative 8-byte RTMP chunk header (message stream ID is not included) */
-        TYPE_1_RELATIVE_LARGE(0x01),
-        /** Relative 4-byte RTMP chunk header (only timestamp delta) */
-        TYPE_2_RELATIVE_TIMESTAMP_ONLY(0x02),
-        /** Relative 1-byte RTMP chunk header (no "real" header, just the 1-byte indicating chunk header type & chunk stream ID) */
-        TYPE_3_RELATIVE_SINGLE_BYTE(0x03);
-        /** The byte value of this chunk header type */
-        private byte value;
-        /** The full size (in bytes) of this RTMP header (including the basic header byte) */
-        private static final Map<Byte, ChunkType> quickLookupMap = new HashMap<Byte, ChunkType>();
-        
-        static {
-            for (ChunkType messageTypId : ChunkType.values()) {
-                quickLookupMap.put(messageTypId.getValue(), messageTypId);
-            }
-        }
-
-        ChunkType(int byteValue) {
-            this.value = (byte) byteValue;
-        }
-
-        /** Returns the byte value of this chunk header type */
-        public byte getValue() {
-            return value;
-        }
-
-        public static ChunkType valueOf(byte chunkHeaderType) {
-            if (quickLookupMap.containsKey(chunkHeaderType)) {
-                return quickLookupMap.get(chunkHeaderType);
-            } else {
-                throw new IllegalArgumentException("Unknown chunk header type byte: " + Util.toHexString(chunkHeaderType));
-            }
-        }
-    }
-    private ChunkType chunkType;
-    private int chunkStreamId;
-    private int absoluteTimestamp;
-    private int timestampDelta = -1;
-    private int packetLength;
-    private MessageType messageType;
-    private int messageStreamId;
-    private int extendedTimestamp;
-
-    public RtmpHeader() {
-    }
-
-    public RtmpHeader(ChunkType chunkType, int chunkStreamId, MessageType messageType) {
-        this.chunkType = chunkType;
-        this.chunkStreamId = chunkStreamId;
-        this.messageType = messageType;
-    }
-
-    public static RtmpHeader readHeader(InputStream in, RtmpSessionInfo rtmpSessionInfo) throws IOException {
-        RtmpHeader rtmpHeader = new RtmpHeader();
-        rtmpHeader.readHeaderImpl(in, rtmpSessionInfo);
-        return rtmpHeader;
-    }
-
-    private void readHeaderImpl(InputStream in, RtmpSessionInfo rtmpSessionInfo) throws IOException {
-
-        int basicHeaderByte = in.read();
-        if (basicHeaderByte == -1) {
-            throw new EOFException("Unexpected EOF while reading RTMP packet basic header");
-        }
-        // Read byte 0: chunk type and chunk stream ID
-        parseBasicHeader((byte) basicHeaderByte);
-
-        switch (chunkType) {
-            case TYPE_0_FULL: { //  b00 = 12 byte header (full header) 
-                // Read bytes 1-3: Absolute timestamp
-                absoluteTimestamp = Util.readUnsignedInt24(in);
-                timestampDelta = 0;
-                // Read bytes 4-6: Packet length
-                packetLength = Util.readUnsignedInt24(in);
-                // Read byte 7: Message type ID
-                messageType = MessageType.valueOf((byte) in.read());
-                // Read bytes 8-11: Message stream ID (apparently little-endian order)
-                byte[] messageStreamIdBytes = new byte[4];
-                Util.readBytesUntilFull(in, messageStreamIdBytes);
-                messageStreamId = Util.toUnsignedInt32LittleEndian(messageStreamIdBytes);
-                // Read bytes 1-4: Extended timestamp
-                extendedTimestamp = absoluteTimestamp >= 0xffffff ? Util.readUnsignedInt32(in) : 0;
-                if (extendedTimestamp != 0) {
-                    absoluteTimestamp = extendedTimestamp;
-                }
-                break;
-            }
-            case TYPE_1_RELATIVE_LARGE: { // b01 = 8 bytes - like type 0. not including message stream ID (4 last bytes)
-                // Read bytes 1-3: Timestamp delta
-                timestampDelta = Util.readUnsignedInt24(in);
-                // Read bytes 4-6: Packet length
-                packetLength = Util.readUnsignedInt24(in);
-                // Read byte 7: Message type ID
-                messageType = MessageType.valueOf((byte) in.read());
-                // Read bytes 1-4: Extended timestamp delta
-                extendedTimestamp = timestampDelta >= 0xffffff ? Util.readUnsignedInt32(in) : 0;
-                RtmpHeader prevHeader = rtmpSessionInfo.getChunkStreamInfo(chunkStreamId).prevHeaderRx();
-                if (prevHeader != null) {
-                    messageStreamId = prevHeader.messageStreamId;
-                    absoluteTimestamp = extendedTimestamp != 0 ? extendedTimestamp : prevHeader.absoluteTimestamp + timestampDelta;
-                } else {
-                    messageStreamId = 0;
-                    absoluteTimestamp = extendedTimestamp != 0 ? extendedTimestamp : timestampDelta;
-                }
-                break;
-            }
-            case TYPE_2_RELATIVE_TIMESTAMP_ONLY: { // b10 = 4 bytes - Basic Header and timestamp (3 bytes) are included
-                // Read bytes 1-3: Timestamp delta
-                timestampDelta = Util.readUnsignedInt24(in);
-                // Read bytes 1-4: Extended timestamp delta
-                extendedTimestamp = timestampDelta >= 0xffffff ? Util.readUnsignedInt32(in) : 0;
-                RtmpHeader prevHeader = rtmpSessionInfo.getChunkStreamInfo(chunkStreamId).prevHeaderRx();
-                packetLength = prevHeader.packetLength;
-                messageType = prevHeader.messageType;
-                messageStreamId = prevHeader.messageStreamId;
-                absoluteTimestamp = extendedTimestamp != 0 ? extendedTimestamp : prevHeader.absoluteTimestamp + timestampDelta;
-                break;
-            }
-            case TYPE_3_RELATIVE_SINGLE_BYTE: { // b11 = 1 byte: basic header only 
-                RtmpHeader prevHeader = rtmpSessionInfo.getChunkStreamInfo(chunkStreamId).prevHeaderRx();
-                // Read bytes 1-4: Extended timestamp
-                extendedTimestamp = prevHeader.timestampDelta >= 0xffffff ? Util.readUnsignedInt32(in) : 0;
-                timestampDelta = extendedTimestamp != 0 ? 0xffffff : prevHeader.timestampDelta;
-                packetLength = prevHeader.packetLength;
-                messageType = prevHeader.messageType;
-                messageStreamId = prevHeader.messageStreamId;
-                absoluteTimestamp = extendedTimestamp != 0 ? extendedTimestamp : prevHeader.absoluteTimestamp + timestampDelta;
-                break;
-            }
-            default:
-                throw new IOException("Invalid chunk type; basic header byte was: " + Util.toHexString((byte) basicHeaderByte));
-        }
-    }
-
-    public void writeTo(OutputStream out, ChunkType chunkType, final ChunkStreamInfo chunkStreamInfo) throws IOException {
-        // Write basic header byte
-        out.write(((byte) (chunkType.getValue() << 6) | chunkStreamId));
-        switch (chunkType) {
-            case TYPE_0_FULL: { //  b00 = 12 byte header (full header)
-                chunkStreamInfo.markDeltaTimestampTx();
-                Util.writeUnsignedInt24(out, absoluteTimestamp >= 0xffffff ? 0xffffff : absoluteTimestamp);
-                Util.writeUnsignedInt24(out, packetLength);
-                out.write(messageType.getValue());
-                Util.writeUnsignedInt32LittleEndian(out, messageStreamId);
-                if (absoluteTimestamp >= 0xffffff) {
-                    extendedTimestamp = absoluteTimestamp;
-                    Util.writeUnsignedInt32(out, extendedTimestamp);
-                }
-                break;
-            }
-            case TYPE_1_RELATIVE_LARGE: { // b01 = 8 bytes - like type 0. not including message ID (4 last bytes)
-                timestampDelta = (int) chunkStreamInfo.markDeltaTimestampTx();
-                absoluteTimestamp = chunkStreamInfo.getPrevHeaderTx().getAbsoluteTimestamp() + timestampDelta;
-                Util.writeUnsignedInt24(out, absoluteTimestamp >= 0xffffff ? 0xffffff : timestampDelta);
-                Util.writeUnsignedInt24(out, packetLength);
-                out.write(messageType.getValue());
-                if (absoluteTimestamp >= 0xffffff) {
-                    extendedTimestamp = absoluteTimestamp;
-                    Util.writeUnsignedInt32(out, absoluteTimestamp);
-                }
-                break;
-            }
-            case TYPE_2_RELATIVE_TIMESTAMP_ONLY: { // b10 = 4 bytes - Basic Header and timestamp (3 bytes) are included
-                timestampDelta = (int) chunkStreamInfo.markDeltaTimestampTx();
-                absoluteTimestamp = chunkStreamInfo.getPrevHeaderTx().getAbsoluteTimestamp() + timestampDelta;
-                Util.writeUnsignedInt24(out, (absoluteTimestamp >= 0xffffff) ? 0xffffff : timestampDelta);
-                if (absoluteTimestamp >= 0xffffff) {
-                    extendedTimestamp = absoluteTimestamp;
-                    Util.writeUnsignedInt32(out, extendedTimestamp);
-                }
-                break;
-            }
-            case TYPE_3_RELATIVE_SINGLE_BYTE: { // b11 = 1 byte: basic header only
-                timestampDelta = (int) chunkStreamInfo.markDeltaTimestampTx();
-                absoluteTimestamp = chunkStreamInfo.getPrevHeaderTx().getAbsoluteTimestamp() + timestampDelta;
-                if (absoluteTimestamp >= 0xffffff) {
-                    extendedTimestamp = absoluteTimestamp;
-                    Util.writeUnsignedInt32(out, extendedTimestamp);
-                }
-                break;
-            }
-            default:
-                throw new IOException("Invalid chunk type: " + chunkType);
-        }
-    }
-
-    private void parseBasicHeader(byte basicHeaderByte) {
-        chunkType = ChunkType.valueOf((byte) ((0xff & basicHeaderByte) >>> 6)); // 2 most significant bits define the chunk type
-        chunkStreamId = basicHeaderByte & 0x3F; // 6 least significant bits define chunk stream ID
-    }
-
-    /** @return the RTMP chunk stream ID (channel ID) for this chunk */
-    public int getChunkStreamId() {
-        return chunkStreamId;
-    }
-
-    public ChunkType getChunkType() {
-        return chunkType;
-    }
-
-    public int getPacketLength() {
-        return packetLength;
-    }
-
-    public int getMessageStreamId() {
-        return messageStreamId;
-    }
-
-    public MessageType getMessageType() {
-        return messageType;
-    }
-
-    public int getAbsoluteTimestamp() {
-        return absoluteTimestamp;
-    }
-
-    public void setAbsoluteTimestamp(int absoluteTimestamp) {
-        this.absoluteTimestamp = absoluteTimestamp;
-    }
-
-    public int getTimestampDelta() {
-        return timestampDelta;
-    }
-
-    public void setTimestampDelta(int timestampDelta) {
-        this.timestampDelta = timestampDelta;
-    }
-
-    /** Sets the RTMP chunk stream ID (channel ID) for this chunk */
-    public void setChunkStreamId(int channelId) {
-        this.chunkStreamId = channelId;
-    }
-
-    public void setChunkType(ChunkType chunkType) {
-        this.chunkType = chunkType;
-    }
-
-    public void setMessageStreamId(int messageStreamId) {
-        this.messageStreamId = messageStreamId;
-    }
-
-    public void setMessageType(MessageType messageType) {
-        this.messageType = messageType;
-    }
-
-    public void setPacketLength(int packetLength) {
-        this.packetLength = packetLength;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/RtmpPacket.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/RtmpPacket.java
deleted file mode 100755
index 044d10e..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/RtmpPacket.java
+++ /dev/null
@@ -1,55 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import android.content.res.Configuration;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- *
- * @author francois, leo
- */
-public abstract class RtmpPacket {
-     
-    protected RtmpHeader header;
-
-    public RtmpPacket(RtmpHeader header) {
-        this.header = header;
-    }
-
-    public RtmpHeader getHeader() {
-        return header;
-    }
-    
-    public abstract void readBody(InputStream in) throws IOException;    
-    
-    protected abstract void writeBody(OutputStream out) throws IOException;
-
-    protected abstract byte[] array();
-
-    protected abstract int size();
-
-    public void writeTo(OutputStream out, final int chunkSize, final ChunkStreamInfo chunkStreamInfo) throws IOException {
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        writeBody(baos);
-        byte[] body = this instanceof ContentData ? array() : baos.toByteArray();
-        int length = this instanceof ContentData ? size() : body.length;
-        header.setPacketLength(length);
-        // Write header for first chunk
-        header.writeTo(out, RtmpHeader.ChunkType.TYPE_0_FULL, chunkStreamInfo);
-        int pos = 0;
-        while (length > chunkSize) {
-            // Write packet for chunk
-            out.write(body, pos, chunkSize);
-            length -= chunkSize;
-            pos += chunkSize;
-            // Write header for remain chunk
-            header.writeTo(out, RtmpHeader.ChunkType.TYPE_3_RELATIVE_SINGLE_BYTE, chunkStreamInfo);
-        }
-        out.write(body, pos, length);
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/SetChunkSize.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/SetChunkSize.java
deleted file mode 100755
index 4a7b6e3..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/SetChunkSize.java
+++ /dev/null
@@ -1,56 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * A "Set chunk size" RTMP message, received on chunk stream ID 2 (control channel)
- * 
- * @author francois
- */
-public class SetChunkSize extends RtmpPacket {
-
-    private int chunkSize;
-
-    public SetChunkSize(RtmpHeader header) {
-        super(header);
-    }
-
-    public SetChunkSize(int chunkSize) {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_1_RELATIVE_LARGE, ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL, RtmpHeader.MessageType.SET_CHUNK_SIZE));
-        this.chunkSize = chunkSize;
-    }
-
-    public int getChunkSize() {
-        return chunkSize;
-    }
-
-    public void setChunkSize(int chunkSize) {
-        this.chunkSize = chunkSize;
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        // Value is received in the 4 bytes of the body
-        chunkSize = Util.readUnsignedInt32(in);
-    }
-
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        Util.writeUnsignedInt32(out, chunkSize);
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/SetPeerBandwidth.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/SetPeerBandwidth.java
deleted file mode 100755
index 8dd8b0f..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/SetPeerBandwidth.java
+++ /dev/null
@@ -1,115 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.HashMap;
-import java.util.Map;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * Set Peer Bandwidth
- * 
- * Also known as ClientrBW ("client bandwidth") in some RTMP implementations.
- * 
- * @author francois
- */
-public class SetPeerBandwidth extends RtmpPacket {
-
-    /**
-     * Bandwidth limiting type
-     */
-    public static enum LimitType {
-
-        /** 
-         * In a hard (0) request, the peer must send the data in the provided bandwidth. 
-         */
-        HARD(0),
-        /** 
-         * In a soft (1) request, the bandwidth is at the discretion of the peer
-         * and the sender can limit the bandwidth.
-         */
-        SOFT(1),
-        /**
-         * In a dynamic (2) request, the bandwidth can be hard or soft.
-         */
-        DYNAMIC(2);
-        private int intValue;
-        private static final Map<Integer, LimitType> quickLookupMap = new HashMap<Integer, LimitType>();
-        
-        static {
-            for (LimitType type : LimitType.values()) {
-                quickLookupMap.put(type.getIntValue(), type);
-            }
-        }
-        
-        private LimitType(int intValue) {
-            this.intValue = intValue;
-        }
-        
-        public int getIntValue() {
-            return intValue;
-        }
-        
-        public static LimitType valueOf(int intValue) {
-            return quickLookupMap.get(intValue);
-        }
-    }
-    private int acknowledgementWindowSize;
-    private LimitType limitType;
-    
-    public SetPeerBandwidth(RtmpHeader header) {
-        super(header);
-    }
-    
-    public SetPeerBandwidth(int acknowledgementWindowSize, LimitType limitType, ChunkStreamInfo channelInfo) {
-        super(new RtmpHeader(channelInfo.canReusePrevHeaderTx(RtmpHeader.MessageType.SET_PEER_BANDWIDTH) ? RtmpHeader.ChunkType.TYPE_2_RELATIVE_TIMESTAMP_ONLY : RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL, RtmpHeader.MessageType.WINDOW_ACKNOWLEDGEMENT_SIZE));
-        this.acknowledgementWindowSize = acknowledgementWindowSize;
-        this.limitType = limitType;
-    }
-    
-    public int getAcknowledgementWindowSize() {
-        return acknowledgementWindowSize;
-    }
-    
-    public void setAcknowledgementWindowSize(int acknowledgementWindowSize) {
-        this.acknowledgementWindowSize = acknowledgementWindowSize;
-    }
-    
-    public LimitType getLimitType() {
-        return limitType;
-    }
-    
-    public void setLimitType(LimitType limitType) {
-        this.limitType = limitType;
-    }
-    
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        acknowledgementWindowSize = Util.readUnsignedInt32(in);
-        limitType = LimitType.valueOf(in.read());
-    }
-    
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        Util.writeUnsignedInt32(out, acknowledgementWindowSize);
-        out.write(limitType.getIntValue());
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP Set Peer Bandwidth";
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/UserControl.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/UserControl.java
deleted file mode 100755
index 487d84d..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/UserControl.java
+++ /dev/null
@@ -1,252 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.HashMap;
-import java.util.Map;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * User Control message, such as ping
- * 
- * @author francois
- */
-public class UserControl extends RtmpPacket {
-
-    /**
-     * Control message type
-     * Docstring adapted from the official Adobe RTMP spec, section 3.7
-     */
-    public static enum Type {
-
-        /** 
-         * Type: 0
-         * The server sends this event to notify the client that a stream has become 
-         * functional and can be used for communication. By default, this event
-         * is sent on ID 0 after the application connect command is successfully
-         * received from the client. 
-         * 
-         * Event Data:
-         * eventData[0] (int) the stream ID of the stream that became functional
-         */
-        STREAM_BEGIN(0),
-        /**
-         * Type: 1
-         * The server sends this event to notify the client that the playback of
-         * data is over as requested on this stream. No more data is sent without
-         * issuing additional commands. The client discards the messages received
-         * for the stream.
-         * 
-         * Event Data:
-         * eventData[0]: the ID of thestream on which playback has ended.
-         */
-        STREAM_EOF(1),
-        /**
-         * Type: 2
-         * The server sends this event to notify the client that there is no 
-         * more data on the stream. If the server does not detect any message for
-         * a time period, it can notify the subscribed clients that the stream is 
-         * dry. 
-         * 
-         * Event Data:
-         * eventData[0]: the stream ID of the dry stream. 
-         */
-        STREAM_DRY(2),
-        /**
-         * Type: 3
-         * The client sends this event to inform the server of the buffer size 
-         * (in milliseconds) that is used to buffer any data coming over a stream.
-         * This event is sent before the server starts  processing the stream.
-         * 
-         * Event Data:
-         * eventData[0]: the stream ID and 
-         * eventData[1]: the buffer length, in milliseconds.
-         */
-        SET_BUFFER_LENGTH(3),
-        /**
-         * Type: 4
-         * The server sends this event to notify the client that the stream is a
-         * recorded stream.
-         * 
-         * Event Data:
-         * eventData[0]: the stream ID of the recorded stream.         
-         */
-        STREAM_IS_RECORDED(4),
-        /**
-         * Type: 6
-         * The server sends this event to test whether the client is reachable. 
-         * 
-         * Event Data:
-         * eventData[0]: a timestamp representing the local server time when the server dispatched the command. 
-         * 
-         * The client responds with PING_RESPONSE on receiving PING_REQUEST.
-         */
-        PING_REQUEST(6),
-        /**
-         * Type: 7
-         * The client sends this event to the server in response to the ping request. 
-         * 
-         * Event Data:
-         * eventData[0]: the 4-byte timestamp which was received with the PING_REQUEST.
-         */
-        PONG_REPLY(7),
-        /**
-         * Type: 31 (0x1F)
-         * 
-         * This user control type is not specified in any official documentation, but
-         * is sent by Flash Media Server 3.5. Thanks to the rtmpdump devs for their
-         * explanation: 
-         * 
-         * Buffer Empty (unofficial name): After the server has sent a complete buffer, and
-         * sends this Buffer Empty message, it will wait until the play
-         * duration of that buffer has passed before sending a new buffer.
-         * The Buffer Ready message will be sent when the new buffer starts.
-         *
-         * (see also: http://repo.or.cz/w/rtmpdump.git/blob/8880d1456b282ee79979adbe7b6a6eb8ad371081:/librtmp/rtmp.c#l2787)         
-         */
-        BUFFER_EMPTY(31),
-        /**
-         * Type: 32 (0x20)
-         * 
-         * This user control type is not specified in any official documentation, but
-         * is sent by Flash Media Server 3.5. Thanks to the rtmpdump devs for their
-         * explanation: 
-         * 
-         * Buffer Ready (unofficial name): After the server has sent a complete buffer, and
-         * sends a Buffer Empty message, it will wait until the play
-         * duration of that buffer has passed before sending a new buffer.
-         * The Buffer Ready message will be sent when the new buffer starts.
-         * (There is no BufferReady message for the very first buffer;
-         * presumably the Stream Begin message is sufficient for that
-         * purpose.)
-         *
-         * (see also: http://repo.or.cz/w/rtmpdump.git/blob/8880d1456b282ee79979adbe7b6a6eb8ad371081:/librtmp/rtmp.c#l2787)         
-         */
-        BUFFER_READY(32);
-        
-        private int intValue;
-        private static final Map<Integer, Type> quickLookupMap = new HashMap<Integer, Type>();
-
-        static {
-            for (Type type : Type.values()) {
-                quickLookupMap.put(type.getIntValue(), type);
-            }
-        }
-
-        private Type(int intValue) {
-            this.intValue = intValue;
-        }
-
-        public int getIntValue() {
-            return intValue;
-        }
-
-        public static Type valueOf(int intValue) {
-            return quickLookupMap.get(intValue);
-        }
-    }
-    private Type type;
-    private int[] eventData;
-
-    public UserControl(RtmpHeader header) {
-        super(header);
-    }
-
-    public UserControl(ChunkStreamInfo channelInfo) {
-        super(new RtmpHeader(channelInfo.canReusePrevHeaderTx(RtmpHeader.MessageType.USER_CONTROL_MESSAGE) ? RtmpHeader.ChunkType.TYPE_2_RELATIVE_TIMESTAMP_ONLY : RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL, RtmpHeader.MessageType.USER_CONTROL_MESSAGE));
-    }
-
-    /** Convenience construtor that creates a "pong" message for the specified ping */
-    public UserControl(UserControl replyToPing, ChunkStreamInfo channelInfo) {
-        this(Type.PONG_REPLY, channelInfo);
-        this.eventData = replyToPing.eventData;
-    }
-
-    public UserControl(Type type, ChunkStreamInfo channelInfo) {
-        this(channelInfo);
-        this.type = type;
-    }
-
-    public Type getType() {
-        return type;
-    }
-
-    public void setType(Type type) {
-        this.type = type;
-    }
-
-    /** 
-     * Convenience method for getting the first event data item, as most user control
-     * message types only have one event data item anyway
-     * This is equivalent to calling <code>getEventData()[0]</code>
-     */
-    public int getFirstEventData() {
-        return eventData[0];
-    }
-
-    public int[] getEventData() {
-        return eventData;
-    }
-
-    /** Used to set (a single) event data for most user control message types */
-    public void setEventData(int eventData) {
-        if (type == Type.SET_BUFFER_LENGTH) {
-            throw new IllegalStateException("SET_BUFFER_LENGTH requires two event data values; use setEventData(int, int) instead");
-        }
-        this.eventData = new int[]{eventData};
-    }
-
-    /** Used to set event data for the SET_BUFFER_LENGTH user control message types */
-    public void setEventData(int streamId, int bufferLength) {
-        if (type != Type.SET_BUFFER_LENGTH) {
-            throw new IllegalStateException("User control type " + type + " requires only one event data value; use setEventData(int) instead");
-        }
-        this.eventData = new int[]{streamId, bufferLength};
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        // Bytes 0-1: first parameter: ping type (mandatory)
-        type = Type.valueOf(Util.readUnsignedInt16(in));
-        int bytesRead = 2;
-        // Event data (1 for most types, 2 for SET_BUFFER_LENGTH)
-        if (type == Type.SET_BUFFER_LENGTH) {
-            setEventData(Util.readUnsignedInt32(in), Util.readUnsignedInt32(in));
-            bytesRead += 8;
-        } else {
-            setEventData(Util.readUnsignedInt32(in));
-            bytesRead += 4;
-        }
-        // To ensure some strange non-specified UserControl/ping message does not slip through
-        assert header.getPacketLength() == bytesRead;
-    }
-
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        // Write the user control message type
-        Util.writeUnsignedInt16(out, type.getIntValue());
-        // Now write the event data
-        Util.writeUnsignedInt32(out, eventData[0]);
-        if (type == Type.SET_BUFFER_LENGTH) {
-            Util.writeUnsignedInt32(out, eventData[1]);
-        }
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP User Control (type: " + type + ", event data: " + eventData + ")";
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/VariableBodyRtmpPacket.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/VariableBodyRtmpPacket.java
deleted file mode 100755
index c670f61..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/VariableBodyRtmpPacket.java
+++ /dev/null
@@ -1,78 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.github.faucamp.simplertmp.amf.AmfBoolean;
-import com.github.faucamp.simplertmp.amf.AmfData;
-import com.github.faucamp.simplertmp.amf.AmfDecoder;
-import com.github.faucamp.simplertmp.amf.AmfNull;
-import com.github.faucamp.simplertmp.amf.AmfNumber;
-import com.github.faucamp.simplertmp.amf.AmfString;
-
-/**
- * RTMP packet with a "variable" body structure (i.e. the structure of the
- * body depends on some other state/parameter in the packet.
- * 
- * Examples of this type of packet are Command and Data; this abstract class
- * exists mostly for code re-use.
- * 
- * @author francois
- */
-public abstract class VariableBodyRtmpPacket extends RtmpPacket {
-
-    protected List<AmfData> data;
-
-    public VariableBodyRtmpPacket(RtmpHeader header) {
-        super(header);
-    }
-
-    public List<AmfData> getData() {
-        return data;
-    }
-
-    public void addData(String string) {
-        addData(new AmfString(string));
-    }
-
-    public void addData(double number) {
-        addData(new AmfNumber(number));
-    }
-    
-    public void addData(boolean bool) {
-        addData(new AmfBoolean(bool));
-    }
-
-    public void addData(AmfData dataItem) {
-        if (data == null) {
-            this.data = new ArrayList<AmfData>();
-        }
-        if (dataItem == null) {
-            dataItem = new AmfNull();
-        }
-        this.data.add(dataItem);
-    }
-
-    protected void readVariableData(final InputStream in, int bytesAlreadyRead) throws IOException {
-        // ...now read in arguments (if any)
-        do {
-            AmfData dataItem = AmfDecoder.readFrom(in);
-            addData(dataItem);
-            bytesAlreadyRead += dataItem.getSize();
-        } while (bytesAlreadyRead < header.getPacketLength());
-    }
-
-    protected void writeVariableData(final OutputStream out) throws IOException {
-        if (data != null) {
-            for (AmfData dataItem : data) {
-                dataItem.writeTo(out);
-            }
-        } else {
-            // Write a null
-            AmfNull.writeNullTo(out);
-        }
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/Video.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/Video.java
deleted file mode 100755
index 64d9050..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/Video.java
+++ /dev/null
@@ -1,24 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * Video data packet
- *  
- * @author francois
- */
-public class Video extends ContentData {
-
-    public Video(RtmpHeader header) {
-        super(header);
-    }
-
-    public Video() {
-        super(new RtmpHeader(RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_VIDEO, RtmpHeader.MessageType.VIDEO));
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP Video";
-    }
-}
diff --git a/android/src/main/java/com/github/faucamp/simplertmp/packets/WindowAckSize.java b/android/src/main/java/com/github/faucamp/simplertmp/packets/WindowAckSize.java
deleted file mode 100755
index 66d396f..0000000
--- a/android/src/main/java/com/github/faucamp/simplertmp/packets/WindowAckSize.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package com.github.faucamp.simplertmp.packets;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import com.github.faucamp.simplertmp.Util;
-import com.github.faucamp.simplertmp.io.ChunkStreamInfo;
-
-/**
- * Window Acknowledgement Size
- * 
- * Also known as ServerBW ("Server bandwidth") in some RTMP implementations.
- * 
- * @author francois
- */
-public class WindowAckSize extends RtmpPacket {
-
-    private int acknowledgementWindowSize;
-
-    public WindowAckSize(RtmpHeader header) {
-        super(header);
-    }
-    
-    public WindowAckSize(int acknowledgementWindowSize, ChunkStreamInfo channelInfo) {
-        super(new RtmpHeader(channelInfo.canReusePrevHeaderTx(RtmpHeader.MessageType.WINDOW_ACKNOWLEDGEMENT_SIZE) ? RtmpHeader.ChunkType.TYPE_2_RELATIVE_TIMESTAMP_ONLY : RtmpHeader.ChunkType.TYPE_0_FULL, ChunkStreamInfo.RTMP_CID_PROTOCOL_CONTROL, RtmpHeader.MessageType.WINDOW_ACKNOWLEDGEMENT_SIZE));
-        this.acknowledgementWindowSize = acknowledgementWindowSize;
-    }
-
-
-    public int getAcknowledgementWindowSize() {
-        return acknowledgementWindowSize;
-    }
-
-    public void setAcknowledgementWindowSize(int acknowledgementWindowSize) {
-        this.acknowledgementWindowSize = acknowledgementWindowSize;
-    }
-
-    @Override
-    public void readBody(InputStream in) throws IOException {
-        acknowledgementWindowSize = Util.readUnsignedInt32(in);
-    }
-
-    @Override
-    protected void writeBody(OutputStream out) throws IOException {
-        Util.writeUnsignedInt32(out, acknowledgementWindowSize);
-    }
-
-    @Override
-    protected byte[] array() {
-        return null;
-    }
-
-    @Override
-    protected int size() {
-        return 0;
-    }
-
-    @Override
-    public String toString() {
-        return "RTMP Window Acknowledgment Size";
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/AbstractBox.java b/android/src/main/java/com/googlecode/mp4parser/AbstractBox.java
deleted file mode 100755
index a2a4541..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/AbstractBox.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/*  
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.googlecode.mp4parser;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.ChannelHelper;
-import com.coremedia.iso.Hex;
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.coremedia.iso.boxes.UserBox;
-import com.googlecode.mp4parser.annotations.DoNotParseDetail;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.logging.Logger;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * A basic on-demand parsing box. Requires the implementation of three methods to become a fully working box:
- * <ol>
- * <li>{@link #_parseDetails(java.nio.ByteBuffer)}</li>
- * <li>{@link #getContent(java.nio.ByteBuffer)}</li>
- * <li>{@link #getContentSize()}</li>
- * </ol>
- * additionally this new box has to be put into the <code>isoparser-default.properties</code> file so that
- * it is accessible by the <code>PropertyBoxParserImpl</code>
- */
-public abstract class AbstractBox implements Box {
-    public static int MEM_MAP_THRESHOLD = 100 * 1024;
-    private static Logger LOG = Logger.getLogger(AbstractBox.class.getName());
-
-    protected String type;
-    private byte[] userType;
-    private ContainerBox parent;
-
-    private ByteBuffer content;
-    private ByteBuffer deadBytes = null;
-
-
-    protected AbstractBox(String type) {
-        this.type = type;
-    }
-
-    protected AbstractBox(String type, byte[] userType) {
-        this.type = type;
-        this.userType = userType;
-    }
-
-    /**
-     * Get the box's content size without its header. This must be the exact number of bytes
-     * that <code>getContent(ByteBuffer)</code> writes.
-     *
-     * @return Gets the box's content size in bytes
-     * @see #getContent(java.nio.ByteBuffer)
-     */
-    protected abstract long getContentSize();
-
-    /**
-     * Write the box's content into the given <code>ByteBuffer</code>. This must include flags
-     * and version in case of a full box. <code>byteBuffer</code> has been initialized with
-     * <code>getSize()</code> bytes.
-     *
-     * @param byteBuffer the sink for the box's content
-     */
-    protected abstract void getContent(ByteBuffer byteBuffer);
-
-    /**
-     * Parse the box's fields and child boxes if any.
-     *
-     * @param content the box's raw content beginning after the 4-cc field.
-     */
-    protected abstract void _parseDetails(ByteBuffer content);
-
-    /**
-     * Read the box's content from a byte channel without parsing it. Parsing is done on-demand.
-     *
-     * @param readableByteChannel the (part of the) iso file to parse
-     * @param contentSize         expected contentSize of the box
-     * @param boxParser           creates inner boxes
-     * @throws IOException in case of an I/O error.
-     */
-    @DoNotParseDetail
-    public void parse(ReadableByteChannel readableByteChannel, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException {
-        if (readableByteChannel instanceof FileChannel && contentSize > MEM_MAP_THRESHOLD) {
-            // todo: if I map this here delayed I could use transferFrom/transferTo in the getBox method
-            // todo: potentially this could speed up writing.
-            //
-            // It's quite expensive to map a file into the memory. Just do it when the box is larger than a MB.
-            content = ((FileChannel) readableByteChannel).map(FileChannel.MapMode.READ_ONLY, ((FileChannel) readableByteChannel).position(), contentSize);
-            ((FileChannel) readableByteChannel).position(((FileChannel) readableByteChannel).position() + contentSize);
-        } else {
-            assert contentSize < Integer.MAX_VALUE;
-            content = ChannelHelper.readFully(readableByteChannel, contentSize);
-        }
-        if (isParsed() == false) {
-            parseDetails();
-        }
-
-    }
-
-    public void getBox(WritableByteChannel os) throws IOException {
-        ByteBuffer bb = ByteBuffer.allocate(l2i(getSize()));
-        getHeader(bb);
-        if (content == null) {
-            getContent(bb);
-            if (deadBytes != null) {
-                deadBytes.rewind();
-                while (deadBytes.remaining() > 0) {
-                    bb.put(deadBytes);
-                }
-            }
-        } else {
-            content.rewind();
-            bb.put(content);
-        }
-        bb.rewind();
-        os.write(bb);
-    }
-
-
-    /**
-     * Parses the raw content of the box. It surrounds the actual parsing
-     * which is done
-     */
-    synchronized final void parseDetails() {
-        if (content != null) {
-            ByteBuffer content = this.content;
-            this.content = null;
-            content.rewind();
-            _parseDetails(content);
-            if (content.remaining() > 0) {
-                deadBytes = content.slice();
-            }
-            assert verify(content);
-        }
-    }
-
-    /**
-     * Sets the 'dead' bytes. These bytes are left if the content of the box
-     * has been parsed but not all bytes have been used up.
-     *
-     * @param newDeadBytes the unused bytes with no meaning but required for bytewise reconstruction
-     */
-    protected void setDeadBytes(ByteBuffer newDeadBytes) {
-        deadBytes = newDeadBytes;
-    }
-
-
-    /**
-     * Gets the full size of the box including header and content.
-     *
-     * @return the box's size
-     */
-    public long getSize() {
-        long size = (content == null ? getContentSize() : content.limit());
-        size += (8 + // size|type
-                (size >= ((1L << 32) - 8) ? 8 : 0) + // 32bit - 8 byte size and type
-                (UserBox.TYPE.equals(getType()) ? 16 : 0));
-        size += (deadBytes == null ? 0 : deadBytes.limit());
-        return size;
-    }
-
-    @DoNotParseDetail
-    public String getType() {
-        return type;
-    }
-
-    @DoNotParseDetail
-    public byte[] getUserType() {
-        return userType;
-    }
-
-    @DoNotParseDetail
-    public ContainerBox getParent() {
-        return parent;
-    }
-
-    @DoNotParseDetail
-    public void setParent(ContainerBox parent) {
-        this.parent = parent;
-    }
-
-    @DoNotParseDetail
-    public IsoFile getIsoFile() {
-        return parent.getIsoFile();
-    }
-
-    /**
-     * Check if details are parsed.
-     *
-     * @return <code>true</code> whenever the content <code>ByteBuffer</code> is not <code>null</code>
-     */
-    public boolean isParsed() {
-        return content == null;
-    }
-
-
-    /**
-     * Verifies that a box can be reconstructed byte-exact after parsing.
-     *
-     * @param content the raw content of the box
-     * @return <code>true</code> if raw content exactly matches the reconstructed content
-     */
-    private boolean verify(ByteBuffer content) {
-        ByteBuffer bb = ByteBuffer.allocate(l2i(getContentSize() + (deadBytes != null ? deadBytes.limit() : 0)));
-        getContent(bb);
-        if (deadBytes != null) {
-            deadBytes.rewind();
-            while (deadBytes.remaining() > 0) {
-                bb.put(deadBytes);
-            }
-        }
-        content.rewind();
-        bb.rewind();
-
-
-        if (content.remaining() != bb.remaining()) {
-            LOG.severe(this.getType() + ": remaining differs " + content.remaining() + " vs. " + bb.remaining());
-            return false;
-        }
-        int p = content.position();
-        for (int i = content.limit() - 1, j = bb.limit() - 1; i >= p; i--, j--) {
-            byte v1 = content.get(i);
-            byte v2 = bb.get(j);
-            if (v1 != v2) {
-                LOG.severe(String.format("%s: buffers differ at %d: %2X/%2X", this.getType(), i, v1, v2));
-                byte[] b1 = new byte[content.remaining()];
-                byte[] b2 = new byte[bb.remaining()];
-                content.get(b1);
-                bb.get(b2);
-                System.err.println("original      : " + Hex.encodeHex(b1, 4));
-                System.err.println("reconstructed : " + Hex.encodeHex(b2, 4));
-                return false;
-            }
-        }
-        return true;
-
-    }
-
-    private boolean isSmallBox() {
-        return (content == null ? (getContentSize() + (deadBytes != null ? deadBytes.limit() : 0) + 8) : content.limit()) < 1L << 32;
-    }
-
-    private void getHeader(ByteBuffer byteBuffer) {
-        if (isSmallBox()) {
-            IsoTypeWriter.writeUInt32(byteBuffer, this.getSize());
-            byteBuffer.put(IsoFile.fourCCtoBytes(getType()));
-        } else {
-            IsoTypeWriter.writeUInt32(byteBuffer, 1);
-            byteBuffer.put(IsoFile.fourCCtoBytes(getType()));
-            IsoTypeWriter.writeUInt64(byteBuffer, getSize());
-        }
-        if (UserBox.TYPE.equals(getType())) {
-            byteBuffer.put(getUserType());
-        }
-
-
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/AbstractContainerBox.java b/android/src/main/java/com/googlecode/mp4parser/AbstractContainerBox.java
deleted file mode 100755
index e1f944f..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/AbstractContainerBox.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.googlecode.mp4parser;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.googlecode.mp4parser.util.ByteBufferByteChannel;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.logging.Logger;
-
-
-/**
- * Abstract base class suitable for most boxes acting purely as container for other boxes.
- */
-public abstract class AbstractContainerBox extends AbstractBox implements ContainerBox {
-    private static Logger LOG = Logger.getLogger(AbstractContainerBox.class.getName());
-
-    protected List<Box> boxes = new LinkedList<Box>();
-    protected BoxParser boxParser;
-
-    @Override
-    protected long getContentSize() {
-        long contentSize = 0;
-        for (Box boxe : boxes) {
-            contentSize += boxe.getSize();
-        }
-        return contentSize;
-    }
-
-    public AbstractContainerBox(String type) {
-        super(type);
-    }
-
-    public List<Box> getBoxes() {
-        return Collections.unmodifiableList(boxes);
-    }
-
-    public void setBoxes(List<Box> boxes) {
-        this.boxes = new LinkedList<Box>(boxes);
-    }
-
-    @SuppressWarnings("unchecked")
-    public <T extends Box> List<T> getBoxes(Class<T> clazz) {
-        return getBoxes(clazz, false);
-    }
-
-    @SuppressWarnings("unchecked")
-    public <T extends Box> List<T> getBoxes(Class<T> clazz, boolean recursive) {
-        List<T> boxesToBeReturned = new ArrayList<T>(2);
-        for (Box boxe : boxes) {
-            //clazz.isInstance(boxe) / clazz == boxe.getClass()?
-            // I hereby finally decide to use isInstance
-
-            if (clazz.isInstance(boxe)) {
-                boxesToBeReturned.add((T) boxe);
-            }
-
-            if (recursive && boxe instanceof ContainerBox) {
-                boxesToBeReturned.addAll(((ContainerBox) boxe).getBoxes(clazz, recursive));
-            }
-        }
-        return boxesToBeReturned;
-    }
-
-    /**
-     * Add <code>b</code> to the container and sets the parent correctly.
-     *
-     * @param b will be added to the container
-     */
-    public void addBox(Box b) {
-        b.setParent(this);
-        boxes.add(b);
-    }
-
-    public void removeBox(Box b) {
-        b.setParent(this);
-        boxes.remove(b);
-    }
-
-    @Override
-    public void parse(ReadableByteChannel readableByteChannel, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException {
-        this.boxParser = boxParser;
-        super.parse(readableByteChannel, header, contentSize, boxParser);
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseChildBoxes(content);
-    }
-
-
-    public String toString() {
-        StringBuilder buffer = new StringBuilder();
-
-        buffer.append(this.getClass().getSimpleName()).append("[");
-        for (int i = 0; i < boxes.size(); i++) {
-            if (i > 0) {
-                buffer.append(";");
-            }
-            buffer.append(boxes.get(i).toString());
-        }
-        buffer.append("]");
-        return buffer.toString();
-    }
-
-    /**
-     * The number of bytes from box start (first length byte) to the
-     * first length byte of the first child box
-     *
-     * @return offset to first child box
-     */
-    public long getNumOfBytesToFirstChild() {
-        return 8;
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeChildBoxes(byteBuffer);
-    }
-
-    protected final void parseChildBoxes(ByteBuffer content) {
-        try {
-            while (content.remaining() >= 8) { //  8 is the minimal size for a sane box
-                boxes.add(boxParser.parseBox(new ByteBufferByteChannel(content), this));
-            }
-
-            if (content.remaining() != 0) {
-                setDeadBytes(content.slice());
-                LOG.warning("Something's wrong with the sizes. There are dead bytes in a container box.");
-            }
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    protected final void writeChildBoxes(ByteBuffer bb) {
-        WritableByteChannel wbc = new ByteBufferByteChannel(bb);
-        for (Box box : boxes) {
-            try {
-                box.getBox(wbc);
-            } catch (IOException e) {
-                // My WritableByteChannel won't throw any excpetion
-                throw new RuntimeException("Cannot happen to me", e);
-            }
-        }
-    }
-
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/AbstractFullBox.java b/android/src/main/java/com/googlecode/mp4parser/AbstractFullBox.java
deleted file mode 100755
index bec8975..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/AbstractFullBox.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.googlecode.mp4parser;
-
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.boxes.FullBox;
-
-import java.nio.ByteBuffer;
-
-/**
- * Base class for all ISO Full boxes.
- */
-public abstract class AbstractFullBox extends AbstractBox implements FullBox {
-    private int version;
-    private int flags;
-
-    protected AbstractFullBox(String type) {
-        super(type);
-    }
-
-    protected AbstractFullBox(String type, byte[] userType) {
-        super(type, userType);
-    }
-
-    public int getVersion() {
-        return version;
-    }
-
-    public void setVersion(int version) {
-        this.version = version;
-    }
-
-    public int getFlags() {
-        return flags;
-    }
-
-    public void setFlags(int flags) {
-        this.flags = flags;
-    }
-
-
-    /**
-     * Parses the version/flags header and returns the remaining box size.
-     *
-     * @param content
-     * @return number of bytes read
-     */
-    protected final long parseVersionAndFlags(ByteBuffer content) {
-        version = IsoTypeReader.readUInt8(content);
-        flags = IsoTypeReader.readUInt24(content);
-        return 4;
-    }
-
-    protected final void writeVersionAndFlags(ByteBuffer bb) {
-        IsoTypeWriter.writeUInt8(bb, version);
-        IsoTypeWriter.writeUInt24(bb, flags);
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/FullContainerBox.java b/android/src/main/java/com/googlecode/mp4parser/FullContainerBox.java
deleted file mode 100755
index 30eb285..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/FullContainerBox.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/*  
- * Copyright 2008 CoreMedia AG, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License); 
- * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at 
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software 
- * distributed under the License is distributed on an AS IS BASIS, 
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-package com.googlecode.mp4parser;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.googlecode.mp4parser.util.ByteBufferByteChannel;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.logging.Logger;
-
-/**
- * Abstract base class for a full iso box only containing ither boxes.
- */
-public abstract class FullContainerBox extends AbstractFullBox implements ContainerBox {
-    protected List<Box> boxes = new LinkedList<Box>();
-    private static Logger LOG = Logger.getLogger(FullContainerBox.class.getName());
-    BoxParser boxParser;
-
-    public void setBoxes(List<Box> boxes) {
-        this.boxes = new LinkedList<Box>(boxes);
-    }
-
-    @SuppressWarnings("unchecked")
-    public <T extends Box> List<T> getBoxes(Class<T> clazz) {
-        return getBoxes(clazz, false);
-    }
-
-    @SuppressWarnings("unchecked")
-    public <T extends Box> List<T> getBoxes(Class<T> clazz, boolean recursive) {
-        List<T> boxesToBeReturned = new ArrayList<T>(2);
-        for (Box boxe : boxes) { //clazz.isInstance(boxe) / clazz == boxe.getClass()?
-            if (clazz == boxe.getClass()) {
-                boxesToBeReturned.add((T) boxe);
-            }
-
-            if (recursive && boxe instanceof ContainerBox) {
-                boxesToBeReturned.addAll((((ContainerBox) boxe).getBoxes(clazz, recursive)));
-            }
-        }
-        // Optimize here! Spare object creation work on arrays directly! System.arrayCopy
-        return boxesToBeReturned;
-        //return (T[]) boxesToBeReturned.toArray();
-    }
-
-    protected long getContentSize() {
-        long contentSize = 4; // flags and version
-        for (Box boxe : boxes) {
-            contentSize += boxe.getSize();
-        }
-        return contentSize;
-    }
-
-    public void addBox(Box b) {
-        b.setParent(this);
-        boxes.add(b);
-    }
-
-    public void removeBox(Box b) {
-        b.setParent(null);
-        boxes.remove(b);
-    }
-
-    public FullContainerBox(String type) {
-        super(type);
-    }
-
-    public List<Box> getBoxes() {
-        return boxes;
-    }
-
-    @Override
-    public void parse(ReadableByteChannel readableByteChannel, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException {
-        this.boxParser = boxParser;
-        super.parse(readableByteChannel, header, contentSize, boxParser);
-
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        parseChildBoxes(content);
-    }
-
-    protected final void parseChildBoxes(ByteBuffer content) {
-        try {
-            while (content.remaining() >= 8) { //  8 is the minimal size for a sane box
-                boxes.add(boxParser.parseBox(new ByteBufferByteChannel(content), this));
-            }
-
-            if (content.remaining() != 0) {
-                setDeadBytes(content.slice());
-                LOG.severe("Some sizes are wrong");
-            }
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public String toString() {
-        StringBuilder buffer = new StringBuilder();
-        buffer.append(this.getClass().getSimpleName()).append("[");
-        for (int i = 0; i < boxes.size(); i++) {
-            if (i > 0) {
-                buffer.append(";");
-            }
-            buffer.append(boxes.get(i).toString());
-        }
-        buffer.append("]");
-        return buffer.toString();
-    }
-
-
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        writeChildBoxes(byteBuffer);
-    }
-
-    protected final void writeChildBoxes(ByteBuffer bb) {
-        WritableByteChannel wbc = new ByteBufferByteChannel(bb);
-        for (Box box : boxes) {
-            try {
-                box.getBox(wbc);
-            } catch (IOException e) {
-                // cannot happen since my WritableByteChannel won't throw any excpetion
-                throw new RuntimeException("Cannot happen.", e);
-            }
-
-        }
-    }
-
-    public long getNumOfBytesToFirstChild() {
-        long sizeOfChildren = 0;
-        for (Box box : boxes) {
-            sizeOfChildren += box.getSize();
-        }
-        return getSize() - sizeOfChildren;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/Version.java b/android/src/main/java/com/googlecode/mp4parser/Version.java
deleted file mode 100755
index f93816f..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/Version.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.googlecode.mp4parser;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.util.logging.Logger;
-
-/**
- * The classic version object.
- */
-public class Version {
-    private static final Logger LOG = Logger.getLogger(Version.class.getName());
-    public static final String VERSION;
-
-    static {
-        LineNumberReader lnr = new LineNumberReader(new InputStreamReader(Version.class.getResourceAsStream("/version.txt")));
-        String version;
-        try {
-            version = lnr.readLine();
-        } catch (IOException e) {
-            LOG.warning(e.getMessage());
-            version = "unknown";
-        }
-        VERSION = version;
-
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/annotations/DoNotParseDetail.java b/android/src/main/java/com/googlecode/mp4parser/annotations/DoNotParseDetail.java
deleted file mode 100755
index c08460f..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/annotations/DoNotParseDetail.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.annotations;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Inherited;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- *
- */
-
-@Target({ElementType.METHOD, ElementType.TYPE})
-@Retention(RetentionPolicy.RUNTIME)
-@Inherited
-@Documented
-/**
- * Mark a method with this annotation to prevent triggering the call of
- * <code>AbstractBox#parseDetails()</code> before actually executing the
- * method.
- * @see com.googlecode.mp4parser.RequiresParseDetailAspect
- */
-public @interface DoNotParseDetail {
-}
-
-
diff --git a/android/src/main/java/com/googlecode/mp4parser/annotations/ParseDetail.java b/android/src/main/java/com/googlecode/mp4parser/annotations/ParseDetail.java
deleted file mode 100755
index 7b66d53..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/annotations/ParseDetail.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.googlecode.mp4parser.annotations;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Inherited;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-
-@Target({ElementType.METHOD, ElementType.TYPE})
-@Retention(RetentionPolicy.RUNTIME)
-@Inherited
-@Documented
-public @interface ParseDetail {
-}
-
-
diff --git a/android/src/main/java/com/googlecode/mp4parser/authoring/DateHelper.java b/android/src/main/java/com/googlecode/mp4parser/authoring/DateHelper.java
deleted file mode 100755
index 0252859..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/authoring/DateHelper.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.googlecode.mp4parser.authoring;
-
-import java.util.Date;
-
-/**
- * Converts ISO Dates (seconds since 1/1/1904) to Date and vice versa.
- */
-public class DateHelper {
-    /**
-     * Converts a long value with seconds since 1/1/1904 to Date.
-     *
-     * @param secondsSince seconds since 1/1/1904
-     * @return date the corresponding <code>Date</code>
-     */
-    static public Date convert(long secondsSince) {
-        return new Date((secondsSince - 2082844800L) * 1000L);
-    }
-
-
-    /**
-     * Converts a date as long to a mac date as long
-     *
-     * @param date date to convert
-     * @return date in mac format
-     */
-    static public long convert(Date date) {
-        return (date.getTime() / 1000L) + 2082844800L;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/AbstractDescriptorBox.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/AbstractDescriptorBox.java
deleted file mode 100755
index d223e40..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/AbstractDescriptorBox.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4;
-
-import com.googlecode.mp4parser.AbstractFullBox;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.BaseDescriptor;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.ObjectDescriptorFactory;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-/**
- * ES Descriptor Box.
- */
-public class AbstractDescriptorBox extends AbstractFullBox {
-    private static Logger log = Logger.getLogger(AbstractDescriptorBox.class.getName());
-
-
-    public BaseDescriptor descriptor;
-    public ByteBuffer data;
-
-    public AbstractDescriptorBox(String type) {
-        super(type);
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        data.rewind(); // has been fforwarded by parsing
-        byteBuffer.put(data);
-    }
-
-    @Override
-    protected long getContentSize() {
-        return 4 + data.limit();
-    }
-
-    public BaseDescriptor getDescriptor() {
-        return descriptor;
-    }
-
-    public void setDescriptor(BaseDescriptor descriptor) {
-        this.descriptor = descriptor;
-    }
-
-    public String getDescriptorAsString() {
-        return descriptor.toString();
-    }
-
-    public void setData(ByteBuffer data) {
-        this.data = data;
-    }
-
-    @Override
-    public void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        data = content.slice();
-        content.position(content.position() + content.remaining());
-        try {
-            data.rewind();
-            descriptor = ObjectDescriptorFactory.createFrom(-1, data);
-        } catch (IOException e) {
-            log.log(Level.WARNING, "Error parsing ObjectDescriptor", e);
-            //that's why we copied it ;)
-        } catch (IndexOutOfBoundsException e) {
-            log.log(Level.WARNING, "Error parsing ObjectDescriptor", e);
-            //that's why we copied it ;)
-        }
-
-    }
-
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/ESDescriptorBox.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/ESDescriptorBox.java
deleted file mode 100755
index 102f268..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/ESDescriptorBox.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4;
-
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.ESDescriptor;
-
-/**
- * ES Descriptor Box.
- */
-public class ESDescriptorBox extends AbstractDescriptorBox {
-    public static final String TYPE = "esds";
-
-    public ESDescriptorBox() {
-        super(TYPE);
-    }
-
-    public ESDescriptor getEsDescriptor() {
-        return (ESDescriptor) super.getDescriptor();
-    }
-
-    public void setEsDescriptor(ESDescriptor esDescriptor) {
-        super.setDescriptor(esDescriptor);
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/ObjectDescriptorBox.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/ObjectDescriptorBox.java
deleted file mode 100755
index c9e7493..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/ObjectDescriptorBox.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright 2011 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4;
-
-/**
- * This object contains an Object Descriptor or an Initial Object Descriptor.
- * There are a number of possible file types based on usage, depending on the descriptor:
- * <ul>
- * <li>Presentation, contains IOD which contains a BIFS stream (MP4 file);
- * <li>Sub-part of a presentation, contains an IOD without a BIFS stream (MP4 file);</li>
- * <li>Sub-part of a presentation, contains an OD (MP4 file);</li>
- * <li>Free-form file, referenced by MP4 data references (free-format);</li>
- * <li>Sub-part of a presentation, referenced by an ES URL.</li>
- * </ul>
- * NOTE: <br/>
- * The first three are MP4 files, a file referenced by a data reference is not necessarily an MP4 file, as it is
- * free-format. Files referenced by ES URLs, by data references, or intended as input to an editing process, need not have
- * an Object Descriptor Box. <br/>
- * An OD URL may point to an MP4 file. Implicitly, the target of such a URL is the OD/IOD located in the 'iods'
- * atom in that file.</br/>
- * If an MP4 file contains several object descriptors, only the OD/IOD in the 'iods' atom can be addressed using
- * an OD URL from a remote MPEG-4 presentation.
- */
-public class ObjectDescriptorBox extends AbstractDescriptorBox {
-    public static final String TYPE = "iods";
-
-    public ObjectDescriptorBox() {
-        super(TYPE);
-    }
-
-
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/AudioSpecificConfig.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/AudioSpecificConfig.java
deleted file mode 100755
index 86e319e..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/AudioSpecificConfig.java
+++ /dev/null
@@ -1,1176 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.Hex;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-
-
-//
-//GetAudioObjectType()
-//{
-//audioObjectType; 5 uimsbf
-//if (audioObjectType == 31) {
-//audioObjectType = 32 + audioObjectTypeExt; 6 uimsbf
-//}
-//return audioObjectType;
-//}
-//AudioSpecificConfig ()
-//{
-//audioObjectType = GetAudioObjectType();
-//samplingFrequencyIndex; 4 bslbf
-//if ( samplingFrequencyIndex == 0xf ) {
-//samplingFrequency; 24 uimsbf
-//}
-//channelConfiguration; 4 bslbf
-//sbrPresentFlag = -1;
-//psPresentFlag = -1;
-//if ( audioObjectType == 5 ||
-//audioObjectType == 29 ) {
-//extensionAudioObjectType = 5;
-//sbrPresentFlag = 1;
-//if ( audioObjectType == 29 ) {
-//psPresentFlag = 1;
-//}
-//extensionSamplingFrequencyIndex; 4 uimsbf
-//if ( extensionSamplingFrequencyIndex == 0xf )
-//extensionSamplingFrequency; 24 uimsbf
-//audioObjectType = GetAudioObjectType();
-//if ( audioObjectType == 22 )
-//extensionChannelConfiguration; 4 uimsbf
-//}
-//else {
-//extensionAudioObjectType = 0;
-//}
-//switch (audioObjectType) {
-//case 1:
-//case 2:
-//case 3:
-//case 4:
-//case 6:
-//case 7:
-//case 17:
-//case 19:
-//case 20:
-//case 21:
-//case 22:
-//case 23:
-//GASpecificConfig();
-//break:
-//case 8:
-//CelpSpecificConfig();
-//break;
-//case 9:
-//HvxcSpecificConfig();
-//break:
-//case 12:
-//TTSSpecificConfig();
-//break;
-//case 13:
-//case 14:
-//case 15:
-//case 16:
-//StructuredAudioSpecificConfig();
-//break;
-//case 24:
-//ErrorResilientCelpSpecificConfig();
-//break;
-//case 25:
-//ErrorResilientHvxcSpecificConfig();
-//break;
-//case 26:
-//case 27:
-//ParametricSpecificConfig();
-//break;
-// case 28:
-//SSCSpecificConfig();
-//break;
-//case 30:
-//sacPayloadEmbedding; 1 uimsbf
-//SpatialSpecificConfig();
-//break;
-//case 32:
-//case 33:
-//case 34:
-//MPEG_1_2_SpecificConfig();
-//break;
-//case 35:
-//DSTSpecificConfig();
-//break;
-//case 36:
-//fillBits; 5 bslbf
-//ALSSpecificConfig();
-//break;
-//case 37:
-//case 38:
-//SLSSpecificConfig();
-//break;
-//case 39:
-//ELDSpecificConfig(channelConfiguration);
-//break:
-//case 40:
-//case 41:
-//SymbolicMusicSpecificConfig();
-//break;
-//default:
-///* reserved */
-//}
-//switch (audioObjectType) {
-//case 17:
-//case 19:
-//case 20:
-//case 21:
-//case 22:
-//case 23:
-//case 24:
-//case 25:
-//case 26:
-//case 27:
-//case 39:
-//epConfig; 2 bslbf
-//if ( epConfig == 2 || epConfig == 3 ) {
-//ErrorProtectionSpecificConfig();
-//}
-//if ( epConfig == 3 ) {
-//directMapping; 1 bslbf
-//if ( ! directMapping ) {
-///* tbd */
-//}
-//}
-//}
-//if ( extensionAudioObjectType != 5 && bits_to_decode() >= 16 ) {
-//syncExtensionType; 11 bslbf
-//if (syncExtensionType == 0x2b7) {
-//        extensionAudioObjectType = GetAudioObjectType();
-//if ( extensionAudioObjectType == 5 ) {
-//sbrPresentFlag; 1 uimsbf
-//if (sbrPresentFlag == 1) {
-//extensionSamplingFrequencyIndex; 4 uimsbf
-//if ( extensionSamplingFrequencyIndex == 0xf ) {
-//extensionSamplingFrequency; 24 uimsbf
-//}
-//if ( bits_to_decode() >= 12 ) {
-//syncExtensionType; 11 bslbf
-//if (syncExtesionType == 0x548) {
-//psPresentFlag; 1 uimsbf
-//}
-//}
-//}
-//}
-//if ( extensionAudioObjectType == 22 ) {
-//sbrPresentFlag; 1 uimsbf
-//if (sbrPresentFlag == 1) {
-//extensionSamplingFrequencyIndex; 4 uimsbf
-//if ( extensionSamplingFrequencyIndex == 0xf ) {
-//extensionSamplingFrequency; 24 uimsbf
-//}
-//}
-//extensionChannelConfiguration; 4 uimsbf
-//}
-//}
-//}
-//}
-//        }
-//
-// TFCodingType
-//0x0 AAC scaleable
-//0x1 BSAC
-//0x2 TwinVQ
-//0x3 AAC non scaleable (i.e. multichannel)
-//
-// class TFSpecificConfig( uint(4) samplingFrequencyIndex, uint(4) channelConfiguration ) {
-//uint(2) TFCodingType;
-//uint(1) frameLength;
-//uint(1) dependsOnCoreCoder;
-//if (dependsOnCoreCoder == 1){
-//uint(14)coreCoderDelay
-//}
-//if (TFCodingType==BSAC) {
-//uint(11) lslayer_length
-//}
-//uint (1) extensionFlag;
-//if (channelConfiguration == 0 ){
-//program_config_element();
-//}
-//if (extensionFlag==1){
-//<to be defined in mpeg4 phase 2>
-//}
-//}
-//
-//program_config_element()
-//{
-//element_instance_tag 4 uimsbf
-//profile 2 uimsbf
-//sampling_frequency_index 4 uimsbf
-//num_front_channel_elements 4 uimsbf
-//num_side_channel_elements 4 uimsbf
-//num_back_channel_elements 4 uimsbf
-// num_lfe_channel_elements 2 uimsbf
-//num_assoc_data_elements 3 uimsbf
-//num_valid_cc_elements 4 uimsbf
-//mono_mixdown_present 1 uimsbf
-//if ( mono_mixdown_present == 1 )
-//mono_mixdown_element_number 4 uimsbf
-//stereo_mixdown_present 1 uimsbf
-//if ( stereo_mixdown_present == 1 )
-//stereo_mixdown_element_number 4 uimsbf
-//matrix_mixdown_idx_present 1 uimsbf
-//if ( matrix_mixdown_idx_present == 1 ) {
-//matrix_mixdown_idx 2 uimsbf
-//pseudo_surround_enable 1 uimsbf
-//}
-//for ( i = 0; i < num_front_channel_elements; i++) {
-//front_element_is_cpe[i]; 1 bslbf
-//front_element_tag_select[i]; 4 uimsbf
-//}
-//for ( i = 0; i < num_side_channel_elements; i++) {
-//side_element_is_cpe[i]; 1 bslbf
-//side_element_tag_select[i]; 4 uimsbf
-//}
-//for ( i = 0; i < num_back_channel_elements; i++) {
-//back_element_is_cpe[i]; 1 bslbf
-//back_element_tag_select[i]; 4 uimsbf
-//}
-//for ( i = 0; i < num_lfe_channel_elements; i++)
-//lfe_element_tag_select[i]; 4 uimsbf
-//for ( i = 0; i < num_assoc_data_elements; i++)
-//assoc_data_element_tag_select[i]; 4 uimsbf
-//for ( i = 0; i < num_valid_cc_elements; i++) {
-//cc_element_is_ind_sw[i]; 1 uimsbf
-//valid_cc_element_tag_select[i]; 4 uimsbf
-//}
-//byte_alignment()
-//comment_field_bytes 8 uimsbf
-//for ( i = 0; i < comment_field_bytes; i++)
-//comment_field_data[i]; 8 uimsbf
-//}
-
-@Descriptor(tags = 0x5, objectTypeIndication = 0x40)
-public class AudioSpecificConfig extends BaseDescriptor {
-    byte[] configBytes;
-
-    public static Map<Integer, Integer> samplingFrequencyIndexMap = new HashMap<Integer, Integer>();
-    public static Map<Integer, String> audioObjectTypeMap = new HashMap<Integer, String>();
-    int audioObjectType;
-    int samplingFrequencyIndex;
-    int samplingFrequency;
-    int channelConfiguration;
-    int extensionAudioObjectType;
-    int sbrPresentFlag;
-    int psPresentFlag;
-    int extensionSamplingFrequencyIndex;
-    int extensionSamplingFrequency;
-    int extensionChannelConfiguration;
-    int sacPayloadEmbedding;
-    int fillBits;
-    int epConfig;
-    int directMapping;
-    int syncExtensionType;
-
-    //GASpecificConfig
-    int frameLengthFlag;
-    int dependsOnCoreCoder;
-    int coreCoderDelay;
-    int extensionFlag;
-    int layerNr;
-    int numOfSubFrame;
-    int layer_length;
-    int aacSectionDataResilienceFlag;
-    int aacScalefactorDataResilienceFlag;
-    int aacSpectralDataResilienceFlag;
-    int extensionFlag3;
-    boolean gaSpecificConfig;
-
-    //ParametricSpecificConfig
-    int isBaseLayer;
-    int paraMode;
-    int paraExtensionFlag;
-    int hvxcVarMode;
-    int hvxcRateMode;
-    int erHvxcExtensionFlag;
-    int var_ScalableFlag;
-    int hilnQuantMode;
-    int hilnMaxNumLine;
-    int hilnSampleRateCode;
-    int hilnFrameLength;
-    int hilnContMode;
-    int hilnEnhaLayer;
-    int hilnEnhaQuantMode;
-    boolean parametricSpecificConfig;
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        ByteBuffer configBytes = bb.slice();
-        configBytes.limit(sizeOfInstance);
-        bb.position(bb.position() + sizeOfInstance);
-
-        //copy original bytes to internal array for constructing codec config strings (todo until writing of the config is supported)
-        this.configBytes = new byte[sizeOfInstance];
-        configBytes.get(this.configBytes);
-        configBytes.rewind();
-
-        BitReaderBuffer bitReaderBuffer = new BitReaderBuffer(configBytes);
-        audioObjectType = getAudioObjectType(bitReaderBuffer);
-        samplingFrequencyIndex = bitReaderBuffer.readBits(4);
-
-        if (samplingFrequencyIndex == 0xf) {
-            samplingFrequency = bitReaderBuffer.readBits(24);
-        }
-
-        channelConfiguration = bitReaderBuffer.readBits(4);
-
-        if (audioObjectType == 5 ||
-                audioObjectType == 29) {
-            extensionAudioObjectType = 5;
-            sbrPresentFlag = 1;
-            if (audioObjectType == 29) {
-                psPresentFlag = 1;
-            }
-            extensionSamplingFrequencyIndex = bitReaderBuffer.readBits(4);
-            if (extensionSamplingFrequencyIndex == 0xf)
-                extensionSamplingFrequency = bitReaderBuffer.readBits(24);
-            audioObjectType = getAudioObjectType(bitReaderBuffer);
-            if (audioObjectType == 22)
-                extensionChannelConfiguration = bitReaderBuffer.readBits(4);
-        } else {
-            extensionAudioObjectType = 0;
-        }
-
-        switch (audioObjectType) {
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 6:
-            case 7:
-            case 17:
-            case 19:
-            case 20:
-            case 21:
-            case 22:
-            case 23:
-                parseGaSpecificConfig(samplingFrequencyIndex, channelConfiguration, audioObjectType, bitReaderBuffer);
-                //GASpecificConfig();
-                break;
-            case 8:
-                throw new UnsupportedOperationException("can't parse CelpSpecificConfig yet");
-                //CelpSpecificConfig();
-                //break;
-            case 9:
-                throw new UnsupportedOperationException("can't parse HvxcSpecificConfig yet");
-                //HvxcSpecificConfig();
-                //break;
-            case 12:
-                throw new UnsupportedOperationException("can't parse TTSSpecificConfig yet");
-                //TTSSpecificConfig();
-                //break;
-            case 13:
-            case 14:
-            case 15:
-            case 16:
-                throw new UnsupportedOperationException("can't parse StructuredAudioSpecificConfig yet");
-                //StructuredAudioSpecificConfig();
-                //break;
-            case 24:
-                throw new UnsupportedOperationException("can't parse ErrorResilientCelpSpecificConfig yet");
-                //ErrorResilientCelpSpecificConfig();
-                //break;
-            case 25:
-                throw new UnsupportedOperationException("can't parse ErrorResilientHvxcSpecificConfig yet");
-                //ErrorResilientHvxcSpecificConfig();
-                //break;
-            case 26:
-            case 27:
-                parseParametricSpecificConfig(samplingFrequencyIndex, channelConfiguration, audioObjectType, bitReaderBuffer);
-                //ParametricSpecificConfig();
-                break;
-            case 28:
-                throw new UnsupportedOperationException("can't parse SSCSpecificConfig yet");
-                //SSCSpecificConfig();
-                //break;
-            case 30:
-                sacPayloadEmbedding = bitReaderBuffer.readBits(1);
-                throw new UnsupportedOperationException("can't parse SpatialSpecificConfig yet");
-                //SpatialSpecificConfig();
-                //break;
-            case 32:
-            case 33:
-            case 34:
-                throw new UnsupportedOperationException("can't parse MPEG_1_2_SpecificConfig yet");
-                //MPEG_1_2_SpecificConfig();
-                //break;
-            case 35:
-                throw new UnsupportedOperationException("can't parse DSTSpecificConfig yet");
-                //DSTSpecificConfig();
-                //break;
-            case 36:
-                fillBits = bitReaderBuffer.readBits(5);
-                throw new UnsupportedOperationException("can't parse ALSSpecificConfig yet");
-                //ALSSpecificConfig();
-                //break;
-            case 37:
-            case 38:
-                throw new UnsupportedOperationException("can't parse SLSSpecificConfig yet");
-                //SLSSpecificConfig();
-                //break;
-            case 39:
-                throw new UnsupportedOperationException("can't parse ELDSpecificConfig yet");
-                //ELDSpecificConfig(channelConfiguration);
-                //break;
-            case 40:
-            case 41:
-                throw new UnsupportedOperationException("can't parse SymbolicMusicSpecificConfig yet");
-                //SymbolicMusicSpecificConfig();
-                //break;
-            default:
-                /* reserved */
-        }
-
-        switch (audioObjectType) {
-            case 17:
-            case 19:
-            case 20:
-            case 21:
-            case 22:
-            case 23:
-            case 24:
-            case 25:
-            case 26:
-            case 27:
-            case 39:
-                epConfig = bitReaderBuffer.readBits(2);
-                if (epConfig == 2 || epConfig == 3) {
-                    throw new UnsupportedOperationException("can't parse ErrorProtectionSpecificConfig yet");
-                    //ErrorProtectionSpecificConfig();
-                }
-                if (epConfig == 3) {
-                    directMapping = bitReaderBuffer.readBits(1);
-                    if (directMapping == 0) {
-                        /* tbd */
-                        throw new RuntimeException("not implemented");
-                    }
-                }
-        }
-
-        if (extensionAudioObjectType != 5 && bitReaderBuffer.remainingBits() >= 16) {
-            syncExtensionType = bitReaderBuffer.readBits(11);
-            if (syncExtensionType == 0x2b7) {
-                extensionAudioObjectType = getAudioObjectType(bitReaderBuffer);
-                if (extensionAudioObjectType == 5) {
-                    sbrPresentFlag = bitReaderBuffer.readBits(1);
-                    if (sbrPresentFlag == 1) {
-                        extensionSamplingFrequencyIndex = bitReaderBuffer.readBits(4);
-                        if (extensionSamplingFrequencyIndex == 0xf) {
-                            extensionSamplingFrequency = bitReaderBuffer.readBits(24);
-                        }
-                        if (bitReaderBuffer.remainingBits() >= 12) {
-                            syncExtensionType = bitReaderBuffer.readBits(11); //10101001000
-                            if (syncExtensionType == 0x548) {
-                                psPresentFlag = bitReaderBuffer.readBits(1);
-                            }
-                        }
-                    }
-                }
-                if (extensionAudioObjectType == 22) {
-                    sbrPresentFlag = bitReaderBuffer.readBits(1);
-                    if (sbrPresentFlag == 1) {
-                        extensionSamplingFrequencyIndex = bitReaderBuffer.readBits(4);
-                        if (extensionSamplingFrequencyIndex == 0xf) {
-                            extensionSamplingFrequency = bitReaderBuffer.readBits(24);
-                        }
-                    }
-                    extensionChannelConfiguration = bitReaderBuffer.readBits(4);
-                }
-            }
-        }
-    }
-
-    private int gaSpecificConfigSize() {
-        return 0;
-    }
-
-    public int serializedSize() {
-        int out = 4;
-        if (audioObjectType == 2) {
-            out += gaSpecificConfigSize();
-        } else {
-            throw new UnsupportedOperationException("can't serialize that yet");
-        }
-        return out;
-    }
-
-    public ByteBuffer serialize() {
-        ByteBuffer out = ByteBuffer.allocate(serializedSize());
-        IsoTypeWriter.writeUInt8(out, 5);
-        IsoTypeWriter.writeUInt8(out, serializedSize() - 2);
-        BitWriterBuffer bwb = new BitWriterBuffer(out);
-        bwb.writeBits(audioObjectType, 5);
-        bwb.writeBits(samplingFrequencyIndex, 4);
-        if (samplingFrequencyIndex == 0xf) {
-            throw new UnsupportedOperationException("can't serialize that yet");
-        }
-        bwb.writeBits(channelConfiguration, 4);
-
-        // Don't support any extensions, unusual GASpecificConfig other than the default or anything...
-
-        return out;
-    }
-
-    private int getAudioObjectType(BitReaderBuffer in) throws IOException {
-        int audioObjectType = in.readBits(5);
-        if (audioObjectType == 31) {
-            audioObjectType = 32 + in.readBits(6);
-        }
-        return audioObjectType;
-    }
-
-    private void parseGaSpecificConfig(int samplingFrequencyIndex, int channelConfiguration, int audioObjectType, BitReaderBuffer in) throws IOException {
-//    GASpecificConfig (samplingFrequencyIndex,
-//            channelConfiguration,
-//            audioObjectType)
-//    {
-        frameLengthFlag = in.readBits(1);
-        dependsOnCoreCoder = in.readBits(1);
-        if (dependsOnCoreCoder == 1) {
-            coreCoderDelay = in.readBits(14);
-        }
-        extensionFlag = in.readBits(1);
-        if (channelConfiguration == 0) {
-            throw new UnsupportedOperationException("can't parse program_config_element yet");
-            //program_config_element ();
-        }
-        if ((audioObjectType == 6) || (audioObjectType == 20)) {
-            layerNr = in.readBits(3);
-        }
-        if (extensionFlag == 1) {
-            if (audioObjectType == 22) {
-                numOfSubFrame = in.readBits(5);
-                layer_length = in.readBits(11);
-            }
-            if (audioObjectType == 17 || audioObjectType == 19 ||
-                    audioObjectType == 20 || audioObjectType == 23) {
-                aacSectionDataResilienceFlag = in.readBits(1);
-                aacScalefactorDataResilienceFlag = in.readBits(1);
-                aacSpectralDataResilienceFlag = in.readBits(1);
-            }
-            extensionFlag3 = in.readBits(1);
-            if (extensionFlag3 == 1) {
-                /* tbd in version 3 */
-            }
-        }
-//    }
-        gaSpecificConfig = true;
-    }
-
-    private void parseParametricSpecificConfig(int samplingFrequencyIndex, int channelConfiguration, int audioObjectType, BitReaderBuffer in) throws IOException {
-        /*
-        ParametricSpecificConfig() {
-            isBaseLayer; 1 uimsbf
-            if (isBaseLayer) {
-                PARAconfig();
-            } else {
-                HILNenexConfig();
-            }
-        }
-        */
-        isBaseLayer = in.readBits(1);
-        if (isBaseLayer == 1) {
-            parseParaConfig(samplingFrequencyIndex, channelConfiguration, audioObjectType, in);
-        } else {
-            parseHilnEnexConfig(samplingFrequencyIndex, channelConfiguration, audioObjectType, in);
-        }
-    }
-
-    private void parseParaConfig(int samplingFrequencyIndex, int channelConfiguration, int audioObjectType, BitReaderBuffer in) throws IOException {
-        /*
-        PARAconfig()
-        {
-            PARAmode; 2 uimsbf
-            if (PARAmode != 1) {
-                ErHVXCconfig();
-            }
-            if (PARAmode != 0) {
-                HILNconfig();
-            }
-            PARAextensionFlag; 1 uimsbf
-            if (PARAextensionFlag) {
-                // to be defined in MPEG-4 Phase 3
-            }
-        }
-        */
-        paraMode = in.readBits(2);
-
-        if (paraMode != 1) {
-            parseErHvxcConfig(samplingFrequencyIndex, channelConfiguration, audioObjectType, in);
-        }
-        if (paraMode != 0) {
-            parseHilnConfig(samplingFrequencyIndex, channelConfiguration, audioObjectType, in);
-        }
-
-        paraExtensionFlag = in.readBits(1);
-        parametricSpecificConfig = true;
-    }
-
-    private void parseErHvxcConfig(int samplingFrequencyIndex, int channelConfiguration, int audioObjectType, BitReaderBuffer in) throws IOException {
-        /*
-        ErHVXCconfig()
-        {
-            HVXCvarMode; 1 uimsbf
-                HVXCrateMode; 2 uimsbf
-                extensionFlag; 1 uimsbf
-            if (extensionFlag) {
-                var_ScalableFlag; 1 uimsbf
-            }
-        }
-        */
-        hvxcVarMode = in.readBits(1);
-        hvxcRateMode = in.readBits(2);
-        erHvxcExtensionFlag = in.readBits(1);
-
-        if (erHvxcExtensionFlag == 1) {
-            var_ScalableFlag = in.readBits(1);
-        }
-    }
-
-    private void parseHilnConfig(int samplingFrequencyIndex, int channelConfiguration, int audioObjectType, BitReaderBuffer in) throws IOException {
-        /*
-        HILNconfig()
-        {
-            HILNquantMode; 1 uimsbf
-            HILNmaxNumLine; 8 uimsbf
-            HILNsampleRateCode; 4 uimsbf
-            HILNframeLength; 12 uimsbf
-            HILNcontMode; 2 uimsbf
-        }
-        */
-        hilnQuantMode = in.readBits(1);
-        hilnMaxNumLine = in.readBits(8);
-        hilnSampleRateCode = in.readBits(4);
-        hilnFrameLength = in.readBits(12);
-        hilnContMode = in.readBits(2);
-    }
-
-    private void parseHilnEnexConfig(int samplingFrequencyIndex, int channelConfiguration, int audioObjectType, BitReaderBuffer in) throws IOException {
-        /*
-        HILNenexConfig()
-        {
-            HILNenhaLayer; 1 uimsbf
-            if (HILNenhaLayer) {
-                HILNenhaQuantMode; 2 uimsbf
-            }
-        }
-        */
-        hilnEnhaLayer = in.readBits(1);
-        if (hilnEnhaLayer == 1) {
-            hilnEnhaQuantMode = in.readBits(2);
-        }
-    }
-
-    public byte[] getConfigBytes() {
-        return configBytes;
-    }
-
-    public int getAudioObjectType() {
-        return audioObjectType;
-    }
-
-    public int getExtensionAudioObjectType() {
-        return extensionAudioObjectType;
-    }
-
-    public int getSbrPresentFlag() {
-        return sbrPresentFlag;
-    }
-
-    public int getPsPresentFlag() {
-        return psPresentFlag;
-    }
-
-    public void setAudioObjectType(int audioObjectType) {
-        this.audioObjectType = audioObjectType;
-    }
-
-    public void setSamplingFrequencyIndex(int samplingFrequencyIndex) {
-        this.samplingFrequencyIndex = samplingFrequencyIndex;
-    }
-
-    public void setSamplingFrequency(int samplingFrequency) {
-        this.samplingFrequency = samplingFrequency;
-    }
-
-    public void setChannelConfiguration(int channelConfiguration) {
-        this.channelConfiguration = channelConfiguration;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("AudioSpecificConfig");
-        sb.append("{configBytes=").append(Hex.encodeHex(configBytes));
-        sb.append(", audioObjectType=").append(audioObjectType).append(" (").append(audioObjectTypeMap.get(audioObjectType)).append(")");
-        sb.append(", samplingFrequencyIndex=").append(samplingFrequencyIndex).append(" (").append(samplingFrequencyIndexMap.get(samplingFrequencyIndex)).append(")");
-        sb.append(", samplingFrequency=").append(samplingFrequency);
-        sb.append(", channelConfiguration=").append(channelConfiguration);
-        if (extensionAudioObjectType > 0) {
-            sb.append(", extensionAudioObjectType=").append(extensionAudioObjectType).append(" (").append(audioObjectTypeMap.get(extensionAudioObjectType)).append(")");
-            sb.append(", sbrPresentFlag=").append(sbrPresentFlag);
-            sb.append(", psPresentFlag=").append(psPresentFlag);
-            sb.append(", extensionSamplingFrequencyIndex=").append(extensionSamplingFrequencyIndex).append(" (").append(samplingFrequencyIndexMap.get(extensionSamplingFrequencyIndex)).append(")");
-            sb.append(", extensionSamplingFrequency=").append(extensionSamplingFrequency);
-            sb.append(", extensionChannelConfiguration=").append(extensionChannelConfiguration);
-        }
-//    sb.append(", sacPayloadEmbedding=").append(sacPayloadEmbedding);
-//    sb.append(", fillBits=").append(fillBits);
-//    sb.append(", epConfig=").append(epConfig);
-//    sb.append(", directMapping=").append(directMapping);
-        sb.append(", syncExtensionType=").append(syncExtensionType);
-        if (gaSpecificConfig) {
-            sb.append(", frameLengthFlag=").append(frameLengthFlag);
-            sb.append(", dependsOnCoreCoder=").append(dependsOnCoreCoder);
-            sb.append(", coreCoderDelay=").append(coreCoderDelay);
-            sb.append(", extensionFlag=").append(extensionFlag);
-            sb.append(", layerNr=").append(layerNr);
-            sb.append(", numOfSubFrame=").append(numOfSubFrame);
-            sb.append(", layer_length=").append(layer_length);
-            sb.append(", aacSectionDataResilienceFlag=").append(aacSectionDataResilienceFlag);
-            sb.append(", aacScalefactorDataResilienceFlag=").append(aacScalefactorDataResilienceFlag);
-            sb.append(", aacSpectralDataResilienceFlag=").append(aacSpectralDataResilienceFlag);
-            sb.append(", extensionFlag3=").append(extensionFlag3);
-        }
-        if (parametricSpecificConfig) {
-            sb.append(", isBaseLayer=").append(isBaseLayer);
-            sb.append(", paraMode=").append(paraMode);
-            sb.append(", paraExtensionFlag=").append(paraExtensionFlag);
-            sb.append(", hvxcVarMode=").append(hvxcVarMode);
-            sb.append(", hvxcRateMode=").append(hvxcRateMode);
-            sb.append(", erHvxcExtensionFlag=").append(erHvxcExtensionFlag);
-            sb.append(", var_ScalableFlag=").append(var_ScalableFlag);
-            sb.append(", hilnQuantMode=").append(hilnQuantMode);
-            sb.append(", hilnMaxNumLine=").append(hilnMaxNumLine);
-            sb.append(", hilnSampleRateCode=").append(hilnSampleRateCode);
-            sb.append(", hilnFrameLength=").append(hilnFrameLength);
-            sb.append(", hilnContMode=").append(hilnContMode);
-            sb.append(", hilnEnhaLayer=").append(hilnEnhaLayer);
-            sb.append(", hilnEnhaQuantMode=").append(hilnEnhaQuantMode);
-        }
-        sb.append('}');
-        return sb.toString();
-    }
-
-    static {
-        // sampling_frequency_index sampling frequeny
-//0x0 96000
-//0x1 88200
-//0x2 64000
-//0x3 48000
-//0x4 44100
-//0x5 32000
-//0x6 24000
-//0x7 22050
-//0x8 16000
-//0x9 12000
-//0xa 11025
-//0xb 8000
-//0xc reserved
-//0xd reserved
-//0xe reserved
-//0xf reserved
-        samplingFrequencyIndexMap.put(0x0, 96000);
-        samplingFrequencyIndexMap.put(0x1, 88200);
-        samplingFrequencyIndexMap.put(0x2, 64000);
-        samplingFrequencyIndexMap.put(0x3, 48000);
-        samplingFrequencyIndexMap.put(0x4, 44100);
-        samplingFrequencyIndexMap.put(0x5, 32000);
-        samplingFrequencyIndexMap.put(0x6, 24000);
-        samplingFrequencyIndexMap.put(0x7, 22050);
-        samplingFrequencyIndexMap.put(0x8, 16000);
-        samplingFrequencyIndexMap.put(0x9, 12000);
-        samplingFrequencyIndexMap.put(0xa, 11025);
-        samplingFrequencyIndexMap.put(0xb, 8000);
-
-        /* audioObjectType IDs
-          0 Null
-        1 AAC main X X
-        2 AAC LC X X X X X X X
-        3 AAC SSR X X
-        4 AAC LTP X X X X
-        5 SBR X X
-        6 AAC Scalable X X X X
-        7 TwinVQ X X X
-        8 CELP X X X X X X
-        9 HVXC X X X X X
-        10 (reserved)
-        11 (reserved)
-        12 TTSI X X X X X X
-        13 Main synthetic X X
-        14 Wavetable synthesis X* X*
-        15 General MIDI X* X*
-        16 Algorithmic Synthesis and Audio FX X* X*
-        17 ER AAC LC X X X
-        18 (reserved)
-        19 ER AAC LTP X X
-        20 ER AAC Scalable X X X
-        21 ER TwinVQ X X
-        22 ER BSAC X X
-        23 ER AAC LD X X X X
-        24 ER CELP X X X
-        25 ER HVXC X X
-        26 ER HILN X
-        27 ER Parametric X
-        28 SSC
-        29 PS X
-        30 MPEG Surround
-        31 (escape)
-        32 Layer-1
-        33 Layer-2
-        34 Layer-3
-        35 DST
-        36 ALS
-        37 SLS
-        38 SLS non-core
-        39 ER AAC ELD
-        40 SMR Simple
-        41 SMR Main
-        */
-        audioObjectTypeMap.put(1, "AAC main");
-        audioObjectTypeMap.put(2, "AAC LC");
-        audioObjectTypeMap.put(3, "AAC SSR");
-        audioObjectTypeMap.put(4, "AAC LTP");
-        audioObjectTypeMap.put(5, "SBR");
-        audioObjectTypeMap.put(6, "AAC Scalable");
-        audioObjectTypeMap.put(7, "TwinVQ");
-        audioObjectTypeMap.put(8, "CELP");
-        audioObjectTypeMap.put(9, "HVXC");
-        audioObjectTypeMap.put(10, "(reserved)");
-        audioObjectTypeMap.put(11, "(reserved)");
-        audioObjectTypeMap.put(12, "TTSI");
-        audioObjectTypeMap.put(13, "Main synthetic");
-        audioObjectTypeMap.put(14, "Wavetable synthesis");
-        audioObjectTypeMap.put(15, "General MIDI");
-        audioObjectTypeMap.put(16, "Algorithmic Synthesis and Audio FX");
-        audioObjectTypeMap.put(17, "ER AAC LC");
-        audioObjectTypeMap.put(18, "(reserved)");
-        audioObjectTypeMap.put(19, "ER AAC LTP");
-        audioObjectTypeMap.put(20, "ER AAC Scalable");
-        audioObjectTypeMap.put(21, "ER TwinVQ");
-        audioObjectTypeMap.put(22, "ER BSAC");
-        audioObjectTypeMap.put(23, "ER AAC LD");
-        audioObjectTypeMap.put(24, "ER CELP");
-        audioObjectTypeMap.put(25, "ER HVXC");
-        audioObjectTypeMap.put(26, "ER HILN");
-        audioObjectTypeMap.put(27, "ER Parametric");
-        audioObjectTypeMap.put(28, "SSC");
-        audioObjectTypeMap.put(29, "PS");
-        audioObjectTypeMap.put(30, "MPEG Surround");
-        audioObjectTypeMap.put(31, "(escape)");
-        audioObjectTypeMap.put(32, "Layer-1");
-        audioObjectTypeMap.put(33, "Layer-2");
-        audioObjectTypeMap.put(34, "Layer-3");
-        audioObjectTypeMap.put(35, "DST");
-        audioObjectTypeMap.put(36, "ALS");
-        audioObjectTypeMap.put(37, "SLS");
-        audioObjectTypeMap.put(38, "SLS non-core");
-        audioObjectTypeMap.put(39, "ER AAC ELD");
-        audioObjectTypeMap.put(40, "SMR Simple");
-        audioObjectTypeMap.put(41, "SMR Main");
-
-        /* profileLevelIds
-       0x00 Reserved for ISO use -
-     0x01 Main Audio Profile L1
-     0x02 Main Audio Profile L2
-     0x03 Main Audio Profile L3
-     0x04 Main Audio Profile L4
-     0x05 Scalable Audio Profile L1
-     0x06 Scalable Audio Profile L2
-     0x07 Scalable Audio Profile L3
-     0x08 Scalable Audio Profile L4
-     0x09 Speech Audio Profile L1
-     0x0A Speech Audio Profile L2
-     0x0B Synthetic Audio Profile L1
-     0x0C Synthetic Audio Profile L2
-     0x0D Synthetic Audio Profile L3
-     0x0E High Quality Audio Profile L1
-     0x0F High Quality Audio Profile L2
-     0x10 High Quality Audio Profile L3
-     0x11 High Quality Audio Profile L4
-     0x12 High Quality Audio Profile L5
-     0x13 High Quality Audio Profile L6
-     0x14 High Quality Audio Profile L7
-     0x15 High Quality Audio Profile L8
-     0x16 Low Delay Audio Profile L1
-     0x17 Low Delay Audio Profile L2
-     0x18 Low Delay Audio Profile L3
-     0x19 Low Delay Audio Profile L4
-     0x1A Low Delay Audio Profile L5
-     0x1B Low Delay Audio Profile L6
-     0x1C Low Delay Audio Profile L7
-     0x1D Low Delay Audio Profile L8
-     0x1E Natural Audio Profile L1
-     0x1F Natural Audio Profile L2
-     0x20 Natural Audio Profile L3
-     0x21 Natural Audio Profile L4
-     0x22 Mobile Audio Internetworking Profile L1
-     0x23 Mobile Audio Internetworking Profile L2
-     0x24 Mobile Audio Internetworking Profile L3
-     0x25 Mobile Audio Internetworking Profile L4
-     0x26 Mobile Audio Internetworking Profile L5
-     0x27 Mobile Audio Internetworking Profile L6
-     0x28 AAC Profile L1
-     0x29 AAC Profile L2
-     0x2A AAC Profile L4
-     0x2B AAC Profile L5
-     0x2C High Efficiency AAC Profile L2
-     0x2D High Efficiency AAC Profile L3
-     0x2E High Efficiency AAC Profile L4
-     0x2F High Efficiency AAC Profile L5
-     0x30 High Efficiency AAC v2 Profile L2
-     0x31 High Efficiency AAC v2 Profile L3
-     0x32 High Efficiency AAC v2 Profile L4
-     0x33 High Efficiency AAC v2 Profile L5
-     0x34 Low Delay AAC Profile L1
-     0x35 Baseline MPEG Surround Profile (see ISO/IEC
-     23003-1)
-     L1
-     0x36 Baseline MPEG Surround Profile (see ISO/IEC
-     23003-1)
-     L2
-     0x37 Baseline MPEG Surround Profile (see ISO/IEC
-     23003-1)
-     L3
-     0x38 Baseline MPEG Surround Profile (see ISO/IEC
-     23003-1)
-     L4
-     0c39 Baseline MPEG Surround Profile (see ISO/IEC
-     23003-1)
-     L5
-     0x3A Baseline MPEG Surround Profile (see ISO/IEC
-     23003-1)
-     L6
-     0x3B - 0x7F reserved for ISO use -
-     0x80 - 0xFD user private -
-     0xFE no audio profile specified -
-     0xFF no audio capability required -
-
-        */
-    }
-
-
-    public int getSamplingFrequency() {
-        return samplingFrequencyIndex == 0xf ? samplingFrequency : samplingFrequencyIndexMap.get(samplingFrequencyIndex);
-    }
-
-    public int getChannelConfiguration() {
-        return channelConfiguration;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        AudioSpecificConfig that = (AudioSpecificConfig) o;
-
-        if (aacScalefactorDataResilienceFlag != that.aacScalefactorDataResilienceFlag) {
-            return false;
-        }
-        if (aacSectionDataResilienceFlag != that.aacSectionDataResilienceFlag) {
-            return false;
-        }
-        if (aacSpectralDataResilienceFlag != that.aacSpectralDataResilienceFlag) {
-            return false;
-        }
-        if (audioObjectType != that.audioObjectType) {
-            return false;
-        }
-        if (channelConfiguration != that.channelConfiguration) {
-            return false;
-        }
-        if (coreCoderDelay != that.coreCoderDelay) {
-            return false;
-        }
-        if (dependsOnCoreCoder != that.dependsOnCoreCoder) {
-            return false;
-        }
-        if (directMapping != that.directMapping) {
-            return false;
-        }
-        if (epConfig != that.epConfig) {
-            return false;
-        }
-        if (erHvxcExtensionFlag != that.erHvxcExtensionFlag) {
-            return false;
-        }
-        if (extensionAudioObjectType != that.extensionAudioObjectType) {
-            return false;
-        }
-        if (extensionChannelConfiguration != that.extensionChannelConfiguration) {
-            return false;
-        }
-        if (extensionFlag != that.extensionFlag) {
-            return false;
-        }
-        if (extensionFlag3 != that.extensionFlag3) {
-            return false;
-        }
-        if (extensionSamplingFrequency != that.extensionSamplingFrequency) {
-            return false;
-        }
-        if (extensionSamplingFrequencyIndex != that.extensionSamplingFrequencyIndex) {
-            return false;
-        }
-        if (fillBits != that.fillBits) {
-            return false;
-        }
-        if (frameLengthFlag != that.frameLengthFlag) {
-            return false;
-        }
-        if (gaSpecificConfig != that.gaSpecificConfig) {
-            return false;
-        }
-        if (hilnContMode != that.hilnContMode) {
-            return false;
-        }
-        if (hilnEnhaLayer != that.hilnEnhaLayer) {
-            return false;
-        }
-        if (hilnEnhaQuantMode != that.hilnEnhaQuantMode) {
-            return false;
-        }
-        if (hilnFrameLength != that.hilnFrameLength) {
-            return false;
-        }
-        if (hilnMaxNumLine != that.hilnMaxNumLine) {
-            return false;
-        }
-        if (hilnQuantMode != that.hilnQuantMode) {
-            return false;
-        }
-        if (hilnSampleRateCode != that.hilnSampleRateCode) {
-            return false;
-        }
-        if (hvxcRateMode != that.hvxcRateMode) {
-            return false;
-        }
-        if (hvxcVarMode != that.hvxcVarMode) {
-            return false;
-        }
-        if (isBaseLayer != that.isBaseLayer) {
-            return false;
-        }
-        if (layerNr != that.layerNr) {
-            return false;
-        }
-        if (layer_length != that.layer_length) {
-            return false;
-        }
-        if (numOfSubFrame != that.numOfSubFrame) {
-            return false;
-        }
-        if (paraExtensionFlag != that.paraExtensionFlag) {
-            return false;
-        }
-        if (paraMode != that.paraMode) {
-            return false;
-        }
-        if (parametricSpecificConfig != that.parametricSpecificConfig) {
-            return false;
-        }
-        if (psPresentFlag != that.psPresentFlag) {
-            return false;
-        }
-        if (sacPayloadEmbedding != that.sacPayloadEmbedding) {
-            return false;
-        }
-        if (samplingFrequency != that.samplingFrequency) {
-            return false;
-        }
-        if (samplingFrequencyIndex != that.samplingFrequencyIndex) {
-            return false;
-        }
-        if (sbrPresentFlag != that.sbrPresentFlag) {
-            return false;
-        }
-        if (syncExtensionType != that.syncExtensionType) {
-            return false;
-        }
-        if (var_ScalableFlag != that.var_ScalableFlag) {
-            return false;
-        }
-        if (!Arrays.equals(configBytes, that.configBytes)) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = configBytes != null ? Arrays.hashCode(configBytes) : 0;
-        result = 31 * result + audioObjectType;
-        result = 31 * result + samplingFrequencyIndex;
-        result = 31 * result + samplingFrequency;
-        result = 31 * result + channelConfiguration;
-        result = 31 * result + extensionAudioObjectType;
-        result = 31 * result + sbrPresentFlag;
-        result = 31 * result + psPresentFlag;
-        result = 31 * result + extensionSamplingFrequencyIndex;
-        result = 31 * result + extensionSamplingFrequency;
-        result = 31 * result + extensionChannelConfiguration;
-        result = 31 * result + sacPayloadEmbedding;
-        result = 31 * result + fillBits;
-        result = 31 * result + epConfig;
-        result = 31 * result + directMapping;
-        result = 31 * result + syncExtensionType;
-        result = 31 * result + frameLengthFlag;
-        result = 31 * result + dependsOnCoreCoder;
-        result = 31 * result + coreCoderDelay;
-        result = 31 * result + extensionFlag;
-        result = 31 * result + layerNr;
-        result = 31 * result + numOfSubFrame;
-        result = 31 * result + layer_length;
-        result = 31 * result + aacSectionDataResilienceFlag;
-        result = 31 * result + aacScalefactorDataResilienceFlag;
-        result = 31 * result + aacSpectralDataResilienceFlag;
-        result = 31 * result + extensionFlag3;
-        result = 31 * result + (gaSpecificConfig ? 1 : 0);
-        result = 31 * result + isBaseLayer;
-        result = 31 * result + paraMode;
-        result = 31 * result + paraExtensionFlag;
-        result = 31 * result + hvxcVarMode;
-        result = 31 * result + hvxcRateMode;
-        result = 31 * result + erHvxcExtensionFlag;
-        result = 31 * result + var_ScalableFlag;
-        result = 31 * result + hilnQuantMode;
-        result = 31 * result + hilnMaxNumLine;
-        result = 31 * result + hilnSampleRateCode;
-        result = 31 * result + hilnFrameLength;
-        result = 31 * result + hilnContMode;
-        result = 31 * result + hilnEnhaLayer;
-        result = 31 * result + hilnEnhaQuantMode;
-        result = 31 * result + (parametricSpecificConfig ? 1 : 0);
-        return result;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BaseDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BaseDescriptor.java
deleted file mode 100755
index 6d94680..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BaseDescriptor.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.IsoTypeReader;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-/*
-abstract aligned(8) expandable(228-1) class BaseDescriptor : bit(8) tag=0 {
-// empty. To be filled by classes extending this class.
-}
-
-int sizeOfInstance = 0;
-bit(1) nextByte;
-bit(7) sizeOfInstance;
-while(nextByte) {
-bit(1) nextByte;
-bit(7) sizeByte;
-sizeOfInstance = sizeOfInstance<<7 | sizeByte;
-}
- */
-@Descriptor(tags = 0x00)
-public abstract class BaseDescriptor {
-    int tag;
-    int sizeOfInstance;
-    int sizeBytes;
-
-    public BaseDescriptor() {
-    }
-
-    public int getTag() {
-        return tag;
-    }
-
-    public int getSize() {
-        return sizeOfInstance
-                + 1//1 for the tag
-                + sizeBytes;
-    }
-
-    public int getSizeOfInstance() {
-        return sizeOfInstance;
-    }
-
-    public int getSizeBytes() {
-        return sizeBytes;
-    }
-
-    public final void parse(int tag, ByteBuffer bb) throws IOException {
-        this.tag = tag;
-
-        int i = 0;
-        int tmp = IsoTypeReader.readUInt8(bb);
-        i++;
-        sizeOfInstance = tmp & 0x7f;
-        while (tmp >>> 7 == 1) { //nextbyte indicator bit
-            tmp = IsoTypeReader.readUInt8(bb);
-            i++;
-            //sizeOfInstance = sizeOfInstance<<7 | sizeByte;
-            sizeOfInstance = sizeOfInstance << 7 | tmp & 0x7f;
-        }
-        sizeBytes = i;
-        ByteBuffer detailSource = bb.slice();
-        detailSource.limit(sizeOfInstance);
-        parseDetail(detailSource);
-        assert detailSource.remaining() == 0: this.getClass().getSimpleName() + " has not been fully parsed";
-        bb.position(bb.position() + sizeOfInstance);
-    }
-    
-    public abstract void parseDetail(ByteBuffer bb) throws IOException;
-
-
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("BaseDescriptor");
-        sb.append("{tag=").append(tag);
-        sb.append(", sizeOfInstance=").append(sizeOfInstance);
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BitReaderBuffer.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BitReaderBuffer.java
deleted file mode 100755
index 7221503..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BitReaderBuffer.java
+++ /dev/null
@@ -1,51 +0,0 @@
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import java.nio.ByteBuffer;
-
-public class BitReaderBuffer {
-
-    private ByteBuffer buffer;
-    int initialPos;
-    int position;
-
-    public BitReaderBuffer(ByteBuffer buffer) {
-        this.buffer = buffer;
-        initialPos = buffer.position();
-    }
-
-    public int readBits(int i) {
-        byte b = buffer.get(initialPos + position / 8);
-        int v = b < 0 ? b + 256 : b;
-        int left = 8 - position % 8;
-        int rc;
-        if (i <= left) {
-            rc = (v << (position % 8) & 0xFF) >> ((position % 8) + (left - i));
-            position += i;
-        } else {
-            int now = left;
-            int then = i - left;
-            rc = readBits(now);
-            rc = rc << then;
-            rc += readBits(then);
-        }
-        buffer.position(initialPos + (int) Math.ceil((double) position / 8));
-        return rc;
-    }
-
-    public int getPosition() {
-        return position;
-    }
-
-    public int byteSync() {
-        int left = 8 - position % 8;
-        if (left == 8) {
-            left = 0;
-        }
-        readBits(left);
-        return left;
-    }
-
-    public int remainingBits() {
-        return buffer.limit() * 8 - position;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BitWriterBuffer.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BitWriterBuffer.java
deleted file mode 100755
index e6ea67f..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/BitWriterBuffer.java
+++ /dev/null
@@ -1,36 +0,0 @@
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import java.nio.ByteBuffer;
-
-public class BitWriterBuffer {
-
-    private ByteBuffer buffer;
-    int initialPos;
-    int position = 0;
-
-    public BitWriterBuffer(ByteBuffer buffer) {
-        this.buffer = buffer;
-        this.initialPos = buffer.position();
-    }
-
-    public void writeBits(int i, int numBits) {
-        assert i <= ((1 << numBits)-1): String.format("Trying to write a value bigger (%s) than the number bits (%s) allows. " +
-                "Please mask the value before writing it and make your code is really working as intended.", i, (1<<numBits)-1);
-
-        int left = 8 - position % 8;
-        if (numBits <= left) {
-            int current = (buffer.get(initialPos + position / 8));
-            current = current < 0 ? current + 256 : current;
-            current += i << (left - numBits);
-            buffer.put(initialPos + position / 8, (byte) (current > 127 ? current - 256 : current));
-            position += numBits;
-        } else {
-            int bitsSecondWrite = numBits - left;
-            writeBits(i >> bitsSecondWrite, left);
-            writeBits(i & (1 << bitsSecondWrite) - 1, bitsSecondWrite);
-        }
-        buffer.position(initialPos + position / 8 + ((position % 8 > 0) ? 1 : 0));
-    }
-
-
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/DecoderConfigDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/DecoderConfigDescriptor.java
deleted file mode 100755
index 69d603a..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/DecoderConfigDescriptor.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.Hex;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.logging.Logger;
-
-/**
- * class DecoderConfigDescriptor extends BaseDescriptor : bit(8)
- * tag=DecoderConfigDescrTag {
- * bit(8) objectTypeIndication;
- * bit(6) streamType;
- * bit(1) upStream;
- * const bit(1) reserved=1;
- * bit(24) bufferSizeDB;
- * bit(32) maxBitrate;
- * bit(32) avgBitrate;
- * DecoderSpecificInfo decSpecificInfo[0 .. 1];
- * profileLevelIndicationIndexDescriptor profileLevelIndicationIndexDescr
- * [0..255];
- * }
- */
-@Descriptor(tags = {0x04})
-public class DecoderConfigDescriptor extends BaseDescriptor {
-    private static Logger log = Logger.getLogger(DecoderConfigDescriptor.class.getName());
-
-
-    int objectTypeIndication;
-    int streamType;
-    int upStream;
-    int bufferSizeDB;
-    long maxBitRate;
-    long avgBitRate;
-
-    DecoderSpecificInfo decoderSpecificInfo;
-    AudioSpecificConfig audioSpecificInfo;
-    List<ProfileLevelIndicationDescriptor> profileLevelIndicationDescriptors = new ArrayList<ProfileLevelIndicationDescriptor>();
-    byte[] configDescriptorDeadBytes;
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        objectTypeIndication = IsoTypeReader.readUInt8(bb);
-
-        int data = IsoTypeReader.readUInt8(bb);
-        streamType = data >>> 2;
-        upStream = (data >> 1) & 0x1;
-
-        bufferSizeDB = IsoTypeReader.readUInt24(bb);
-        maxBitRate = IsoTypeReader.readUInt32(bb);
-        avgBitRate = IsoTypeReader.readUInt32(bb);
-
-
-
-        BaseDescriptor descriptor;
-        if (bb.remaining() > 2) { //1byte tag + at least 1byte size
-            final int begin = bb.position();
-            descriptor = ObjectDescriptorFactory.createFrom(objectTypeIndication, bb);
-            final int read = bb.position() - begin;
-            log.finer(descriptor + " - DecoderConfigDescr1 read: " + read + ", size: " + (descriptor != null ? descriptor.getSize() : null));
-            if (descriptor != null) {
-                final int size = descriptor.getSize();
-                if (read < size) {
-                    //skip
-                    configDescriptorDeadBytes = new byte[size - read];
-                    bb.get(configDescriptorDeadBytes);
-                }
-            }
-            if (descriptor instanceof DecoderSpecificInfo) {
-                decoderSpecificInfo = (DecoderSpecificInfo) descriptor;
-            }
-            if (descriptor instanceof AudioSpecificConfig) {
-                audioSpecificInfo = (AudioSpecificConfig) descriptor;
-            }
-        }
-
-        while (bb.remaining() > 2) {
-            final long begin = bb.position();
-            descriptor = ObjectDescriptorFactory.createFrom(objectTypeIndication, bb);
-            final long read = bb.position() - begin;
-            log.finer(descriptor + " - DecoderConfigDescr2 read: " + read + ", size: " + (descriptor != null ? descriptor.getSize() : null));
-            if (descriptor instanceof ProfileLevelIndicationDescriptor) {
-                profileLevelIndicationDescriptors.add((ProfileLevelIndicationDescriptor) descriptor);
-            }
-        }
-    }
-    public int serializedSize() {
-        return 15 + audioSpecificInfo.serializedSize();
-    }
-
-    public ByteBuffer serialize() {
-        ByteBuffer out = ByteBuffer.allocate(serializedSize());
-        IsoTypeWriter.writeUInt8(out, 4);
-        IsoTypeWriter.writeUInt8(out, serializedSize() - 2);
-        IsoTypeWriter.writeUInt8(out, objectTypeIndication);
-        int flags = (streamType << 2) | (upStream << 1) | 1;
-        IsoTypeWriter.writeUInt8(out, flags);
-        IsoTypeWriter.writeUInt24(out, bufferSizeDB);
-        IsoTypeWriter.writeUInt32(out, maxBitRate);
-        IsoTypeWriter.writeUInt32(out, avgBitRate);
-        out.put(audioSpecificInfo.serialize().array());
-        return out;
-    }
-
-    public DecoderSpecificInfo getDecoderSpecificInfo() {
-        return decoderSpecificInfo;
-    }
-
-    public AudioSpecificConfig getAudioSpecificInfo() {
-        return audioSpecificInfo;
-    }
-
-    public void setAudioSpecificInfo(AudioSpecificConfig audioSpecificInfo) {
-        this.audioSpecificInfo = audioSpecificInfo;
-    }
-
-    public List<ProfileLevelIndicationDescriptor> getProfileLevelIndicationDescriptors() {
-        return profileLevelIndicationDescriptors;
-    }
-
-    public int getObjectTypeIndication() {
-        return objectTypeIndication;
-    }
-
-    public void setObjectTypeIndication(int objectTypeIndication) {
-        this.objectTypeIndication = objectTypeIndication;
-    }
-
-    public int getStreamType() {
-        return streamType;
-    }
-
-    public void setStreamType(int streamType) {
-        this.streamType = streamType;
-    }
-
-    public int getUpStream() {
-        return upStream;
-    }
-
-    public void setUpStream(int upStream) {
-        this.upStream = upStream;
-    }
-
-    public int getBufferSizeDB() {
-        return bufferSizeDB;
-    }
-
-    public void setBufferSizeDB(int bufferSizeDB) {
-        this.bufferSizeDB = bufferSizeDB;
-    }
-
-    public long getMaxBitRate() {
-        return maxBitRate;
-    }
-
-    public void setMaxBitRate(long maxBitRate) {
-        this.maxBitRate = maxBitRate;
-    }
-
-    public long getAvgBitRate() {
-        return avgBitRate;
-    }
-
-    public void setAvgBitRate(long avgBitRate) {
-        this.avgBitRate = avgBitRate;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("DecoderConfigDescriptor");
-        sb.append("{objectTypeIndication=").append(objectTypeIndication);
-        sb.append(", streamType=").append(streamType);
-        sb.append(", upStream=").append(upStream);
-        sb.append(", bufferSizeDB=").append(bufferSizeDB);
-        sb.append(", maxBitRate=").append(maxBitRate);
-        sb.append(", avgBitRate=").append(avgBitRate);
-        sb.append(", decoderSpecificInfo=").append(decoderSpecificInfo);
-        sb.append(", audioSpecificInfo=").append(audioSpecificInfo);
-        sb.append(", configDescriptorDeadBytes=").append(Hex.encodeHex(configDescriptorDeadBytes != null ? configDescriptorDeadBytes : new byte[]{}));
-        sb.append(", profileLevelIndicationDescriptors=").append(profileLevelIndicationDescriptors == null ? "null" : Arrays.asList(profileLevelIndicationDescriptors).toString());
-        sb.append('}');
-        return sb.toString();
-    }
-    /*objectTypeIndication values
-      0x00 Forbidden
-    0x01 Systems ISO/IEC 14496-1 a
-    0x02 Systems ISO/IEC 14496-1 b
-    0x03 Interaction Stream
-    0x04 Systems ISO/IEC 14496-1 Extended BIFS Configuration c
-    0x05 Systems ISO/IEC 14496-1 AFX d
-    0x06 Font Data Stream
-    0x07 Synthesized Texture Stream
-    0x08 Streaming Text Stream
-    0x09-0x1F reserved for ISO use
-    0x20 Visual ISO/IEC 14496-2 e
-    0x21 Visual ITU-T Recommendation H.264 | ISO/IEC 14496-10 f
-    0x22 Parameter Sets for ITU-T Recommendation H.264 | ISO/IEC 14496-10 f
-    0x23-0x3F reserved for ISO use
-    0x40 Audio ISO/IEC 14496-3 g
-    0x41-0x5F reserved for ISO use
-    0x60 Visual ISO/IEC 13818-2 Simple Profile
-    0x61 Visual ISO/IEC 13818-2 Main Profile
-    0x62 Visual ISO/IEC 13818-2 SNR Profile
-    0x63 Visual ISO/IEC 13818-2 Spatial Profile
-    0x64 Visual ISO/IEC 13818-2 High Profile
-    0x65 Visual ISO/IEC 13818-2 422 Profile
-    0x66 Audio ISO/IEC 13818-7 Main Profile
-    0x67 Audio ISO/IEC 13818-7 LowComplexity Profile
-    0x68 Audio ISO/IEC 13818-7 Scaleable Sampling Rate Profile
-    0x69 Audio ISO/IEC 13818-3
-    0x6A Visual ISO/IEC 11172-2
-    0x6B Audio ISO/IEC 11172-3
-    0x6C Visual ISO/IEC 10918-1
-    0x6D reserved for registration authority i
-    0x6E Visual ISO/IEC 15444-1
-    0x6F - 0x9F reserved for ISO use
-    0xA0 - 0xBF reserved for registration authority i
-    0xC0 - 0xE0 user private
-    0xE1 reserved for registration authority i
-    0xE2 - 0xFE user private
-    0xFF no object type specified h
-    */
-    /* streamType values
-      0x00 Forbidden
-    0x01 ObjectDescriptorStream (see 7.2.5)
-    0x02 ClockReferenceStream (see 7.3.2.5)
-    0x03 SceneDescriptionStream (see ISO/IEC 14496-11)
-    0x04 VisualStream
-    0x05 AudioStream
-    0x06 MPEG7Stream
-    0x07 IPMPStream (see 7.2.3.2)
-    0x08 ObjectContentInfoStream (see 7.2.4.2)
-    0x09 MPEGJStream
-    0x0A Interaction Stream
-    0x0B IPMPToolStream (see [ISO/IEC 14496-13])
-    0x0C - 0x1F reserved for ISO use
-    0x20 - 0x3F user private
-    */
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/DecoderSpecificInfo.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/DecoderSpecificInfo.java
deleted file mode 100755
index 574943c..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/DecoderSpecificInfo.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.Hex;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-
-/**
- * abstract class DecoderSpecificInfo extends BaseDescriptor : bit(8)
- * tag=DecSpecificInfoTag
- * {
- * // empty. To be filled by classes extending this class.
- * }
- */
-@Descriptor(tags = 0x05)
-public class DecoderSpecificInfo extends BaseDescriptor {
-    byte[] bytes;
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        if (sizeOfInstance > 0) {
-            bytes = new byte[sizeOfInstance];
-            bb.get(bytes);
-        }
-    }
-
-    public int serializedSize() {
-        return bytes.length;
-    }
-
-    public ByteBuffer serialize() {
-        ByteBuffer out = ByteBuffer.wrap(bytes);
-
-        return out;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("DecoderSpecificInfo");
-        sb.append("{bytes=").append(bytes == null ? "null" : Hex.encodeHex(bytes));
-        sb.append('}');
-        return sb.toString();
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        DecoderSpecificInfo that = (DecoderSpecificInfo) o;
-
-        if (!Arrays.equals(bytes, that.bytes)) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        return bytes != null ? Arrays.hashCode(bytes) : 0;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/Descriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/Descriptor.java
deleted file mode 100755
index 11020c7..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/Descriptor.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * Created by IntelliJ IDEA.
- * User: mstattma
- * Date: 06.08.2010
- * Time: 06:54:58
- * To change this template use File | Settings | File Templates.
- */
-@Documented
-@Target(ElementType.TYPE)
-@Retention(RetentionPolicy.RUNTIME)
-public @interface Descriptor {
-    int[] tags();
-
-    int objectTypeIndication() default -1;
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ESDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ESDescriptor.java
deleted file mode 100755
index 3bb4821..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ESDescriptor.java
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.logging.Logger;
-
-/*
-class ES_Descriptor extends BaseDescriptor : bit(8) tag=ES_DescrTag {
-bit(16) ES_ID;
-bit(1) streamDependenceFlag;
-bit(1) URL_Flag;
-bit(1) OCRstreamFlag;
-bit(5) streamPriority;
-if (streamDependenceFlag)
-bit(16) dependsOn_ES_ID;
-if (URL_Flag) {
-bit(8) URLlength;
-bit(8) URLstring[URLlength];
-}
-if (OCRstreamFlag)
-bit(16) OCR_ES_Id;
-DecoderConfigDescriptor decConfigDescr;
-if (ODProfileLevelIndication==0x01) //no SL extension.
-{
-SLConfigDescriptor slConfigDescr;
-}
-else // SL extension is possible.
-{
-SLConfigDescriptor slConfigDescr;
-}
-IPI_DescrPointer ipiPtr[0 .. 1];
-IP_IdentificationDataSet ipIDS[0 .. 255];
-IPMP_DescriptorPointer ipmpDescrPtr[0 .. 255];
-LanguageDescriptor langDescr[0 .. 255];
-QoS_Descriptor qosDescr[0 .. 1];
-RegistrationDescriptor regDescr[0 .. 1];
-ExtensionDescriptor extDescr[0 .. 255];
-}
- */
-@Descriptor(tags = {0x03})
-public class ESDescriptor extends BaseDescriptor {
-    private static Logger log = Logger.getLogger(ESDescriptor.class.getName());
-
-    int esId;
-    int streamDependenceFlag;
-    int URLFlag;
-    int oCRstreamFlag;
-    int streamPriority;
-
-
-    int URLLength = 0;
-    String URLString;
-    int remoteODFlag;
-
-    int dependsOnEsId;
-    int oCREsId;
-
-    DecoderConfigDescriptor decoderConfigDescriptor;
-    SLConfigDescriptor slConfigDescriptor;
-    List<BaseDescriptor> otherDescriptors = new ArrayList<BaseDescriptor>();
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        esId = IsoTypeReader.readUInt16(bb);
-
-        int data = IsoTypeReader.readUInt8(bb);
-        streamDependenceFlag = data >>> 7;
-        URLFlag = (data >>> 6) & 0x1;
-        oCRstreamFlag = (data >>> 5) & 0x1;
-        streamPriority = data & 0x1f;
-
-        if (streamDependenceFlag == 1) {
-            dependsOnEsId = IsoTypeReader.readUInt16(bb);
-        }
-        if (URLFlag == 1) {
-            URLLength = IsoTypeReader.readUInt8(bb);
-            URLString = IsoTypeReader.readString(bb, URLLength);
-        }
-        if (oCRstreamFlag == 1) {
-            oCREsId = IsoTypeReader.readUInt16(bb);
-        }
-
-        int baseSize = 1 /*tag*/ + getSizeBytes() + 2 + 1 + (streamDependenceFlag == 1 ? 2 : 0) + (URLFlag == 1 ? 1 + URLLength : 0) + (oCRstreamFlag == 1 ? 2 : 0);
-
-        int begin = bb.position();
-        if (getSize() > baseSize + 2) {
-            BaseDescriptor descriptor = ObjectDescriptorFactory.createFrom(-1, bb);
-            final long read = bb.position() - begin;
-            log.finer(descriptor + " - ESDescriptor1 read: " + read + ", size: " + (descriptor != null ? descriptor.getSize() : null));
-            if (descriptor != null) {
-                final int size = descriptor.getSize();
-                bb.position(begin + size);
-                baseSize += size;
-            } else {
-                baseSize += read;
-            }
-            if (descriptor instanceof DecoderConfigDescriptor) {
-                decoderConfigDescriptor = (DecoderConfigDescriptor) descriptor;
-            }
-        }
-
-        begin = bb.position();
-        if (getSize() > baseSize + 2) {
-            BaseDescriptor descriptor = ObjectDescriptorFactory.createFrom(-1, bb);
-            final long read = bb.position() - begin;
-            log.finer(descriptor + " - ESDescriptor2 read: " + read + ", size: " + (descriptor != null ? descriptor.getSize() : null));
-            if (descriptor != null) {
-                final int size = descriptor.getSize();
-                bb.position(begin + size);
-                baseSize += size;
-            } else {
-                baseSize += read;
-            }
-            if (descriptor instanceof SLConfigDescriptor) {
-                slConfigDescriptor = (SLConfigDescriptor) descriptor;
-            }
-        } else {
-            log.warning("SLConfigDescriptor is missing!");
-        }
-
-        while (getSize() - baseSize > 2) {
-            begin = bb.position();
-            BaseDescriptor descriptor = ObjectDescriptorFactory.createFrom(-1, bb);
-            final long read = bb.position() - begin;
-            log.finer(descriptor + " - ESDescriptor3 read: " + read + ", size: " + (descriptor != null ? descriptor.getSize() : null));
-            if (descriptor != null) {
-                final int size = descriptor.getSize();
-                bb.position(begin + size);
-                baseSize += size;
-            } else {
-                baseSize += read;
-            }
-            otherDescriptors.add(descriptor);
-        }
-    }
-    public int serializedSize() {
-        int out = 5;
-        if (streamDependenceFlag > 0) {
-            out += 2;
-        }
-        if (URLFlag > 0) {
-            out += 1 + URLLength;
-        }
-        if (oCRstreamFlag > 0) {
-            out += 2;
-        }
-
-        out += decoderConfigDescriptor.serializedSize();
-        out += slConfigDescriptor.serializedSize();
-
-        // Doesn't handle other descriptors yet
-
-        return out;
-    }
-
-    public ByteBuffer serialize() {
-        ByteBuffer out = ByteBuffer.allocate(serializedSize()); // Usually is around 30 bytes, so 200 should be enough...
-        IsoTypeWriter.writeUInt8(out, 3);
-        IsoTypeWriter.writeUInt8(out, serializedSize() - 2); // Not OK for longer sizes!
-        IsoTypeWriter.writeUInt16(out, esId);
-        int flags = (streamDependenceFlag << 7) | (URLFlag << 6) | (oCRstreamFlag << 5) | (streamPriority & 0x1f);
-        IsoTypeWriter.writeUInt8(out, flags);
-        if (streamDependenceFlag > 0) {
-            IsoTypeWriter.writeUInt16(out, dependsOnEsId);
-        }
-        if (URLFlag > 0) {
-            IsoTypeWriter.writeUInt8(out, URLLength);
-            IsoTypeWriter.writeUtf8String(out, URLString);
-        }
-        if (oCRstreamFlag > 0) {
-            IsoTypeWriter.writeUInt16(out, oCREsId);
-        }
-
-        ByteBuffer dec = decoderConfigDescriptor.serialize();
-        ByteBuffer sl = slConfigDescriptor.serialize();
-        out.put(dec.array());
-        out.put(sl.array());
-
-        // Doesn't handle other descriptors yet
-
-        return out;
-    }
-
-//  @Override
-//  public int getSize() {
-//    return 3 + (streamDependenceFlag == 1 ? 2 : 0) +
-//            (URLFlag == 1 ? 1 + 8 * URLLength : 0) +
-//            (oCRstreamFlag == 1 ? 2 : 0);
-//  }
-
-    public DecoderConfigDescriptor getDecoderConfigDescriptor() {
-        return decoderConfigDescriptor;
-    }
-
-    public SLConfigDescriptor getSlConfigDescriptor() {
-        return slConfigDescriptor;
-    }
-
-    public void setDecoderConfigDescriptor(DecoderConfigDescriptor decoderConfigDescriptor) {
-        this.decoderConfigDescriptor = decoderConfigDescriptor;
-    }
-
-    public void setSlConfigDescriptor(SLConfigDescriptor slConfigDescriptor) {
-        this.slConfigDescriptor = slConfigDescriptor;
-    }
-
-    public List<BaseDescriptor> getOtherDescriptors() {
-        return otherDescriptors;
-    }
-
-    public int getoCREsId() {
-        return oCREsId;
-    }
-
-    public void setoCREsId(int oCREsId) {
-        this.oCREsId = oCREsId;
-    }
-
-    public int getEsId() {
-        return esId;
-    }
-
-    public void setEsId(int esId) {
-        this.esId = esId;
-    }
-
-    public int getStreamDependenceFlag() {
-        return streamDependenceFlag;
-    }
-
-    public void setStreamDependenceFlag(int streamDependenceFlag) {
-        this.streamDependenceFlag = streamDependenceFlag;
-    }
-
-    public int getURLFlag() {
-        return URLFlag;
-    }
-
-    public void setURLFlag(int URLFlag) {
-        this.URLFlag = URLFlag;
-    }
-
-    public int getoCRstreamFlag() {
-        return oCRstreamFlag;
-    }
-
-    public void setoCRstreamFlag(int oCRstreamFlag) {
-        this.oCRstreamFlag = oCRstreamFlag;
-    }
-
-    public int getStreamPriority() {
-        return streamPriority;
-    }
-
-    public void setStreamPriority(int streamPriority) {
-        this.streamPriority = streamPriority;
-    }
-
-    public int getURLLength() {
-        return URLLength;
-    }
-
-    public void setURLLength(int URLLength) {
-        this.URLLength = URLLength;
-    }
-
-    public String getURLString() {
-        return URLString;
-    }
-
-    public void setURLString(String URLString) {
-        this.URLString = URLString;
-    }
-
-    public int getRemoteODFlag() {
-        return remoteODFlag;
-    }
-
-    public void setRemoteODFlag(int remoteODFlag) {
-        this.remoteODFlag = remoteODFlag;
-    }
-
-    public int getDependsOnEsId() {
-        return dependsOnEsId;
-    }
-
-    public void setDependsOnEsId(int dependsOnEsId) {
-        this.dependsOnEsId = dependsOnEsId;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("ESDescriptor");
-        sb.append("{esId=").append(esId);
-        sb.append(", streamDependenceFlag=").append(streamDependenceFlag);
-        sb.append(", URLFlag=").append(URLFlag);
-        sb.append(", oCRstreamFlag=").append(oCRstreamFlag);
-        sb.append(", streamPriority=").append(streamPriority);
-        sb.append(", URLLength=").append(URLLength);
-        sb.append(", URLString='").append(URLString).append('\'');
-        sb.append(", remoteODFlag=").append(remoteODFlag);
-        sb.append(", dependsOnEsId=").append(dependsOnEsId);
-        sb.append(", oCREsId=").append(oCREsId);
-        sb.append(", decoderConfigDescriptor=").append(decoderConfigDescriptor);
-        sb.append(", slConfigDescriptor=").append(slConfigDescriptor);
-        sb.append('}');
-        return sb.toString();
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
-
-        ESDescriptor that = (ESDescriptor) o;
-
-        if (URLFlag != that.URLFlag) return false;
-        if (URLLength != that.URLLength) return false;
-        if (dependsOnEsId != that.dependsOnEsId) return false;
-        if (esId != that.esId) return false;
-        if (oCREsId != that.oCREsId) return false;
-        if (oCRstreamFlag != that.oCRstreamFlag) return false;
-        if (remoteODFlag != that.remoteODFlag) return false;
-        if (streamDependenceFlag != that.streamDependenceFlag) return false;
-        if (streamPriority != that.streamPriority) return false;
-        if (URLString != null ? !URLString.equals(that.URLString) : that.URLString != null) return false;
-        if (decoderConfigDescriptor != null ? !decoderConfigDescriptor.equals(that.decoderConfigDescriptor) : that.decoderConfigDescriptor != null)
-            return false;
-        if (otherDescriptors != null ? !otherDescriptors.equals(that.otherDescriptors) : that.otherDescriptors != null)
-            return false;
-        if (slConfigDescriptor != null ? !slConfigDescriptor.equals(that.slConfigDescriptor) : that.slConfigDescriptor != null)
-            return false;
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = esId;
-        result = 31 * result + streamDependenceFlag;
-        result = 31 * result + URLFlag;
-        result = 31 * result + oCRstreamFlag;
-        result = 31 * result + streamPriority;
-        result = 31 * result + URLLength;
-        result = 31 * result + (URLString != null ? URLString.hashCode() : 0);
-        result = 31 * result + remoteODFlag;
-        result = 31 * result + dependsOnEsId;
-        result = 31 * result + oCREsId;
-        result = 31 * result + (decoderConfigDescriptor != null ? decoderConfigDescriptor.hashCode() : 0);
-        result = 31 * result + (slConfigDescriptor != null ? slConfigDescriptor.hashCode() : 0);
-        result = 31 * result + (otherDescriptors != null ? otherDescriptors.hashCode() : 0);
-        return result;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ExtensionDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ExtensionDescriptor.java
deleted file mode 100755
index 7933f5a..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ExtensionDescriptor.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.Hex;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.logging.Logger;
-
-/**
- * abstract class ExtensionDescriptor extends BaseDescriptor
- * : bit(8) tag = ExtensionProfileLevelDescrTag, ExtDescrTagStartRange ..
- * ExtDescrTagEndRange {
- * // empty. To be filled by classes extending this class.
- * }
- */
-@Descriptor(tags = {0x13, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253})
-public class ExtensionDescriptor extends BaseDescriptor {
-    private static Logger log = Logger.getLogger(ExtensionDescriptor.class.getName());
-
-    byte[] bytes;
-
-
-    //todo: add this better to the tags list?
-    //14496-1:2010 p.20:
-    //0x6A-0xBF Reserved for ISO use
-    //0xC0-0xFE User private
-    //
-    //ExtDescrTagStartRange = 0x6A
-    //ExtDescrTagEndRange = 0xFE
-    static int[] allTags() {
-        int[] ints = new int[0xFE - 0x6A];
-
-        for (int i = 0x6A; i < 0xFE; i++) {
-            final int pos = i - 0x6A;
-            log.finest("pos:" + pos);
-            ints[pos] = i;
-        }
-        return ints;
-    }
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        if (getSize() > 0) {
-            bytes = new byte[sizeOfInstance];
-            bb.get(bytes);
-        }
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("ExtensionDescriptor");
-        sb.append("{bytes=").append(bytes == null ? "null" : Hex.encodeHex(bytes));
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ExtensionProfileLevelDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ExtensionProfileLevelDescriptor.java
deleted file mode 100755
index 0cf4915..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ExtensionProfileLevelDescriptor.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.Hex;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-/**
- * abstract class ExtensionDescriptor extends BaseDescriptor
- * : bit(8) tag = ExtensionProfileLevelDescrTag, ExtDescrTagStartRange ..
- * ExtDescrTagEndRange {
- * // empty. To be filled by classes extending this class.
- * }
- */
-@Descriptor(tags = {0x13})
-public class ExtensionProfileLevelDescriptor extends BaseDescriptor {
-    byte[] bytes;
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        if (getSize() > 0) {
-            bytes = new byte[getSize()];
-            bb.get(bytes);
-        }
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("ExtensionDescriptor");
-        sb.append("{bytes=").append(bytes == null ? "null" : Hex.encodeHex(bytes));
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/InitialObjectDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/InitialObjectDescriptor.java
deleted file mode 100755
index 7a1f094..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/InitialObjectDescriptor.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-
-import com.coremedia.iso.IsoTypeReader;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-/*
-class InitialObjectDescriptor extends ObjectDescriptorBase : bit(8)
-tag=InitialObjectDescrTag {
-bit(10) ObjectDescriptorID;
-bit(1) URL_Flag;
-bit(1) includeInlineProfileLevelFlag;
-const bit(4) reserved=0b1111;
-if (URL_Flag) {
-bit(8) URLlength;
-bit(8) URLstring[URLlength];
-} else {
-bit(8) ODProfileLevelIndication;
-bit(8) sceneProfileLevelIndication;
-bit(8) audioProfileLevelIndication;
-bit(8) visualProfileLevelIndication;
-bit(8) graphicsProfileLevelIndication;
-ES_Descriptor esDescr[1 .. 255];
-OCI_Descriptor ociDescr[0 .. 255];
-IPMP_DescriptorPointer ipmpDescrPtr[0 .. 255];
-IPMP_Descriptor ipmpDescr [0 .. 255];
-IPMP_ToolListDescriptor toolListDescr[0 .. 1];
-}
-ExtensionDescriptor extDescr[0 .. 255];
-}
-*/
-//@Descriptor(tags = {0x02, 0x10})
-public class InitialObjectDescriptor extends ObjectDescriptorBase {
-    private int objectDescriptorId;
-    int urlFlag;
-    int includeInlineProfileLevelFlag;
-
-    int urlLength;
-    String urlString;
-
-    int oDProfileLevelIndication;
-    int sceneProfileLevelIndication;
-    int audioProfileLevelIndication;
-    int visualProfileLevelIndication;
-    int graphicsProfileLevelIndication;
-
-    List<ESDescriptor> esDescriptors = new ArrayList<ESDescriptor>();
-
-    List<ExtensionDescriptor> extensionDescriptors = new ArrayList<ExtensionDescriptor>();
-
-    List<BaseDescriptor> unknownDescriptors = new ArrayList<BaseDescriptor>();
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        int data = IsoTypeReader.readUInt16(bb);
-        objectDescriptorId = (data & 0xFFC0) >> 6;
-
-        urlFlag = (data & 0x3F) >> 5;
-        includeInlineProfileLevelFlag = (data & 0x1F) >> 4;
-
-        int sizeLeft = getSize() - 2;
-        if (urlFlag == 1) {
-            urlLength = IsoTypeReader.readUInt8(bb);
-            urlString = IsoTypeReader.readString(bb, urlLength);
-            sizeLeft = sizeLeft - (1 + urlLength);
-        } else {
-            oDProfileLevelIndication = IsoTypeReader.readUInt8(bb);
-            sceneProfileLevelIndication = IsoTypeReader.readUInt8(bb);
-            audioProfileLevelIndication = IsoTypeReader.readUInt8(bb);
-            visualProfileLevelIndication = IsoTypeReader.readUInt8(bb);
-            graphicsProfileLevelIndication = IsoTypeReader.readUInt8(bb);
-
-            sizeLeft = sizeLeft - 5;
-
-            if (sizeLeft > 2) {
-                final BaseDescriptor descriptor = ObjectDescriptorFactory.createFrom(-1, bb);
-                sizeLeft = sizeLeft - descriptor.getSize();
-                if (descriptor instanceof ESDescriptor) {
-                    esDescriptors.add((ESDescriptor) descriptor);
-                } else {
-                    unknownDescriptors.add(descriptor);
-                }
-            }
-        }
-
-        if (sizeLeft > 2) {
-            final BaseDescriptor descriptor = ObjectDescriptorFactory.createFrom(-1, bb);
-            if (descriptor instanceof ExtensionDescriptor) {
-                extensionDescriptors.add((ExtensionDescriptor) descriptor);
-            } else {
-                unknownDescriptors.add(descriptor);
-            }
-        }
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("InitialObjectDescriptor");
-        sb.append("{objectDescriptorId=").append(objectDescriptorId);
-        sb.append(", urlFlag=").append(urlFlag);
-        sb.append(", includeInlineProfileLevelFlag=").append(includeInlineProfileLevelFlag);
-        sb.append(", urlLength=").append(urlLength);
-        sb.append(", urlString='").append(urlString).append('\'');
-        sb.append(", oDProfileLevelIndication=").append(oDProfileLevelIndication);
-        sb.append(", sceneProfileLevelIndication=").append(sceneProfileLevelIndication);
-        sb.append(", audioProfileLevelIndication=").append(audioProfileLevelIndication);
-        sb.append(", visualProfileLevelIndication=").append(visualProfileLevelIndication);
-        sb.append(", graphicsProfileLevelIndication=").append(graphicsProfileLevelIndication);
-        sb.append(", esDescriptors=").append(esDescriptors);
-        sb.append(", extensionDescriptors=").append(extensionDescriptors);
-        sb.append(", unknownDescriptors=").append(unknownDescriptors);
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ObjectDescriptorBase.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ObjectDescriptorBase.java
deleted file mode 100755
index 69a8684..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ObjectDescriptorBase.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-/*
-abstract class ObjectDescriptorBase extends BaseDescriptor : bit(8)
-tag=[ObjectDescrTag..InitialObjectDescrTag] {
-// empty. To be filled by classes extending this class.
-}
- */
-@Descriptor(tags = 0x00)
-public abstract class ObjectDescriptorBase extends BaseDescriptor {
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ObjectDescriptorFactory.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ObjectDescriptorFactory.java
deleted file mode 100755
index 6afba55..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ObjectDescriptorFactory.java
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.IsoTypeReader;
-
-import java.io.IOException;
-import java.lang.reflect.Modifier;
-import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-/* class tag values of 14496-1
-0x00 Forbidden
-0x01 ObjectDescrTag
-0x02 InitialObjectDescrTag
-0x03 ES_DescrTag
-0x04 DecoderConfigDescrTag
-0x05 DecSpecificInfoTag
-0x06 SLConfigDescrTag
-0x07 ContentIdentDescrTag
-0x08 SupplContentIdentDescrTag
-0x09 IPI_DescrPointerTag
-0x0A IPMP_DescrPointerTag
-0x0B IPMP_DescrTag
-0x0C QoS_DescrTag
-0x0D RegistrationDescrTag
-0x0E ES_ID_IncTag
-0x0F ES_ID_RefTag
-0x10 MP4_IOD_Tag
-0x11 MP4_OD_Tag
-0x12 IPL_DescrPointerRefTag
-0x13 ExtensionProfileLevelDescrTag
-0x14 profileLevelIndicationIndexDescrTag
-0x15-0x3F Reserved for ISO use
-0x40 ContentClassificationDescrTag
-0x41 KeyWordDescrTag
-0x42 RatingDescrTag
-0x43 LanguageDescrTag
-0x44 ShortTextualDescrTag
-0x45 ExpandedTextualDescrTag
-0x46 ContentCreatorNameDescrTag
-0x47 ContentCreationDateDescrTag
-0x48 OCICreatorNameDescrTag
-0x49 OCICreationDateDescrTag
-0x4A SmpteCameraPositionDescrTag
-0x4B SegmentDescrTag
-0x4C MediaTimeDescrTag
-0x4D-0x5F Reserved for ISO use (OCI extensions)
-0x60 IPMP_ToolsListDescrTag
-0x61 IPMP_ToolTag
-0x62 M4MuxTimingDescrTag
-0x63 M4MuxCodeTableDescrTag
-0x64 ExtSLConfigDescrTag
-0x65 M4MuxBufferSizeDescrTag
-0x66 M4MuxIdentDescrTag
-0x67 DependencyPointerTag
-0x68 DependencyMarkerTag
-0x69 M4MuxChannelDescrTag
-0x6A-0xBF Reserved for ISO use
-0xC0-0xFE User private
-0xFF Forbidden
- */
-
-/* objectTypeIndication as of 14496-1
-0x00 Forbidden
-0x01 Systems ISO/IEC 14496-1 a
-0x02 Systems ISO/IEC 14496-1 b
-0x03 Interaction Stream
-0x04 Systems ISO/IEC 14496-1 Extended BIFS Configuration c
-0x05 Systems ISO/IEC 14496-1 AFX d
-0x06 Font Data Stream
-0x07 Synthesized Texture Stream
-0x08 Streaming Text Stream
-0x09-0x1F reserved for ISO use
-0x20 Visual ISO/IEC 14496-2 e
-0x21 Visual ITU-T Recommendation H.264 | ISO/IEC 14496-10 f
-0x22 Parameter Sets for ITU-T Recommendation H.264 | ISO/IEC 14496-10 f
-0x23-0x3F reserved for ISO use
-0x40 Audio ISO/IEC 14496-3 g
-0x41-0x5F reserved for ISO use
-0x60 Visual ISO/IEC 13818-2 Simple Profile
-0x61 Visual ISO/IEC 13818-2 Main Profile
-0x62 Visual ISO/IEC 13818-2 SNR Profile
-0x63 Visual ISO/IEC 13818-2 Spatial Profile
-0x64 Visual ISO/IEC 13818-2 High Profile
-0x65 Visual ISO/IEC 13818-2 422 Profile
-0x66 Audio ISO/IEC 13818-7 Main Profile
-0x67 Audio ISO/IEC 13818-7 LowComplexity Profile
-0x68 Audio ISO/IEC 13818-7 Scaleable Sampling Rate Profile
-0x69 Audio ISO/IEC 13818-3
-0x6A Visual ISO/IEC 11172-2
-0x6B Audio ISO/IEC 11172-3
-0x6C Visual ISO/IEC 10918-1
-0x6D reserved for registration authority
-0x6E Visual ISO/IEC 15444-1
-0x6F - 0x9F reserved for ISO use
-0xA0 - 0xBF reserved for registration authority i
-0xC0 - 0xE0 user private
-0xE1 reserved for registration authority i
-0xE2 - 0xFE user private
-0xFF no object type specified h
- */
-public class ObjectDescriptorFactory {
-    protected static Logger log = Logger.getLogger(ObjectDescriptorFactory.class.getName());
-
-    protected static Map<Integer, Map<Integer, Class<? extends BaseDescriptor>>> descriptorRegistry = new HashMap<Integer, Map<Integer, Class<? extends BaseDescriptor>>>();
-
-    static {
-        Set<Class<? extends BaseDescriptor>> annotated = new HashSet<Class<? extends BaseDescriptor>>();
-
-        annotated.add(DecoderSpecificInfo.class);
-        annotated.add(SLConfigDescriptor.class);
-        annotated.add(BaseDescriptor.class);
-        annotated.add(ExtensionDescriptor.class);
-        annotated.add(ObjectDescriptorBase.class);
-        annotated.add(ProfileLevelIndicationDescriptor.class);
-        annotated.add(AudioSpecificConfig.class);
-        annotated.add(ExtensionProfileLevelDescriptor.class);
-        annotated.add(ESDescriptor.class);
-        annotated.add(DecoderConfigDescriptor.class);
-        //annotated.add(ObjectDescriptor.class);
-
-        for (Class<? extends BaseDescriptor> clazz : annotated) {
-            final Descriptor descriptor = clazz.getAnnotation(Descriptor.class);
-            final int[] tags = descriptor.tags();
-            final int objectTypeInd = descriptor.objectTypeIndication();
-
-            Map<Integer, Class<? extends BaseDescriptor>> tagMap = descriptorRegistry.get(objectTypeInd);
-            if (tagMap == null) {
-                tagMap = new HashMap<Integer, Class<? extends BaseDescriptor>>();
-            }
-            for (int tag : tags) {
-                tagMap.put(tag, clazz);
-            }
-            descriptorRegistry.put(objectTypeInd, tagMap);
-        }
-    }
-
-    public static BaseDescriptor createFrom(int objectTypeIndication, ByteBuffer bb) throws IOException {
-        int tag = IsoTypeReader.readUInt8(bb);
-
-        Map<Integer, Class<? extends BaseDescriptor>> tagMap = descriptorRegistry.get(objectTypeIndication);
-        if (tagMap == null) {
-            tagMap = descriptorRegistry.get(-1);
-        }
-        Class<? extends BaseDescriptor> aClass = tagMap.get(tag);
-
-//    if (tag == 0x00) {
-//      log.warning("Found illegal tag 0x00! objectTypeIndication " + Integer.toHexString(objectTypeIndication) +
-//              " and tag " + Integer.toHexString(tag) + " using: " + aClass);
-//      aClass = BaseDescriptor.class;
-//    }
-
-        BaseDescriptor baseDescriptor;
-        if (aClass == null || aClass.isInterface() || Modifier.isAbstract(aClass.getModifiers())) {
-            log.warning("No ObjectDescriptor found for objectTypeIndication " + Integer.toHexString(objectTypeIndication) +
-                    " and tag " + Integer.toHexString(tag) + " found: " + aClass);
-            baseDescriptor = new UnknownDescriptor();
-        } else {
-            try {
-                baseDescriptor = aClass.newInstance();
-            } catch (Exception e) {
-                log.log(Level.SEVERE, "Couldn't instantiate BaseDescriptor class " + aClass + " for objectTypeIndication " + objectTypeIndication + " and tag " + tag, e);
-                throw new RuntimeException(e);
-            }
-        }
-        baseDescriptor.parse(tag, bb);
-        return baseDescriptor;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ProfileLevelIndicationDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ProfileLevelIndicationDescriptor.java
deleted file mode 100755
index 625277e..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/ProfileLevelIndicationDescriptor.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.IsoTypeReader;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-/**
- * class ProfileLevelIndicationIndexDescriptor () extends BaseDescriptor
- * : bit(8) ProfileLevelIndicationIndexDescrTag {
- * bit(8) profileLevelIndicationIndex;
- * }
- */
-@Descriptor(tags = 0x14)
-public class ProfileLevelIndicationDescriptor extends BaseDescriptor {
-    int profileLevelIndicationIndex;
-
-    @Override
-    public void parseDetail( ByteBuffer bb) throws IOException {
-        profileLevelIndicationIndex = IsoTypeReader.readUInt8(bb);
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("ProfileLevelIndicationDescriptor");
-        sb.append("{profileLevelIndicationIndex=").append(Integer.toHexString(profileLevelIndicationIndex));
-        sb.append('}');
-        return sb.toString();
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        ProfileLevelIndicationDescriptor that = (ProfileLevelIndicationDescriptor) o;
-
-        if (profileLevelIndicationIndex != that.profileLevelIndicationIndex) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        return profileLevelIndicationIndex;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/SLConfigDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/SLConfigDescriptor.java
deleted file mode 100755
index 70a58e6..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/SLConfigDescriptor.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-/**
- * class SLConfigDescriptor extends BaseDescriptor : bit(8) tag=SLConfigDescrTag {
- * bit(8) predefined;
- * if (predefined==0) {
- * bit(1) useAccessUnitStartFlag;
- * bit(1) useAccessUnitEndFlag;
- * bit(1) useRandomAccessPointFlag;
- * bit(1) hasRandomAccessUnitsOnlyFlag;
- * bit(1) usePaddingFlag;
- * bit(1) useTimeStampsFlag;
- * bit(1) useIdleFlag;
- * bit(1) durationFlag;
- * bit(32) timeStampResolution;
- * bit(32) OCRResolution;
- * bit(8) timeStampLength; // must be ≤ 64
- * bit(8) OCRLength; // must be ≤ 64
- * bit(8) AU_Length; // must be ≤ 32
- * bit(8) instantBitrateLength;
- * bit(4) degradationPriorityLength;
- * bit(5) AU_seqNumLength; // must be ≤ 16
- * bit(5) packetSeqNumLength; // must be ≤ 16
- * bit(2) reserved=0b11;
- * }
- * if (durationFlag) {
- * bit(32) timeScale;
- * bit(16) accessUnitDuration;
- * bit(16) compositionUnitDuration;
- * }
- * if (!useTimeStampsFlag) {
- * bit(timeStampLength) startDecodingTimeStamp;
- * bit(timeStampLength) startCompositionTimeStamp;
- * }
- * }
- */
-@Descriptor(tags = {0x06})
-public class SLConfigDescriptor extends BaseDescriptor {
-    int predefined;
-
-    public int getPredefined() {
-        return predefined;
-    }
-
-    public void setPredefined(int predefined) {
-        this.predefined = predefined;
-    }
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        predefined =  IsoTypeReader.readUInt8(bb);
-    }
-
-    public int serializedSize() {
-        return 3;
-    }
-
-    public ByteBuffer serialize() {
-        ByteBuffer out = ByteBuffer.allocate(3);
-        IsoTypeWriter.writeUInt8(out, 6);
-        IsoTypeWriter.writeUInt8(out, 1);
-        IsoTypeWriter.writeUInt8(out, predefined);
-        return out;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("SLConfigDescriptor");
-        sb.append("{predefined=").append(predefined);
-        sb.append('}');
-        return sb.toString();
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        SLConfigDescriptor that = (SLConfigDescriptor) o;
-
-        if (predefined != that.predefined) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        return predefined;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/UnknownDescriptor.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/UnknownDescriptor.java
deleted file mode 100755
index dd75a0f..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/objectdescriptors/UnknownDescriptor.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.objectdescriptors;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.logging.Logger;
-
-public class UnknownDescriptor extends BaseDescriptor {
-    private ByteBuffer data;
-    private static Logger log = Logger.getLogger(UnknownDescriptor.class.getName());
-
-    @Override
-    public void parseDetail(ByteBuffer bb) throws IOException {
-        data = (ByteBuffer) bb.slice().limit(this.getSizeOfInstance());
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("UnknownDescriptor");
-        sb.append("{tag=").append(tag);
-        sb.append(", sizeOfInstance=").append(sizeOfInstance);
-        sb.append(", data=").append(data);
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/CencSampleEncryptionInformationGroupEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/CencSampleEncryptionInformationGroupEntry.java
deleted file mode 100755
index b54f4d9..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/CencSampleEncryptionInformationGroupEntry.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import com.coremedia.iso.Hex;
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-
-/**
- * Each sample in a protected track shall be associated with an IsEncrypted flag, IV_Size, and KID.
- * This can be accomplished by (a) relying on the default values in the TrackEncryptionBox
- * (see 8.2), or (b) specifying the parameters by sample group, or (c) using a combination of these two techniques.
- * <p/>
- * When specifying the parameters by sample group, the SampleToGroupBox in the sample table or track
- * fragment specifies which samples use which sample group description from the SampleGroupDescriptionBox.
- */
-public class CencSampleEncryptionInformationGroupEntry extends GroupEntry {
-    public static final String TYPE = "seig";
-
-    private int isEncrypted;
-    private byte ivSize;
-    private byte[] kid = new byte[16];
-
-    @Override
-    public void parse(ByteBuffer byteBuffer) {
-        isEncrypted = IsoTypeReader.readUInt24(byteBuffer);
-        ivSize = (byte) IsoTypeReader.readUInt8(byteBuffer);
-        kid = new byte[16];
-        byteBuffer.get(kid);
-
-    }
-
-    @Override
-    public ByteBuffer get() {
-        ByteBuffer byteBuffer = ByteBuffer.allocate(20);
-        IsoTypeWriter.writeUInt24(byteBuffer, isEncrypted);
-        IsoTypeWriter.writeUInt8(byteBuffer, ivSize);
-        byteBuffer.put(kid);
-        byteBuffer.rewind();
-        return byteBuffer;
-    }
-
-    public int getEncrypted() {
-        return isEncrypted;
-    }
-
-    public void setEncrypted(int encrypted) {
-        isEncrypted = encrypted;
-    }
-
-    public byte getIvSize() {
-        return ivSize;
-    }
-
-    public void setIvSize(byte ivSize) {
-        this.ivSize = ivSize;
-    }
-
-    public byte[] getKid() {
-        return kid;
-    }
-
-    public void setKid(byte[] kid) {
-        assert kid.length == 16;
-        this.kid = kid;
-    }
-
-    @Override
-    public String toString() {
-        return "CencSampleEncryptionInformationGroupEntry{" +
-                "isEncrypted=" + isEncrypted +
-                ", ivSize=" + ivSize +
-                ", kid=" + Hex.encodeHex(kid) +
-                '}';
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        CencSampleEncryptionInformationGroupEntry that = (CencSampleEncryptionInformationGroupEntry) o;
-
-        if (isEncrypted != that.isEncrypted) {
-            return false;
-        }
-        if (ivSize != that.ivSize) {
-            return false;
-        }
-        if (!Arrays.equals(kid, that.kid)) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = isEncrypted;
-        result = 31 * result + (int) ivSize;
-        result = 31 * result + (kid != null ? Arrays.hashCode(kid) : 0);
-        return result;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/GroupEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/GroupEntry.java
deleted file mode 100755
index 0d78d25..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/GroupEntry.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import java.nio.ByteBuffer;
-
-public abstract class GroupEntry {
-    public abstract void parse(ByteBuffer byteBuffer);
-    public abstract ByteBuffer get();
-
-    public int size() {
-        return get().limit();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/RateShareEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/RateShareEntry.java
deleted file mode 100755
index ae5d380..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/RateShareEntry.java
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.nio.ByteBuffer;
-import java.util.LinkedList;
-import java.util.List;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * Each sample of a track may be associated to (zero or) one of a number of sample group descriptions, each of
- * which defines a record of rate-share information. Typically the same rate-share information applies to many
- * consecutive samples and it may therefore be enough to define two or three sample group descriptions that
- * can be used at different time intervals.
- * <p/>
- * The grouping type 'rash' (short for rate share) is defined as the grouping criterion for rate share information.
- * Zero or one sample-to-group box ('sbgp') for the grouping type 'rash' can be contained in the sample
- * table box ('stbl') of a track. It shall reside in a hint track, if a hint track is used, otherwise in a media track.
- * <p/>
- * Target rate share may be specified for several operation points that are defined in terms of the total available
- * bitrate, i.e., the bitrate that should be shared. If only one operation point is defined, the target rate share
- * applies to all available bitrates. If several operation points are defined, then each operation point specifies a
- * target rate share. Target rate share values specified for the first and the last operation points also specify the
- * target rate share values at lower and higher available bitrates, respectively. The target rate share between two
- * operation points is specified to be in the range between the target rate shares of those operation points. One
- * possibility is to estimate with linear interpolation.
- */
-public class RateShareEntry extends GroupEntry {
-    public static final String TYPE = "rash";
-
-    private short operationPointCut;
-    private short targetRateShare;
-    private List<Entry> entries = new LinkedList<Entry>();
-    private int maximumBitrate;
-    private int minimumBitrate;
-    private short discardPriority;
-
-
-    @Override
-    public void parse(ByteBuffer byteBuffer) {
-        operationPointCut = byteBuffer.getShort();
-        if (operationPointCut == 1) {
-            targetRateShare = byteBuffer.getShort();
-        } else {
-            int entriesLeft = operationPointCut;
-            while (entriesLeft-- > 0) {
-                entries.add(new Entry(l2i(IsoTypeReader.readUInt32(byteBuffer)), byteBuffer.getShort()));
-            }
-        }
-        maximumBitrate = l2i(IsoTypeReader.readUInt32(byteBuffer));
-        minimumBitrate = l2i(IsoTypeReader.readUInt32(byteBuffer));
-        discardPriority = (short) IsoTypeReader.readUInt8(byteBuffer);
-    }
-
-    @Override
-    public ByteBuffer get() {
-        ByteBuffer buf = ByteBuffer.allocate(operationPointCut == 1?13:(operationPointCut * 6 + 11 ));
-        buf.putShort(operationPointCut);
-        if (operationPointCut == 1) {
-            buf.putShort(targetRateShare );
-        } else {
-            for (Entry entry : entries) {
-                buf.putInt(entry.getAvailableBitrate());
-                buf.putShort(entry.getTargetRateShare());
-            }
-        }
-        buf.putInt(maximumBitrate);
-        buf.putInt(minimumBitrate);
-        IsoTypeWriter.writeUInt8(buf, discardPriority);
-        buf.rewind();
-        return buf;
-    }
-
-    public static class Entry {
-        public Entry(int availableBitrate, short targetRateShare) {
-            this.availableBitrate = availableBitrate;
-            this.targetRateShare = targetRateShare;
-        }
-
-        int availableBitrate;
-        short targetRateShare;
-
-        @Override
-        public String toString() {
-            return "{" +
-                    "availableBitrate=" + availableBitrate +
-                    ", targetRateShare=" + targetRateShare +
-                    '}';
-        }
-
-        public int getAvailableBitrate() {
-            return availableBitrate;
-        }
-
-        public void setAvailableBitrate(int availableBitrate) {
-            this.availableBitrate = availableBitrate;
-        }
-
-        public short getTargetRateShare() {
-            return targetRateShare;
-        }
-
-        public void setTargetRateShare(short targetRateShare) {
-            this.targetRateShare = targetRateShare;
-        }
-
-        @Override
-        public boolean equals(Object o) {
-            if (this == o) {
-                return true;
-            }
-            if (o == null || getClass() != o.getClass()) {
-                return false;
-            }
-
-            Entry entry = (Entry) o;
-
-            if (availableBitrate != entry.availableBitrate) {
-                return false;
-            }
-            if (targetRateShare != entry.targetRateShare) {
-                return false;
-            }
-
-            return true;
-        }
-
-        @Override
-        public int hashCode() {
-            int result = availableBitrate;
-            result = 31 * result + (int) targetRateShare;
-            return result;
-        }
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        RateShareEntry that = (RateShareEntry) o;
-
-        if (discardPriority != that.discardPriority) {
-            return false;
-        }
-        if (maximumBitrate != that.maximumBitrate) {
-            return false;
-        }
-        if (minimumBitrate != that.minimumBitrate) {
-            return false;
-        }
-        if (operationPointCut != that.operationPointCut) {
-            return false;
-        }
-        if (targetRateShare != that.targetRateShare) {
-            return false;
-        }
-        if (entries != null ? !entries.equals(that.entries) : that.entries != null) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = (int) operationPointCut;
-        result = 31 * result + (int) targetRateShare;
-        result = 31 * result + (entries != null ? entries.hashCode() : 0);
-        result = 31 * result + maximumBitrate;
-        result = 31 * result + minimumBitrate;
-        result = 31 * result + (int) discardPriority;
-        return result;
-    }
-
-    public short getOperationPointCut() {
-        return operationPointCut;
-    }
-
-    public void setOperationPointCut(short operationPointCut) {
-        this.operationPointCut = operationPointCut;
-    }
-
-    public short getTargetRateShare() {
-        return targetRateShare;
-    }
-
-    public void setTargetRateShare(short targetRateShare) {
-        this.targetRateShare = targetRateShare;
-    }
-
-    public List<Entry> getEntries() {
-        return entries;
-    }
-
-    public void setEntries(List<Entry> entries) {
-        this.entries = entries;
-    }
-
-    public int getMaximumBitrate() {
-        return maximumBitrate;
-    }
-
-    public void setMaximumBitrate(int maximumBitrate) {
-        this.maximumBitrate = maximumBitrate;
-    }
-
-    public int getMinimumBitrate() {
-        return minimumBitrate;
-    }
-
-    public void setMinimumBitrate(int minimumBitrate) {
-        this.minimumBitrate = minimumBitrate;
-    }
-
-    public short getDiscardPriority() {
-        return discardPriority;
-    }
-
-    public void setDiscardPriority(short discardPriority) {
-        this.discardPriority = discardPriority;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/RollRecoveryEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/RollRecoveryEntry.java
deleted file mode 100755
index bd5b89e..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/RollRecoveryEntry.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import java.nio.ByteBuffer;
-
-/**
- * roll_distance is a signed integer that gives the number of samples that must be decoded in order for
- * a sample to be decoded correctly. A positive value indicates the number of samples after the sample
- * that is a group member that must be decoded such that at the last of these recovery is complete, i.e.
- * the last sample is correct. A negative value indicates the number of samples before the sample that is
- * a group member that must be decoded in order for recovery to be complete at the marked sample.
- * The value zero must not be used; the sync sample table documents random access points for which
- * no recovery roll is needed.
- */
-public class RollRecoveryEntry extends GroupEntry {
-    public static final String TYPE = "roll";
-    private short rollDistance;
-
-    public short getRollDistance() {
-        return rollDistance;
-    }
-
-    public void setRollDistance(short rollDistance) {
-        this.rollDistance = rollDistance;
-    }
-
-    @Override
-    public void parse(ByteBuffer byteBuffer) {
-        rollDistance = byteBuffer.getShort();
-    }
-
-    @Override
-    public ByteBuffer get() {
-        ByteBuffer content = ByteBuffer.allocate(2);
-        content.putShort(rollDistance);
-        content.rewind();
-        return content;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        RollRecoveryEntry entry = (RollRecoveryEntry) o;
-
-        if (rollDistance != entry.rollDistance) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        return (int) rollDistance;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/SampleGroupDescriptionBox.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/SampleGroupDescriptionBox.java
deleted file mode 100755
index df4a96f..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/SampleGroupDescriptionBox.java
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.LinkedList;
-import java.util.List;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * This description table gives information about the characteristics of sample groups. The descriptive
- * information is any other information needed to define or characterize the sample group.
- * <p/>
- * There may be multiple instances of this box if there is more than one sample grouping for the samples in a
- * track. Each instance of the SampleGroupDescription box has a type code that distinguishes different
- * sample groupings. Within a track, there shall be at most one instance of this box with a particular grouping
- * type. The associated SampleToGroup shall indicate the same value for the grouping type.
- * <p/>
- * The information is stored in the sample group description box after the entry-count. An abstract entry type is
- * defined and sample groupings shall define derived types to represent the description of each sample group.
- * For video tracks, an abstract VisualSampleGroupEntry is used with similar types for audio and hint tracks.
- */
-public class SampleGroupDescriptionBox extends AbstractFullBox {
-    public static final String TYPE = "sgpd";
-
-    private String groupingType;
-    private int defaultLength;
-    private List<GroupEntry> groupEntries = new LinkedList<GroupEntry>();
-    private int descriptionLength;
-
-    public SampleGroupDescriptionBox() {
-        super(TYPE);
-    }
-
-    @Override
-    protected long getContentSize() {
-        long size = 8;
-        if (getVersion() == 1) {
-            size += 4;
-        }
-        size += 4; // entryCount
-        for (GroupEntry groupEntry : groupEntries) {
-            if (getVersion() == 1 && defaultLength == 0) {
-                size += 4;
-            }
-            size += groupEntry.size();
-        }
-        return size;
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        byteBuffer.put(groupingType.getBytes());
-        if (this.getVersion() == 1) {
-            IsoTypeWriter.writeUInt32(byteBuffer, defaultLength);
-        }
-        IsoTypeWriter.writeUInt32(byteBuffer, this.groupEntries.size());
-        for (GroupEntry entry : groupEntries) {
-            if (this.getVersion() == 1 && defaultLength == 0) {
-                IsoTypeWriter.writeUInt32(byteBuffer, entry.get().limit());
-            }
-            byteBuffer.put(entry.get());
-        }
-    }
-
-    @Override
-    protected void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        if (this.getVersion() != 1) {
-            throw new RuntimeException("SampleGroupDescriptionBox are only supported in version 1");
-        }
-        groupingType = IsoTypeReader.read4cc(content);
-        if (this.getVersion() == 1) {
-            defaultLength = l2i(IsoTypeReader.readUInt32(content));
-        }
-        long entryCount = IsoTypeReader.readUInt32(content);
-        while (entryCount-- > 0) {
-            int length = defaultLength;
-            if (this.getVersion() == 1) {
-                if (defaultLength == 0) {
-                    descriptionLength = l2i(IsoTypeReader.readUInt32(content));
-                    length = descriptionLength;
-                }
-            } else {
-                throw new RuntimeException("This should be implemented");
-            }
-            int finalPos = content.position() + length;
-            ByteBuffer parseMe = content.slice();
-            parseMe.limit(length);
-            groupEntries.add(parseGroupEntry(parseMe, groupingType));
-            content.position(finalPos);
-        }
-
-    }
-
-    private GroupEntry parseGroupEntry(ByteBuffer content, String groupingType) {
-        GroupEntry groupEntry;
-        if (RollRecoveryEntry.TYPE.equals(groupingType)) {
-            groupEntry = new RollRecoveryEntry();
-        } else if (RateShareEntry.TYPE.equals(groupingType)) {
-            groupEntry = new RateShareEntry();
-        } else if (CencSampleEncryptionInformationGroupEntry.TYPE.equals(groupingType)) {
-            groupEntry = new CencSampleEncryptionInformationGroupEntry();
-        } else if (VisualRandomAccessEntry.TYPE.equals(groupingType)) {
-            groupEntry = new VisualRandomAccessEntry();
-        } else if (TemporalLevelEntry.TYPE.equals(groupingType)) {
-            groupEntry = new TemporalLevelEntry();
-        } else {
-            groupEntry = new UnknownEntry();
-        }
-        groupEntry.parse(content);
-        return groupEntry;
-    }
-
-
-    public String getGroupingType() {
-        return groupingType;
-    }
-
-    public void setGroupingType(String groupingType) {
-        this.groupingType = groupingType;
-    }
-
-    public int getDefaultLength() {
-        return defaultLength;
-    }
-
-    public void setDefaultLength(int defaultLength) {
-        this.defaultLength = defaultLength;
-    }
-
-    public List<GroupEntry> getGroupEntries() {
-        return groupEntries;
-    }
-
-    public void setGroupEntries(List<GroupEntry> groupEntries) {
-        this.groupEntries = groupEntries;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        SampleGroupDescriptionBox that = (SampleGroupDescriptionBox) o;
-
-        if (defaultLength != that.defaultLength) {
-            return false;
-        }
-        if (groupEntries != null ? !groupEntries.equals(that.groupEntries) : that.groupEntries != null) {
-            return false;
-        }
-        if (groupingType != null ? !groupingType.equals(that.groupingType) : that.groupingType != null) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = groupingType != null ? groupingType.hashCode() : 0;
-        result = 31 * result + defaultLength;
-        result = 31 * result + (groupEntries != null ? groupEntries.hashCode() : 0);
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        return "SampleGroupDescriptionBox{" +
-                "groupingType='" + groupingType + '\'' +
-                ", defaultLength=" + defaultLength +
-                ", groupEntries=" + groupEntries +
-                '}';
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/SampleToGroupBox.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/SampleToGroupBox.java
deleted file mode 100755
index 0fa059e..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/SampleToGroupBox.java
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-import com.googlecode.mp4parser.AbstractFullBox;
-
-import java.nio.ByteBuffer;
-import java.util.LinkedList;
-import java.util.List;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-/**
- * This table can be used to find the group that a sample belongs to and the associated description of that
- * sample group. The table is compactly coded with each entry giving the index of the first sample of a run of
- * samples with the same sample group descriptor. The sample group description ID is an index that refers to a
- * SampleGroupDescription box, which contains entries describing the characteristics of each sample group.
- * <p/>
- * There may be multiple instances of this box if there is more than one sample grouping for the samples in a
- * track. Each instance of the SampleToGroup box has a type code that distinguishes different sample
- * groupings. Within a track, there shall be at most one instance of this box with a particular grouping type. The
- * associated SampleGroupDescription shall indicate the same value for the grouping type.
- * <p/>
- * Version 1 of this box should only be used if a grouping type parameter is needed.
- */
-public class SampleToGroupBox extends AbstractFullBox {
-    public static final String TYPE = "sbgp";
-
-
-    private String groupingType;
-    private String groupingTypeParameter;
-
-    List<Entry> entries = new LinkedList<Entry>();
-
-    public SampleToGroupBox() {
-        super(TYPE);
-
-    }
-
-    @Override
-    protected long getContentSize() {
-        return this.getVersion() == 1 ? entries.size() * 8 + 16 : entries.size() * 8 + 12;
-    }
-
-    @Override
-    protected void getContent(ByteBuffer byteBuffer) {
-        writeVersionAndFlags(byteBuffer);
-        byteBuffer.put(groupingType.getBytes());
-        if (this.getVersion() == 1) {
-            byteBuffer.put(groupingTypeParameter.getBytes());
-        }
-        IsoTypeWriter.writeUInt32(byteBuffer, entries.size());
-        for (Entry entry : entries) {
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getSampleCount());
-            IsoTypeWriter.writeUInt32(byteBuffer, entry.getGroupDescriptionIndex());
-        }
-
-    }
-
-    @Override
-    protected void _parseDetails(ByteBuffer content) {
-        parseVersionAndFlags(content);
-        groupingType = IsoTypeReader.read4cc(content);
-        if (this.getVersion() == 1) {
-            groupingTypeParameter = IsoTypeReader.read4cc(content);
-        }
-        long entryCount = IsoTypeReader.readUInt32(content);
-        while (entryCount-- > 0) {
-            entries.add(new Entry(l2i(IsoTypeReader.readUInt32(content)), l2i(IsoTypeReader.readUInt32(content))));
-        }
-    }
-
-    public static class Entry {
-        private long sampleCount;
-        private int groupDescriptionIndex;
-
-        public Entry(long sampleCount, int groupDescriptionIndex) {
-            this.sampleCount = sampleCount;
-            this.groupDescriptionIndex = groupDescriptionIndex;
-        }
-
-        public long getSampleCount() {
-            return sampleCount;
-        }
-
-        public void setSampleCount(long sampleCount) {
-            this.sampleCount = sampleCount;
-        }
-
-        public int getGroupDescriptionIndex() {
-            return groupDescriptionIndex;
-        }
-
-        public void setGroupDescriptionIndex(int groupDescriptionIndex) {
-            this.groupDescriptionIndex = groupDescriptionIndex;
-        }
-
-        @Override
-        public String toString() {
-            return "Entry{" +
-                    "sampleCount=" + sampleCount +
-                    ", groupDescriptionIndex=" + groupDescriptionIndex +
-                    '}';
-        }
-
-        @Override
-        public boolean equals(Object o) {
-            if (this == o) {
-                return true;
-            }
-            if (o == null || getClass() != o.getClass()) {
-                return false;
-            }
-
-            Entry entry = (Entry) o;
-
-            if (groupDescriptionIndex != entry.groupDescriptionIndex) {
-                return false;
-            }
-            if (sampleCount != entry.sampleCount) {
-                return false;
-            }
-
-            return true;
-        }
-
-        @Override
-        public int hashCode() {
-            int result = (int) (sampleCount ^ (sampleCount >>> 32));
-            result = 31 * result + groupDescriptionIndex;
-            return result;
-        }
-    }
-
-    public String getGroupingType() {
-        return groupingType;
-    }
-
-    public void setGroupingType(String groupingType) {
-        this.groupingType = groupingType;
-    }
-
-    public String getGroupingTypeParameter() {
-        return groupingTypeParameter;
-    }
-
-    public void setGroupingTypeParameter(String groupingTypeParameter) {
-        this.groupingTypeParameter = groupingTypeParameter;
-    }
-
-    public List<Entry> getEntries() {
-        return entries;
-    }
-
-    public void setEntries(List<Entry> entries) {
-        this.entries = entries;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/TemporalLevelEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/TemporalLevelEntry.java
deleted file mode 100755
index 798fd9c..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/TemporalLevelEntry.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import java.nio.ByteBuffer;
-
-/**
- * The Temporal Level sample grouping ('tele') provides a codec-independent sample grouping that can be used to group samples (access units) in a track (and potential track fragments) according to temporal level, where samples of one temporal level have no coding dependencies on samples of higher temporal levels. The temporal level equals the sample group description index (taking values 1, 2, 3, etc). The bitstream containing only the access units from the first temporal level to a higher temporal level remains conforming to the coding standard.
- *
- * A grouping according to temporal level facilitates easy extraction of temporal subsequences, for instance using the Subsegment Indexing box in 0.
- *
- */
-public class TemporalLevelEntry extends GroupEntry {
-    public static final String TYPE = "tele";
-    private boolean levelIndependentlyDecodable;
-    private short reserved;
-
-    public boolean isLevelIndependentlyDecodable() {
-        return levelIndependentlyDecodable;
-    }
-
-    public void setLevelIndependentlyDecodable(boolean levelIndependentlyDecodable) {
-        this.levelIndependentlyDecodable = levelIndependentlyDecodable;
-    }
-
-    @Override
-    public void parse(ByteBuffer byteBuffer) {
-        final byte b = byteBuffer.get();
-        levelIndependentlyDecodable = ((b & 0x80) == 0x80);
-    }
-
-    @Override
-    public ByteBuffer get() {
-        ByteBuffer content = ByteBuffer.allocate(1);
-        content.put((byte) (levelIndependentlyDecodable ? 0x80 : 0x00));
-        content.rewind();
-        return content;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
-
-        TemporalLevelEntry that = (TemporalLevelEntry) o;
-
-        if (levelIndependentlyDecodable != that.levelIndependentlyDecodable) return false;
-        if (reserved != that.reserved) return false;
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = (levelIndependentlyDecodable ? 1 : 0);
-        result = 31 * result + (int) reserved;
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("TemporalLevelEntry");
-        sb.append("{levelIndependentlyDecodable=").append(levelIndependentlyDecodable);
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/UnknownEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/UnknownEntry.java
deleted file mode 100755
index 9efcbea..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/UnknownEntry.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import com.coremedia.iso.Hex;
-
-import java.nio.ByteBuffer;
-
-/**
- *
- */
-public class UnknownEntry extends GroupEntry {
-    private ByteBuffer content;
-
-    public UnknownEntry() {
-    }
-
-    public ByteBuffer getContent() {
-        return content;
-    }
-
-    public void setContent(ByteBuffer content) {
-        this.content = (ByteBuffer) content.duplicate().rewind();
-    }
-
-    @Override
-    public void parse(ByteBuffer byteBuffer) {
-        this.content = (ByteBuffer) byteBuffer.duplicate().rewind();
-    }
-
-    @Override
-    public ByteBuffer get() {
-        return content.duplicate();
-    }
-
-    @Override
-    public String toString() {
-        ByteBuffer bb = content.duplicate();
-        bb.rewind();
-        byte[] b = new byte[bb.limit()];
-        bb.get(b);
-        return "UnknownEntry{" +
-                "content=" + Hex.encodeHex(b) +
-                '}';
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        UnknownEntry that = (UnknownEntry) o;
-
-        if (content != null ? !content.equals(that.content) : that.content != null) {
-            return false;
-        }
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        return content != null ? content.hashCode() : 0;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/VisualRandomAccessEntry.java b/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/VisualRandomAccessEntry.java
deleted file mode 100755
index ed5d199..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/boxes/mp4/samplegrouping/VisualRandomAccessEntry.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright 2012 castLabs, Berlin
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.googlecode.mp4parser.boxes.mp4.samplegrouping;
-
-import com.coremedia.iso.IsoTypeReader;
-
-import java.nio.ByteBuffer;
-
-/**
- * For some coding systems a sync sample is specified to be a random access point after which all samples in decoding order can be correctly decoded. However, it may be possible to encode an “open” random access point, after which all samples in output order can be correctly decoded, but some samples following the random access point in decoding order and preceding the random access point in output order need not be correctly decodable. For example, an intra picture starting an open group of pictures can be followed in decoding order by (bi-)predicted pictures that however precede the intra picture in output order; though they possibly cannot be correctly decoded if the decoding starts from the intra picture, they are not needed.
- *
- * Such “open” random-access samples can be marked by being a member of this group. Samples marked by this group must be random access points, and may also be sync points (i.e. it is not required that samples marked by the sync sample table be excluded).
- *
- */
-public class VisualRandomAccessEntry extends GroupEntry {
-    public static final String TYPE = "rap ";
-    private boolean numLeadingSamplesKnown;
-    private short numLeadingSamples;
-
-    public boolean isNumLeadingSamplesKnown() {
-        return numLeadingSamplesKnown;
-    }
-
-    public void setNumLeadingSamplesKnown(boolean numLeadingSamplesKnown) {
-        this.numLeadingSamplesKnown = numLeadingSamplesKnown;
-    }
-
-    public short getNumLeadingSamples() {
-        return numLeadingSamples;
-    }
-
-    public void setNumLeadingSamples(short numLeadingSamples) {
-        this.numLeadingSamples = numLeadingSamples;
-    }
-
-    @Override
-    public void parse(ByteBuffer byteBuffer) {
-        final byte b = byteBuffer.get();
-        numLeadingSamplesKnown = ((b & 0x80) == 0x80);
-        numLeadingSamples = (short) (b & 0x7f);
-    }
-
-    @Override
-    public ByteBuffer get() {
-        ByteBuffer content = ByteBuffer.allocate(1);
-        content.put((byte) ((numLeadingSamplesKnown? 0x80 : 0x00)| (numLeadingSamples & 0x7f)));
-        content.rewind();
-        return content;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
-
-        VisualRandomAccessEntry that = (VisualRandomAccessEntry) o;
-
-        if (numLeadingSamples != that.numLeadingSamples) return false;
-        if (numLeadingSamplesKnown != that.numLeadingSamplesKnown) return false;
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = (numLeadingSamplesKnown ? 1 : 0);
-        result = 31 * result + (int) numLeadingSamples;
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        final StringBuilder sb = new StringBuilder();
-        sb.append("VisualRandomAccessEntry");
-        sb.append("{numLeadingSamplesKnown=").append(numLeadingSamplesKnown);
-        sb.append(", numLeadingSamples=").append(numLeadingSamples);
-        sb.append('}');
-        return sb.toString();
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/BTree.java b/android/src/main/java/com/googlecode/mp4parser/h264/BTree.java
deleted file mode 100755
index 57391ba..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/BTree.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264;
-
-
-/**
- * Simple BTree implementation needed for haffman tables
- *
- * @author Stanislav Vitvitskiy
- */
-public class BTree {
-    private BTree zero;
-    private BTree one;
-    private Object value;
-
-    /**
-     * Adds a leaf value to a binary path specified by path
-     *
-     * @param str
-     * @param value
-     */
-    public void addString(String path, Object value) {
-        if (path.length() == 0) {
-            this.value = value;
-            return;
-        }
-        char charAt = path.charAt(0);
-        BTree branch;
-        if (charAt == '0') {
-            if (zero == null)
-                zero = new BTree();
-            branch = zero;
-        } else {
-            if (one == null)
-                one = new BTree();
-            branch = one;
-        }
-        branch.addString(path.substring(1), value);
-    }
-
-    public BTree down(int b) {
-        if (b == 0)
-            return zero;
-        else
-            return one;
-    }
-
-    public Object getValue() {
-        return value;
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/CharCache.java b/android/src/main/java/com/googlecode/mp4parser/h264/CharCache.java
deleted file mode 100755
index 2fe8ead..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/CharCache.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264;
-
-public class CharCache {
-    private char[] cache;
-    private int pos;
-
-    public CharCache(int capacity) {
-        cache = new char[capacity];
-    }
-
-    public void append(String str) {
-        char[] chars = str.toCharArray();
-        int available = cache.length - pos;
-        int toWrite = chars.length < available ? chars.length : available;
-        System.arraycopy(chars, 0, cache, pos, toWrite);
-        pos += toWrite;
-    }
-
-    public String toString() {
-        return new String(cache, 0, pos);
-    }
-
-    public void clear() {
-        pos = 0;
-    }
-
-    public void append(char c) {
-        if (pos < cache.length - 1) {
-            cache[pos] = c;
-            pos++;
-        }
-    }
-
-    public int length() {
-        return pos;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/Debug.java b/android/src/main/java/com/googlecode/mp4parser/h264/Debug.java
deleted file mode 100755
index d0bea73..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/Debug.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264;
-
-import java.nio.ShortBuffer;
-
-public class Debug {
-    public final static void print8x8(int[] output) {
-        int i = 0;
-        for (int x = 0; x < 8; x++) {
-            for (int y = 0; y < 8; y++) {
-                System.out.printf("%3d, ", output[i]);
-                i++;
-            }
-            System.out.println();
-        }
-    }
-
-    public final static void print8x8(short[] output) {
-        int i = 0;
-        for (int x = 0; x < 8; x++) {
-            for (int y = 0; y < 8; y++) {
-                System.out.printf("%3d, ", output[i]);
-                i++;
-            }
-            System.out.println();
-        }
-    }
-
-    public final static void print8x8(ShortBuffer output) {
-        for (int x = 0; x < 8; x++) {
-            for (int y = 0; y < 8; y++) {
-                System.out.printf("%3d, ", output.get());
-            }
-            System.out.println();
-        }
-    }
-
-    public static void print(short[] table) {
-        int i = 0;
-        for (int x = 0; x < 8; x++) {
-            for (int y = 0; y < 8; y++) {
-                System.out.printf("%3d, ", table[i]);
-                i++;
-            }
-            System.out.println();
-        }
-    }
-
-    public static void trace(String format, Object... args) {
-        // System.out.printf("> " + format + "\n", args);
-    }
-
-    public final static boolean debug = false;
-
-    public static void print(int i) {
-        if (debug)
-            System.out.print(i);
-    }
-
-    public static void print(String string) {
-        if (debug)
-            System.out.print(string);
-    }
-
-    public static void println(String string) {
-        if (debug)
-            System.out.println(string);
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/AspectRatio.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/AspectRatio.java
deleted file mode 100755
index bc66b1a..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/AspectRatio.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-/**
- * Aspect ratio
- * <p/>
- * dynamic enum
- *
- * @author Stanislav Vitvitskiy
- */
-public class AspectRatio {
-
-    public static final AspectRatio Extended_SAR = new AspectRatio(255);
-
-    private int value;
-
-    private AspectRatio(int value) {
-        this.value = value;
-    }
-
-    public static AspectRatio fromValue(int value) {
-        if (value == Extended_SAR.value) {
-            return Extended_SAR;
-        }
-        return new AspectRatio(value);
-    }
-
-    public int getValue() {
-        return value;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/BitstreamElement.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/BitstreamElement.java
deleted file mode 100755
index f16c5e9..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/BitstreamElement.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-public abstract class BitstreamElement {
-
-    public abstract void write(OutputStream out) throws IOException;
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/ChromaFormat.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/ChromaFormat.java
deleted file mode 100755
index 2af2966..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/ChromaFormat.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-/**
- * Chroma format enum
- *
- * @author Stanislav Vitvitskiy
- */
-public class ChromaFormat {
-    public static ChromaFormat MONOCHROME = new ChromaFormat(0, 0, 0);
-    public static ChromaFormat YUV_420 = new ChromaFormat(1, 2, 2);
-    public static ChromaFormat YUV_422 = new ChromaFormat(2, 2, 1);
-    public static ChromaFormat YUV_444 = new ChromaFormat(3, 1, 1);
-
-    private int id;
-    private int subWidth;
-    private int subHeight;
-
-    public ChromaFormat(int id, int subWidth, int subHeight) {
-        this.id = id;
-        this.subWidth = subWidth;
-        this.subHeight = subHeight;
-    }
-
-    public static ChromaFormat fromId(int id) {
-        if (id == MONOCHROME.id) {
-            return MONOCHROME;
-        } else if (id == YUV_420.id) {
-            return YUV_420;
-        } else if (id == YUV_422.id) {
-            return YUV_422;
-        } else if (id == YUV_444.id) {
-            return YUV_444;
-        }
-        return null;
-    }
-
-    public int getId() {
-        return id;
-    }
-
-    public int getSubWidth() {
-        return subWidth;
-    }
-
-    public int getSubHeight() {
-        return subHeight;
-    }
-
-    @Override
-    public String toString() {
-        return "ChromaFormat{" + "\n" +
-                "id=" + id + ",\n" +
-                " subWidth=" + subWidth + ",\n" +
-                " subHeight=" + subHeight +
-                '}';
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/HRDParameters.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/HRDParameters.java
deleted file mode 100755
index f713ab2..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/HRDParameters.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-import java.util.Arrays;
-
-public class HRDParameters {
-
-    public int cpb_cnt_minus1;
-    public int bit_rate_scale;
-    public int cpb_size_scale;
-    public int[] bit_rate_value_minus1;
-    public int[] cpb_size_value_minus1;
-    public boolean[] cbr_flag;
-    public int initial_cpb_removal_delay_length_minus1;
-    public int cpb_removal_delay_length_minus1;
-    public int dpb_output_delay_length_minus1;
-    public int time_offset_length;
-
-    @Override
-    public String toString() {
-        return "HRDParameters{" +
-                "cpb_cnt_minus1=" + cpb_cnt_minus1 +
-                ", bit_rate_scale=" + bit_rate_scale +
-                ", cpb_size_scale=" + cpb_size_scale +
-                ", bit_rate_value_minus1=" + Arrays.toString(bit_rate_value_minus1) +
-                ", cpb_size_value_minus1=" + Arrays.toString(cpb_size_value_minus1) +
-                ", cbr_flag=" + Arrays.toString(cbr_flag) +
-                ", initial_cpb_removal_delay_length_minus1=" + initial_cpb_removal_delay_length_minus1 +
-                ", cpb_removal_delay_length_minus1=" + cpb_removal_delay_length_minus1 +
-                ", dpb_output_delay_length_minus1=" + dpb_output_delay_length_minus1 +
-                ", time_offset_length=" + time_offset_length +
-                '}';
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/PictureParameterSet.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/PictureParameterSet.java
deleted file mode 100755
index 9154c38..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/PictureParameterSet.java
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-import com.googlecode.mp4parser.h264.read.CAVLCReader;
-import com.googlecode.mp4parser.h264.write.CAVLCWriter;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-
-/**
- * Picture Parameter Set entity of H264 bitstream
- * <p/>
- * capable to serialize / deserialize with CAVLC bitstream
- *
- * @author Stanislav Vitvitskiy
- */
-public class PictureParameterSet extends BitstreamElement {
-
-    public static class PPSExt {
-        public boolean transform_8x8_mode_flag;
-        public ScalingMatrix scalindMatrix = new ScalingMatrix();
-        public int second_chroma_qp_index_offset;
-        public boolean[] pic_scaling_list_present_flag;
-
-        @Override
-        public String toString() {
-            return "PPSExt{" +
-                    "transform_8x8_mode_flag=" + transform_8x8_mode_flag +
-                    ", scalindMatrix=" + scalindMatrix +
-                    ", second_chroma_qp_index_offset=" + second_chroma_qp_index_offset +
-                    ", pic_scaling_list_present_flag=" + pic_scaling_list_present_flag +
-                    '}';
-        }
-    }
-
-    public boolean entropy_coding_mode_flag;
-    public int num_ref_idx_l0_active_minus1;
-    public int num_ref_idx_l1_active_minus1;
-    public int slice_group_change_rate_minus1;
-    public int pic_parameter_set_id;
-    public int seq_parameter_set_id;
-    public boolean pic_order_present_flag;
-    public int num_slice_groups_minus1;
-    public int slice_group_map_type;
-    public boolean weighted_pred_flag;
-    public int weighted_bipred_idc;
-    public int pic_init_qp_minus26;
-    public int pic_init_qs_minus26;
-    public int chroma_qp_index_offset;
-    public boolean deblocking_filter_control_present_flag;
-    public boolean constrained_intra_pred_flag;
-    public boolean redundant_pic_cnt_present_flag;
-    public int[] top_left;
-    public int[] bottom_right;
-    public int[] run_length_minus1;
-    public boolean slice_group_change_direction_flag;
-    public int[] slice_group_id;
-    public PPSExt extended;
-
-    public static PictureParameterSet read(byte[] b) throws IOException {
-        return read(new ByteArrayInputStream(b));
-    }
-
-    public static PictureParameterSet read(InputStream is) throws IOException {
-        CAVLCReader reader = new CAVLCReader(is);
-        PictureParameterSet pps = new PictureParameterSet();
-
-        pps.pic_parameter_set_id = reader.readUE("PPS: pic_parameter_set_id");
-        pps.seq_parameter_set_id = reader.readUE("PPS: seq_parameter_set_id");
-        pps.entropy_coding_mode_flag = reader
-                .readBool("PPS: entropy_coding_mode_flag");
-        pps.pic_order_present_flag = reader
-                .readBool("PPS: pic_order_present_flag");
-        pps.num_slice_groups_minus1 = reader
-                .readUE("PPS: num_slice_groups_minus1");
-        if (pps.num_slice_groups_minus1 > 0) {
-            pps.slice_group_map_type = reader
-                    .readUE("PPS: slice_group_map_type");
-            pps.top_left = new int[pps.num_slice_groups_minus1 + 1];
-            pps.bottom_right = new int[pps.num_slice_groups_minus1 + 1];
-            pps.run_length_minus1 = new int[pps.num_slice_groups_minus1 + 1];
-            if (pps.slice_group_map_type == 0)
-                for (int iGroup = 0; iGroup <= pps.num_slice_groups_minus1; iGroup++)
-                    pps.run_length_minus1[iGroup] = reader
-                            .readUE("PPS: run_length_minus1");
-            else if (pps.slice_group_map_type == 2)
-                for (int iGroup = 0; iGroup < pps.num_slice_groups_minus1; iGroup++) {
-                    pps.top_left[iGroup] = reader.readUE("PPS: top_left");
-                    pps.bottom_right[iGroup] = reader
-                            .readUE("PPS: bottom_right");
-                }
-            else if (pps.slice_group_map_type == 3
-                    || pps.slice_group_map_type == 4
-                    || pps.slice_group_map_type == 5) {
-                pps.slice_group_change_direction_flag = reader
-                        .readBool("PPS: slice_group_change_direction_flag");
-                pps.slice_group_change_rate_minus1 = reader
-                        .readUE("PPS: slice_group_change_rate_minus1");
-            } else if (pps.slice_group_map_type == 6) {
-                int NumberBitsPerSliceGroupId;
-                if (pps.num_slice_groups_minus1 + 1 > 4)
-                    NumberBitsPerSliceGroupId = 3;
-                else if (pps.num_slice_groups_minus1 + 1 > 2)
-                    NumberBitsPerSliceGroupId = 2;
-                else
-                    NumberBitsPerSliceGroupId = 1;
-                int pic_size_in_map_units_minus1 = reader
-                        .readUE("PPS: pic_size_in_map_units_minus1");
-                pps.slice_group_id = new int[pic_size_in_map_units_minus1 + 1];
-                for (int i = 0; i <= pic_size_in_map_units_minus1; i++) {
-                    pps.slice_group_id[i] = reader.readU(
-                            NumberBitsPerSliceGroupId, "PPS: slice_group_id ["
-                            + i + "]f");
-                }
-            }
-        }
-        pps.num_ref_idx_l0_active_minus1 = reader
-                .readUE("PPS: num_ref_idx_l0_active_minus1");
-        pps.num_ref_idx_l1_active_minus1 = reader
-                .readUE("PPS: num_ref_idx_l1_active_minus1");
-        pps.weighted_pred_flag = reader.readBool("PPS: weighted_pred_flag");
-        pps.weighted_bipred_idc = (int) reader.readNBit(2,
-                "PPS: weighted_bipred_idc");
-        pps.pic_init_qp_minus26 = reader.readSE("PPS: pic_init_qp_minus26");
-        pps.pic_init_qs_minus26 = reader.readSE("PPS: pic_init_qs_minus26");
-        pps.chroma_qp_index_offset = reader
-                .readSE("PPS: chroma_qp_index_offset");
-        pps.deblocking_filter_control_present_flag = reader
-                .readBool("PPS: deblocking_filter_control_present_flag");
-        pps.constrained_intra_pred_flag = reader
-                .readBool("PPS: constrained_intra_pred_flag");
-        pps.redundant_pic_cnt_present_flag = reader
-                .readBool("PPS: redundant_pic_cnt_present_flag");
-        if (reader.moreRBSPData()) {
-            pps.extended = new PictureParameterSet.PPSExt();
-            pps.extended.transform_8x8_mode_flag = reader
-                    .readBool("PPS: transform_8x8_mode_flag");
-            boolean pic_scaling_matrix_present_flag = reader
-                    .readBool("PPS: pic_scaling_matrix_present_flag");
-            if (pic_scaling_matrix_present_flag) {
-                for (int i = 0; i < 6 + 2 * (pps.extended.transform_8x8_mode_flag ? 1
-                        : 0); i++) {
-                    boolean pic_scaling_list_present_flag = reader
-                            .readBool("PPS: pic_scaling_list_present_flag");
-                    if (pic_scaling_list_present_flag) {
-                        pps.extended.scalindMatrix.ScalingList4x4 = new ScalingList[8];
-                        pps.extended.scalindMatrix.ScalingList8x8 = new ScalingList[8];
-                        if (i < 6) {
-                            pps.extended.scalindMatrix.ScalingList4x4[i] = ScalingList
-                                    .read(reader, 16);
-                        } else {
-                            pps.extended.scalindMatrix.ScalingList8x8[i - 6] = ScalingList
-                                    .read(reader, 64);
-                        }
-                    }
-                }
-            }
-            pps.extended.second_chroma_qp_index_offset = reader
-                    .readSE("PPS: second_chroma_qp_index_offset");
-        }
-
-        reader.readTrailingBits();
-
-        return pps;
-    }
-
-    public void write(OutputStream out) throws IOException {
-        CAVLCWriter writer = new CAVLCWriter(out);
-
-        writer.writeUE(pic_parameter_set_id, "PPS: pic_parameter_set_id");
-        writer.writeUE(seq_parameter_set_id, "PPS: seq_parameter_set_id");
-        writer.writeBool(entropy_coding_mode_flag,
-                "PPS: entropy_coding_mode_flag");
-        writer.writeBool(pic_order_present_flag, "PPS: pic_order_present_flag");
-        writer.writeUE(num_slice_groups_minus1, "PPS: num_slice_groups_minus1");
-        if (num_slice_groups_minus1 > 0) {
-            writer.writeUE(slice_group_map_type, "PPS: slice_group_map_type");
-            int[] top_left = new int[1];
-            int[] bottom_right = new int[1];
-            int[] run_length_minus1 = new int[1];
-            if (slice_group_map_type == 0) {
-                for (int iGroup = 0; iGroup <= num_slice_groups_minus1; iGroup++) {
-                    writer.writeUE(run_length_minus1[iGroup], "PPS: ");
-                }
-            } else if (slice_group_map_type == 2) {
-                for (int iGroup = 0; iGroup < num_slice_groups_minus1; iGroup++) {
-                    writer.writeUE(top_left[iGroup], "PPS: ");
-                    writer.writeUE(bottom_right[iGroup], "PPS: ");
-                }
-            } else if (slice_group_map_type == 3 || slice_group_map_type == 4
-                    || slice_group_map_type == 5) {
-                writer.writeBool(slice_group_change_direction_flag,
-                        "PPS: slice_group_change_direction_flag");
-                writer.writeUE(slice_group_change_rate_minus1,
-                        "PPS: slice_group_change_rate_minus1");
-            } else if (slice_group_map_type == 6) {
-                int NumberBitsPerSliceGroupId;
-                if (num_slice_groups_minus1 + 1 > 4)
-                    NumberBitsPerSliceGroupId = 3;
-                else if (num_slice_groups_minus1 + 1 > 2)
-                    NumberBitsPerSliceGroupId = 2;
-                else
-                    NumberBitsPerSliceGroupId = 1;
-                writer.writeUE(slice_group_id.length, "PPS: ");
-                for (int i = 0; i <= slice_group_id.length; i++) {
-                    writer.writeU(slice_group_id[i], NumberBitsPerSliceGroupId);
-                }
-            }
-        }
-        writer.writeUE(num_ref_idx_l0_active_minus1,
-                "PPS: num_ref_idx_l0_active_minus1");
-        writer.writeUE(num_ref_idx_l1_active_minus1,
-                "PPS: num_ref_idx_l1_active_minus1");
-        writer.writeBool(weighted_pred_flag, "PPS: weighted_pred_flag");
-        writer.writeNBit(weighted_bipred_idc, 2, "PPS: weighted_bipred_idc");
-        writer.writeSE(pic_init_qp_minus26, "PPS: pic_init_qp_minus26");
-        writer.writeSE(pic_init_qs_minus26, "PPS: pic_init_qs_minus26");
-        writer.writeSE(chroma_qp_index_offset, "PPS: chroma_qp_index_offset");
-        writer.writeBool(deblocking_filter_control_present_flag,
-                "PPS: deblocking_filter_control_present_flag");
-        writer.writeBool(constrained_intra_pred_flag,
-                "PPS: constrained_intra_pred_flag");
-        writer.writeBool(redundant_pic_cnt_present_flag,
-                "PPS: redundant_pic_cnt_present_flag");
-        if (extended != null) {
-            writer.writeBool(extended.transform_8x8_mode_flag,
-                    "PPS: transform_8x8_mode_flag");
-            writer.writeBool(extended.scalindMatrix != null,
-                    "PPS: scalindMatrix");
-            if (extended.scalindMatrix != null) {
-                for (int i = 0; i < 6 + 2 * (extended.transform_8x8_mode_flag ? 1
-                        : 0); i++) {
-                    if (i < 6) {
-                        writer
-                                .writeBool(
-                                        extended.scalindMatrix.ScalingList4x4[i] != null,
-                                        "PPS: ");
-                        if (extended.scalindMatrix.ScalingList4x4[i] != null) {
-                            extended.scalindMatrix.ScalingList4x4[i]
-                                    .write(writer);
-                        }
-
-                    } else {
-                        writer
-                                .writeBool(
-                                        extended.scalindMatrix.ScalingList8x8[i - 6] != null,
-                                        "PPS: ");
-                        if (extended.scalindMatrix.ScalingList8x8[i - 6] != null) {
-                            extended.scalindMatrix.ScalingList8x8[i - 6]
-                                    .write(writer);
-                        }
-                    }
-                }
-            }
-            writer.writeSE(extended.second_chroma_qp_index_offset, "PPS: ");
-        }
-
-        writer.writeTrailingBits();
-    }
-
-    @Override
-    public int hashCode() {
-        final int prime = 31;
-        int result = 1;
-        result = prime * result + Arrays.hashCode(bottom_right);
-        result = prime * result + chroma_qp_index_offset;
-        result = prime * result + (constrained_intra_pred_flag ? 1231 : 1237);
-        result = prime * result
-                + (deblocking_filter_control_present_flag ? 1231 : 1237);
-        result = prime * result + (entropy_coding_mode_flag ? 1231 : 1237);
-        result = prime * result
-                + ((extended == null) ? 0 : extended.hashCode());
-        result = prime * result + num_ref_idx_l0_active_minus1;
-        result = prime * result + num_ref_idx_l1_active_minus1;
-        result = prime * result + num_slice_groups_minus1;
-        result = prime * result + pic_init_qp_minus26;
-        result = prime * result + pic_init_qs_minus26;
-        result = prime * result + (pic_order_present_flag ? 1231 : 1237);
-        result = prime * result + pic_parameter_set_id;
-        result = prime * result
-                + (redundant_pic_cnt_present_flag ? 1231 : 1237);
-        result = prime * result + Arrays.hashCode(run_length_minus1);
-        result = prime * result + seq_parameter_set_id;
-        result = prime * result
-                + (slice_group_change_direction_flag ? 1231 : 1237);
-        result = prime * result + slice_group_change_rate_minus1;
-        result = prime * result + Arrays.hashCode(slice_group_id);
-        result = prime * result + slice_group_map_type;
-        result = prime * result + Arrays.hashCode(top_left);
-        result = prime * result + weighted_bipred_idc;
-        result = prime * result + (weighted_pred_flag ? 1231 : 1237);
-        return result;
-    }
-
-    @Override
-    public boolean equals(Object obj) {
-        if (this == obj)
-            return true;
-        if (obj == null)
-            return false;
-        if (getClass() != obj.getClass())
-            return false;
-        PictureParameterSet other = (PictureParameterSet) obj;
-        if (!Arrays.equals(bottom_right, other.bottom_right))
-            return false;
-        if (chroma_qp_index_offset != other.chroma_qp_index_offset)
-            return false;
-        if (constrained_intra_pred_flag != other.constrained_intra_pred_flag)
-            return false;
-        if (deblocking_filter_control_present_flag != other.deblocking_filter_control_present_flag)
-            return false;
-        if (entropy_coding_mode_flag != other.entropy_coding_mode_flag)
-            return false;
-        if (extended == null) {
-            if (other.extended != null)
-                return false;
-        } else if (!extended.equals(other.extended))
-            return false;
-        if (num_ref_idx_l0_active_minus1 != other.num_ref_idx_l0_active_minus1)
-            return false;
-        if (num_ref_idx_l1_active_minus1 != other.num_ref_idx_l1_active_minus1)
-            return false;
-        if (num_slice_groups_minus1 != other.num_slice_groups_minus1)
-            return false;
-        if (pic_init_qp_minus26 != other.pic_init_qp_minus26)
-            return false;
-        if (pic_init_qs_minus26 != other.pic_init_qs_minus26)
-            return false;
-        if (pic_order_present_flag != other.pic_order_present_flag)
-            return false;
-        if (pic_parameter_set_id != other.pic_parameter_set_id)
-            return false;
-        if (redundant_pic_cnt_present_flag != other.redundant_pic_cnt_present_flag)
-            return false;
-        if (!Arrays.equals(run_length_minus1, other.run_length_minus1))
-            return false;
-        if (seq_parameter_set_id != other.seq_parameter_set_id)
-            return false;
-        if (slice_group_change_direction_flag != other.slice_group_change_direction_flag)
-            return false;
-        if (slice_group_change_rate_minus1 != other.slice_group_change_rate_minus1)
-            return false;
-        if (!Arrays.equals(slice_group_id, other.slice_group_id))
-            return false;
-        if (slice_group_map_type != other.slice_group_map_type)
-            return false;
-        if (!Arrays.equals(top_left, other.top_left))
-            return false;
-        if (weighted_bipred_idc != other.weighted_bipred_idc)
-            return false;
-        if (weighted_pred_flag != other.weighted_pred_flag)
-            return false;
-        return true;
-    }
-
-    @Override
-    public String toString() {
-        return "PictureParameterSet{" +
-                "\n       entropy_coding_mode_flag=" + entropy_coding_mode_flag +
-                ",\n       num_ref_idx_l0_active_minus1=" + num_ref_idx_l0_active_minus1 +
-                ",\n       num_ref_idx_l1_active_minus1=" + num_ref_idx_l1_active_minus1 +
-                ",\n       slice_group_change_rate_minus1=" + slice_group_change_rate_minus1 +
-                ",\n       pic_parameter_set_id=" + pic_parameter_set_id +
-                ",\n       seq_parameter_set_id=" + seq_parameter_set_id +
-                ",\n       pic_order_present_flag=" + pic_order_present_flag +
-                ",\n       num_slice_groups_minus1=" + num_slice_groups_minus1 +
-                ",\n       slice_group_map_type=" + slice_group_map_type +
-                ",\n       weighted_pred_flag=" + weighted_pred_flag +
-                ",\n       weighted_bipred_idc=" + weighted_bipred_idc +
-                ",\n       pic_init_qp_minus26=" + pic_init_qp_minus26 +
-                ",\n       pic_init_qs_minus26=" + pic_init_qs_minus26 +
-                ",\n       chroma_qp_index_offset=" + chroma_qp_index_offset +
-                ",\n       deblocking_filter_control_present_flag=" + deblocking_filter_control_present_flag +
-                ",\n       constrained_intra_pred_flag=" + constrained_intra_pred_flag +
-                ",\n       redundant_pic_cnt_present_flag=" + redundant_pic_cnt_present_flag +
-                ",\n       top_left=" + top_left +
-                ",\n       bottom_right=" + bottom_right +
-                ",\n       run_length_minus1=" + run_length_minus1 +
-                ",\n       slice_group_change_direction_flag=" + slice_group_change_direction_flag +
-                ",\n       slice_group_id=" + slice_group_id +
-                ",\n       extended=" + extended +
-                '}';
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/ScalingList.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/ScalingList.java
deleted file mode 100755
index 5d272bf..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/ScalingList.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-import com.googlecode.mp4parser.h264.read.CAVLCReader;
-import com.googlecode.mp4parser.h264.write.CAVLCWriter;
-
-import java.io.IOException;
-
-/**
- * Scaling list entity
- * <p/>
- * capable to serialize / deserialize with CAVLC bitstream
- *
- * @author Stanislav Vitvitskiy
- */
-public class ScalingList {
-
-    public int[] scalingList;
-    public boolean useDefaultScalingMatrixFlag;
-
-    public void write(CAVLCWriter out) throws IOException {
-        if (useDefaultScalingMatrixFlag) {
-            out.writeSE(0, "SPS: ");
-            return;
-        }
-
-        int lastScale = 8;
-        int nextScale = 8;
-        for (int j = 0; j < scalingList.length; j++) {
-            if (nextScale != 0) {
-                int deltaScale = scalingList[j] - lastScale - 256;
-                out.writeSE(deltaScale, "SPS: ");
-            }
-            lastScale = scalingList[j];
-        }
-    }
-
-    public static ScalingList read(CAVLCReader is, int sizeOfScalingList)
-            throws IOException {
-
-        ScalingList sl = new ScalingList();
-        sl.scalingList = new int[sizeOfScalingList];
-        int lastScale = 8;
-        int nextScale = 8;
-        for (int j = 0; j < sizeOfScalingList; j++) {
-            if (nextScale != 0) {
-                int deltaScale = is.readSE("deltaScale");
-                nextScale = (lastScale + deltaScale + 256) % 256;
-                sl.useDefaultScalingMatrixFlag = (j == 0 && nextScale == 0);
-            }
-            sl.scalingList[j] = nextScale == 0 ? lastScale : nextScale;
-            lastScale = sl.scalingList[j];
-        }
-        return sl;
-    }
-
-    @Override
-    public String toString() {
-        return "ScalingList{" +
-                "scalingList=" + scalingList +
-                ", useDefaultScalingMatrixFlag=" + useDefaultScalingMatrixFlag +
-                '}';
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/ScalingMatrix.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/ScalingMatrix.java
deleted file mode 100755
index d04af8e..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/ScalingMatrix.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-import java.util.Arrays;
-
-public class ScalingMatrix {
-
-    public ScalingList[] ScalingList4x4;
-    public ScalingList[] ScalingList8x8;
-
-    @Override
-    public String toString() {
-        return "ScalingMatrix{" +
-                "ScalingList4x4=" + (ScalingList4x4 == null ? null : Arrays.asList(ScalingList4x4)) + "\n" +
-                ", ScalingList8x8=" + (ScalingList8x8 == null ? null : Arrays.asList(ScalingList8x8)) + "\n" +
-                '}';
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/SeqParameterSet.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/SeqParameterSet.java
deleted file mode 100755
index 4894df8..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/SeqParameterSet.java
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-import com.googlecode.mp4parser.h264.read.CAVLCReader;
-import com.googlecode.mp4parser.h264.write.CAVLCWriter;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * Sequence Parameter Set structure of h264 bitstream
- * <p/>
- * capable to serialize and deserialize with CAVLC bitstream
- *
- * @author Stanislav Vitvitskiy
- */
-public class SeqParameterSet extends BitstreamElement {
-    public int pic_order_cnt_type;
-    public boolean field_pic_flag;
-    public boolean delta_pic_order_always_zero_flag;
-    public boolean weighted_pred_flag;
-    public int weighted_bipred_idc;
-    public boolean entropy_coding_mode_flag;
-    public boolean mb_adaptive_frame_field_flag;
-    public boolean direct_8x8_inference_flag;
-    public ChromaFormat chroma_format_idc;
-    public int log2_max_frame_num_minus4;
-    public int log2_max_pic_order_cnt_lsb_minus4;
-    public int pic_height_in_map_units_minus1;
-    public int pic_width_in_mbs_minus1;
-    public int bit_depth_luma_minus8;
-    public int bit_depth_chroma_minus8;
-    public boolean qpprime_y_zero_transform_bypass_flag;
-    public int profile_idc;
-    public boolean constraint_set_0_flag;
-    public boolean constraint_set_1_flag;
-    public boolean constraint_set_2_flag;
-    public boolean constraint_set_3_flag;
-    public int level_idc;
-    public int seq_parameter_set_id;
-    public boolean residual_color_transform_flag;
-    public int offset_for_non_ref_pic;
-    public int offset_for_top_to_bottom_field;
-    public int num_ref_frames;
-    public boolean gaps_in_frame_num_value_allowed_flag;
-    public boolean frame_mbs_only_flag;
-    public boolean frame_cropping_flag;
-    public int frame_crop_left_offset;
-    public int frame_crop_right_offset;
-    public int frame_crop_top_offset;
-    public int frame_crop_bottom_offset;
-    public int[] offsetForRefFrame;
-    public VUIParameters vuiParams;
-    public ScalingMatrix scalingMatrix;
-    public int num_ref_frames_in_pic_order_cnt_cycle;
-
-    public static SeqParameterSet read(InputStream is) throws IOException {
-        CAVLCReader reader = new CAVLCReader(is);
-        SeqParameterSet sps = new SeqParameterSet();
-
-        sps.profile_idc = (int) reader.readNBit(8, "SPS: profile_idc");
-        sps.constraint_set_0_flag = reader
-                .readBool("SPS: constraint_set_0_flag");
-        sps.constraint_set_1_flag = reader
-                .readBool("SPS: constraint_set_1_flag");
-        sps.constraint_set_2_flag = reader
-                .readBool("SPS: constraint_set_2_flag");
-        sps.constraint_set_3_flag = reader
-                .readBool("SPS: constraint_set_3_flag");
-        reader.readNBit(4, "SPS: reserved_zero_4bits");
-        sps.level_idc = (int) reader.readNBit(8, "SPS: level_idc");
-        sps.seq_parameter_set_id = reader.readUE("SPS: seq_parameter_set_id");
-
-        if (sps.profile_idc == 100 || sps.profile_idc == 110
-                || sps.profile_idc == 122 || sps.profile_idc == 144) {
-            sps.chroma_format_idc = ChromaFormat.fromId(reader
-                    .readUE("SPS: chroma_format_idc"));
-            if (sps.chroma_format_idc == ChromaFormat.YUV_444) {
-                sps.residual_color_transform_flag = reader
-                        .readBool("SPS: residual_color_transform_flag");
-            }
-            sps.bit_depth_luma_minus8 = reader
-                    .readUE("SPS: bit_depth_luma_minus8");
-            sps.bit_depth_chroma_minus8 = reader
-                    .readUE("SPS: bit_depth_chroma_minus8");
-            sps.qpprime_y_zero_transform_bypass_flag = reader
-                    .readBool("SPS: qpprime_y_zero_transform_bypass_flag");
-            boolean seqScalingMatrixPresent = reader
-                    .readBool("SPS: seq_scaling_matrix_present_lag");
-            if (seqScalingMatrixPresent) {
-                readScalingListMatrix(reader, sps);
-            }
-        } else {
-            sps.chroma_format_idc = ChromaFormat.YUV_420;
-        }
-        sps.log2_max_frame_num_minus4 = reader
-                .readUE("SPS: log2_max_frame_num_minus4");
-        sps.pic_order_cnt_type = reader.readUE("SPS: pic_order_cnt_type");
-        if (sps.pic_order_cnt_type == 0) {
-            sps.log2_max_pic_order_cnt_lsb_minus4 = reader
-                    .readUE("SPS: log2_max_pic_order_cnt_lsb_minus4");
-        } else if (sps.pic_order_cnt_type == 1) {
-            sps.delta_pic_order_always_zero_flag = reader
-                    .readBool("SPS: delta_pic_order_always_zero_flag");
-            sps.offset_for_non_ref_pic = reader
-                    .readSE("SPS: offset_for_non_ref_pic");
-            sps.offset_for_top_to_bottom_field = reader
-                    .readSE("SPS: offset_for_top_to_bottom_field");
-            sps.num_ref_frames_in_pic_order_cnt_cycle = reader
-                    .readUE("SPS: num_ref_frames_in_pic_order_cnt_cycle");
-            sps.offsetForRefFrame = new int[sps.num_ref_frames_in_pic_order_cnt_cycle];
-            for (int i = 0; i < sps.num_ref_frames_in_pic_order_cnt_cycle; i++) {
-                sps.offsetForRefFrame[i] = reader
-                        .readSE("SPS: offsetForRefFrame [" + i + "]");
-            }
-        }
-        sps.num_ref_frames = reader.readUE("SPS: num_ref_frames");
-        sps.gaps_in_frame_num_value_allowed_flag = reader
-                .readBool("SPS: gaps_in_frame_num_value_allowed_flag");
-        sps.pic_width_in_mbs_minus1 = reader
-                .readUE("SPS: pic_width_in_mbs_minus1");
-        sps.pic_height_in_map_units_minus1 = reader
-                .readUE("SPS: pic_height_in_map_units_minus1");
-        sps.frame_mbs_only_flag = reader.readBool("SPS: frame_mbs_only_flag");
-        if (!sps.frame_mbs_only_flag) {
-            sps.mb_adaptive_frame_field_flag = reader
-                    .readBool("SPS: mb_adaptive_frame_field_flag");
-        }
-        sps.direct_8x8_inference_flag = reader
-                .readBool("SPS: direct_8x8_inference_flag");
-        sps.frame_cropping_flag = reader.readBool("SPS: frame_cropping_flag");
-        if (sps.frame_cropping_flag) {
-            sps.frame_crop_left_offset = reader
-                    .readUE("SPS: frame_crop_left_offset");
-            sps.frame_crop_right_offset = reader
-                    .readUE("SPS: frame_crop_right_offset");
-            sps.frame_crop_top_offset = reader
-                    .readUE("SPS: frame_crop_top_offset");
-            sps.frame_crop_bottom_offset = reader
-                    .readUE("SPS: frame_crop_bottom_offset");
-        }
-        boolean vui_parameters_present_flag = reader
-                .readBool("SPS: vui_parameters_present_flag");
-        if (vui_parameters_present_flag)
-            sps.vuiParams = ReadVUIParameters(reader);
-
-        reader.readTrailingBits();
-
-        return sps;
-    }
-
-    private static void readScalingListMatrix(CAVLCReader reader,
-                                              SeqParameterSet sps) throws IOException {
-        sps.scalingMatrix = new ScalingMatrix();
-        for (int i = 0; i < 8; i++) {
-            boolean seqScalingListPresentFlag = reader
-                    .readBool("SPS: seqScalingListPresentFlag");
-            if (seqScalingListPresentFlag) {
-                sps.scalingMatrix.ScalingList4x4 = new ScalingList[8];
-                sps.scalingMatrix.ScalingList8x8 = new ScalingList[8];
-                if (i < 6) {
-                    sps.scalingMatrix.ScalingList4x4[i] = ScalingList.read(
-                            reader, 16);
-                } else {
-                    sps.scalingMatrix.ScalingList8x8[i - 6] = ScalingList.read(
-                            reader, 64);
-                }
-            }
-        }
-    }
-
-    private static VUIParameters ReadVUIParameters(CAVLCReader reader)
-            throws IOException {
-        VUIParameters vuip = new VUIParameters();
-        vuip.aspect_ratio_info_present_flag = reader
-                .readBool("VUI: aspect_ratio_info_present_flag");
-        if (vuip.aspect_ratio_info_present_flag) {
-            vuip.aspect_ratio = AspectRatio.fromValue((int) reader.readNBit(8,
-                    "VUI: aspect_ratio"));
-            if (vuip.aspect_ratio == AspectRatio.Extended_SAR) {
-                vuip.sar_width = (int) reader.readNBit(16, "VUI: sar_width");
-                vuip.sar_height = (int) reader.readNBit(16, "VUI: sar_height");
-            }
-        }
-        vuip.overscan_info_present_flag = reader
-                .readBool("VUI: overscan_info_present_flag");
-        if (vuip.overscan_info_present_flag) {
-            vuip.overscan_appropriate_flag = reader
-                    .readBool("VUI: overscan_appropriate_flag");
-        }
-        vuip.video_signal_type_present_flag = reader
-                .readBool("VUI: video_signal_type_present_flag");
-        if (vuip.video_signal_type_present_flag) {
-            vuip.video_format = (int) reader.readNBit(3, "VUI: video_format");
-            vuip.video_full_range_flag = reader
-                    .readBool("VUI: video_full_range_flag");
-            vuip.colour_description_present_flag = reader
-                    .readBool("VUI: colour_description_present_flag");
-            if (vuip.colour_description_present_flag) {
-                vuip.colour_primaries = (int) reader.readNBit(8,
-                        "VUI: colour_primaries");
-                vuip.transfer_characteristics = (int) reader.readNBit(8,
-                        "VUI: transfer_characteristics");
-                vuip.matrix_coefficients = (int) reader.readNBit(8,
-                        "VUI: matrix_coefficients");
-            }
-        }
-        vuip.chroma_loc_info_present_flag = reader
-                .readBool("VUI: chroma_loc_info_present_flag");
-        if (vuip.chroma_loc_info_present_flag) {
-            vuip.chroma_sample_loc_type_top_field = reader
-                    .readUE("VUI chroma_sample_loc_type_top_field");
-            vuip.chroma_sample_loc_type_bottom_field = reader
-                    .readUE("VUI chroma_sample_loc_type_bottom_field");
-        }
-        vuip.timing_info_present_flag = reader
-                .readBool("VUI: timing_info_present_flag");
-        if (vuip.timing_info_present_flag) {
-            vuip.num_units_in_tick = (int) reader.readNBit(32,
-                    "VUI: num_units_in_tick");
-            vuip.time_scale = (int) reader.readNBit(32, "VUI: time_scale");
-            vuip.fixed_frame_rate_flag = reader
-                    .readBool("VUI: fixed_frame_rate_flag");
-        }
-        boolean nal_hrd_parameters_present_flag = reader
-                .readBool("VUI: nal_hrd_parameters_present_flag");
-        if (nal_hrd_parameters_present_flag)
-            vuip.nalHRDParams = readHRDParameters(reader);
-        boolean vcl_hrd_parameters_present_flag = reader
-                .readBool("VUI: vcl_hrd_parameters_present_flag");
-        if (vcl_hrd_parameters_present_flag)
-            vuip.vclHRDParams = readHRDParameters(reader);
-        if (nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag) {
-            vuip.low_delay_hrd_flag = reader
-                    .readBool("VUI: low_delay_hrd_flag");
-        }
-        vuip.pic_struct_present_flag = reader
-                .readBool("VUI: pic_struct_present_flag");
-        boolean bitstream_restriction_flag = reader
-                .readBool("VUI: bitstream_restriction_flag");
-        if (bitstream_restriction_flag) {
-            vuip.bitstreamRestriction = new VUIParameters.BitstreamRestriction();
-            vuip.bitstreamRestriction.motion_vectors_over_pic_boundaries_flag = reader
-                    .readBool("VUI: motion_vectors_over_pic_boundaries_flag");
-            vuip.bitstreamRestriction.max_bytes_per_pic_denom = reader
-                    .readUE("VUI max_bytes_per_pic_denom");
-            vuip.bitstreamRestriction.max_bits_per_mb_denom = reader
-                    .readUE("VUI max_bits_per_mb_denom");
-            vuip.bitstreamRestriction.log2_max_mv_length_horizontal = reader
-                    .readUE("VUI log2_max_mv_length_horizontal");
-            vuip.bitstreamRestriction.log2_max_mv_length_vertical = reader
-                    .readUE("VUI log2_max_mv_length_vertical");
-            vuip.bitstreamRestriction.num_reorder_frames = reader
-                    .readUE("VUI num_reorder_frames");
-            vuip.bitstreamRestriction.max_dec_frame_buffering = reader
-                    .readUE("VUI max_dec_frame_buffering");
-        }
-
-        return vuip;
-    }
-
-    private static HRDParameters readHRDParameters(CAVLCReader reader)
-            throws IOException {
-        HRDParameters hrd = new HRDParameters();
-        hrd.cpb_cnt_minus1 = reader.readUE("SPS: cpb_cnt_minus1");
-        hrd.bit_rate_scale = (int) reader.readNBit(4, "HRD: bit_rate_scale");
-        hrd.cpb_size_scale = (int) reader.readNBit(4, "HRD: cpb_size_scale");
-        hrd.bit_rate_value_minus1 = new int[hrd.cpb_cnt_minus1 + 1];
-        hrd.cpb_size_value_minus1 = new int[hrd.cpb_cnt_minus1 + 1];
-        hrd.cbr_flag = new boolean[hrd.cpb_cnt_minus1 + 1];
-
-        for (int SchedSelIdx = 0; SchedSelIdx <= hrd.cpb_cnt_minus1; SchedSelIdx++) {
-            hrd.bit_rate_value_minus1[SchedSelIdx] = reader
-                    .readUE("HRD: bit_rate_value_minus1");
-            hrd.cpb_size_value_minus1[SchedSelIdx] = reader
-                    .readUE("HRD: cpb_size_value_minus1");
-            hrd.cbr_flag[SchedSelIdx] = reader.readBool("HRD: cbr_flag");
-        }
-        hrd.initial_cpb_removal_delay_length_minus1 = (int) reader.readNBit(5,
-                "HRD: initial_cpb_removal_delay_length_minus1");
-        hrd.cpb_removal_delay_length_minus1 = (int) reader.readNBit(5,
-                "HRD: cpb_removal_delay_length_minus1");
-        hrd.dpb_output_delay_length_minus1 = (int) reader.readNBit(5,
-                "HRD: dpb_output_delay_length_minus1");
-        hrd.time_offset_length = (int) reader.readNBit(5,
-                "HRD: time_offset_length");
-        return hrd;
-    }
-
-    public void write(OutputStream out) throws IOException {
-        CAVLCWriter writer = new CAVLCWriter(out);
-
-        writer.writeNBit(profile_idc, 8, "SPS: profile_idc");
-        writer.writeBool(constraint_set_0_flag, "SPS: constraint_set_0_flag");
-        writer.writeBool(constraint_set_1_flag, "SPS: constraint_set_1_flag");
-        writer.writeBool(constraint_set_2_flag, "SPS: constraint_set_2_flag");
-        writer.writeBool(constraint_set_3_flag, "SPS: constraint_set_3_flag");
-        writer.writeNBit(0, 4, "SPS: reserved");
-        writer.writeNBit(level_idc, 8, "SPS: level_idc");
-        writer.writeUE(seq_parameter_set_id, "SPS: seq_parameter_set_id");
-
-        if (profile_idc == 100 || profile_idc == 110 || profile_idc == 122
-                || profile_idc == 144) {
-            writer.writeUE(chroma_format_idc.getId(), "SPS: chroma_format_idc");
-            if (chroma_format_idc == ChromaFormat.YUV_444) {
-                writer.writeBool(residual_color_transform_flag,
-                        "SPS: residual_color_transform_flag");
-            }
-            writer.writeUE(bit_depth_luma_minus8, "SPS: ");
-            writer.writeUE(bit_depth_chroma_minus8, "SPS: ");
-            writer.writeBool(qpprime_y_zero_transform_bypass_flag,
-                    "SPS: qpprime_y_zero_transform_bypass_flag");
-            writer.writeBool(scalingMatrix != null, "SPS: ");
-            if (scalingMatrix != null) {
-                for (int i = 0; i < 8; i++) {
-                    if (i < 6) {
-                        writer.writeBool(
-                                scalingMatrix.ScalingList4x4[i] != null,
-                                "SPS: ");
-                        if (scalingMatrix.ScalingList4x4[i] != null) {
-                            scalingMatrix.ScalingList4x4[i].write(writer);
-                        }
-                    } else {
-                        writer.writeBool(
-                                scalingMatrix.ScalingList8x8[i - 6] != null,
-                                "SPS: ");
-                        if (scalingMatrix.ScalingList8x8[i - 6] != null) {
-                            scalingMatrix.ScalingList8x8[i - 6].write(writer);
-                        }
-                    }
-                }
-            }
-        }
-        writer.writeUE(log2_max_frame_num_minus4,
-                "SPS: log2_max_frame_num_minus4");
-        writer.writeUE(pic_order_cnt_type, "SPS: pic_order_cnt_type");
-        if (pic_order_cnt_type == 0) {
-            writer.writeUE(log2_max_pic_order_cnt_lsb_minus4,
-                    "SPS: log2_max_pic_order_cnt_lsb_minus4");
-        } else if (pic_order_cnt_type == 1) {
-            writer.writeBool(delta_pic_order_always_zero_flag,
-                    "SPS: delta_pic_order_always_zero_flag");
-            writer.writeSE(offset_for_non_ref_pic,
-                    "SPS: offset_for_non_ref_pic");
-            writer.writeSE(offset_for_top_to_bottom_field,
-                    "SPS: offset_for_top_to_bottom_field");
-            writer.writeUE(offsetForRefFrame.length, "SPS: ");
-            for (int i = 0; i < offsetForRefFrame.length; i++)
-                writer.writeSE(offsetForRefFrame[i], "SPS: ");
-        }
-        writer.writeUE(num_ref_frames, "SPS: num_ref_frames");
-        writer.writeBool(gaps_in_frame_num_value_allowed_flag,
-                "SPS: gaps_in_frame_num_value_allowed_flag");
-        writer.writeUE(pic_width_in_mbs_minus1, "SPS: pic_width_in_mbs_minus1");
-        writer.writeUE(pic_height_in_map_units_minus1,
-                "SPS: pic_height_in_map_units_minus1");
-        writer.writeBool(frame_mbs_only_flag, "SPS: frame_mbs_only_flag");
-        if (!frame_mbs_only_flag) {
-            writer.writeBool(mb_adaptive_frame_field_flag,
-                    "SPS: mb_adaptive_frame_field_flag");
-        }
-        writer.writeBool(direct_8x8_inference_flag,
-                "SPS: direct_8x8_inference_flag");
-        writer.writeBool(frame_cropping_flag, "SPS: frame_cropping_flag");
-        if (frame_cropping_flag) {
-            writer.writeUE(frame_crop_left_offset,
-                    "SPS: frame_crop_left_offset");
-            writer.writeUE(frame_crop_right_offset,
-                    "SPS: frame_crop_right_offset");
-            writer.writeUE(frame_crop_top_offset, "SPS: frame_crop_top_offset");
-            writer.writeUE(frame_crop_bottom_offset,
-                    "SPS: frame_crop_bottom_offset");
-        }
-        writer.writeBool(vuiParams != null, "SPS: ");
-        if (vuiParams != null)
-            writeVUIParameters(vuiParams, writer);
-
-        writer.writeTrailingBits();
-    }
-
-    private void writeVUIParameters(VUIParameters vuip, CAVLCWriter writer)
-            throws IOException {
-        writer.writeBool(vuip.aspect_ratio_info_present_flag,
-                "VUI: aspect_ratio_info_present_flag");
-        if (vuip.aspect_ratio_info_present_flag) {
-            writer.writeNBit(vuip.aspect_ratio.getValue(), 8,
-                    "VUI: aspect_ratio");
-            if (vuip.aspect_ratio == AspectRatio.Extended_SAR) {
-                writer.writeNBit(vuip.sar_width, 16, "VUI: sar_width");
-                writer.writeNBit(vuip.sar_height, 16, "VUI: sar_height");
-            }
-        }
-        writer.writeBool(vuip.overscan_info_present_flag,
-                "VUI: overscan_info_present_flag");
-        if (vuip.overscan_info_present_flag) {
-            writer.writeBool(vuip.overscan_appropriate_flag,
-                    "VUI: overscan_appropriate_flag");
-        }
-        writer.writeBool(vuip.video_signal_type_present_flag,
-                "VUI: video_signal_type_present_flag");
-        if (vuip.video_signal_type_present_flag) {
-            writer.writeNBit(vuip.video_format, 3, "VUI: video_format");
-            writer.writeBool(vuip.video_full_range_flag,
-                    "VUI: video_full_range_flag");
-            writer.writeBool(vuip.colour_description_present_flag,
-                    "VUI: colour_description_present_flag");
-            if (vuip.colour_description_present_flag) {
-                writer.writeNBit(vuip.colour_primaries, 8,
-                        "VUI: colour_primaries");
-                writer.writeNBit(vuip.transfer_characteristics, 8,
-                        "VUI: transfer_characteristics");
-                writer.writeNBit(vuip.matrix_coefficients, 8,
-                        "VUI: matrix_coefficients");
-            }
-        }
-        writer.writeBool(vuip.chroma_loc_info_present_flag,
-                "VUI: chroma_loc_info_present_flag");
-        if (vuip.chroma_loc_info_present_flag) {
-            writer.writeUE(vuip.chroma_sample_loc_type_top_field,
-                    "VUI: chroma_sample_loc_type_top_field");
-            writer.writeUE(vuip.chroma_sample_loc_type_bottom_field,
-                    "VUI: chroma_sample_loc_type_bottom_field");
-        }
-        writer.writeBool(vuip.timing_info_present_flag,
-                "VUI: timing_info_present_flag");
-        if (vuip.timing_info_present_flag) {
-            writer.writeNBit(vuip.num_units_in_tick, 32,
-                    "VUI: num_units_in_tick");
-            writer.writeNBit(vuip.time_scale, 32, "VUI: time_scale");
-            writer.writeBool(vuip.fixed_frame_rate_flag,
-                    "VUI: fixed_frame_rate_flag");
-        }
-        writer.writeBool(vuip.nalHRDParams != null, "VUI: ");
-        if (vuip.nalHRDParams != null) {
-            writeHRDParameters(vuip.nalHRDParams, writer);
-        }
-        writer.writeBool(vuip.vclHRDParams != null, "VUI: ");
-        if (vuip.vclHRDParams != null) {
-            writeHRDParameters(vuip.vclHRDParams, writer);
-        }
-
-        if (vuip.nalHRDParams != null || vuip.vclHRDParams != null) {
-            writer
-                    .writeBool(vuip.low_delay_hrd_flag,
-                            "VUI: low_delay_hrd_flag");
-        }
-        writer.writeBool(vuip.pic_struct_present_flag,
-                "VUI: pic_struct_present_flag");
-        writer.writeBool(vuip.bitstreamRestriction != null, "VUI: ");
-        if (vuip.bitstreamRestriction != null) {
-            writer
-                    .writeBool(
-                            vuip.bitstreamRestriction.motion_vectors_over_pic_boundaries_flag,
-                            "VUI: motion_vectors_over_pic_boundaries_flag");
-            writer.writeUE(vuip.bitstreamRestriction.max_bytes_per_pic_denom,
-                    "VUI: max_bytes_per_pic_denom");
-            writer.writeUE(vuip.bitstreamRestriction.max_bits_per_mb_denom,
-                    "VUI: max_bits_per_mb_denom");
-            writer.writeUE(
-                    vuip.bitstreamRestriction.log2_max_mv_length_horizontal,
-                    "VUI: log2_max_mv_length_horizontal");
-            writer.writeUE(
-                    vuip.bitstreamRestriction.log2_max_mv_length_vertical,
-                    "VUI: log2_max_mv_length_vertical");
-            writer.writeUE(vuip.bitstreamRestriction.num_reorder_frames,
-                    "VUI: num_reorder_frames");
-            writer.writeUE(vuip.bitstreamRestriction.max_dec_frame_buffering,
-                    "VUI: max_dec_frame_buffering");
-        }
-
-    }
-
-    private void writeHRDParameters(HRDParameters hrd, CAVLCWriter writer)
-            throws IOException {
-        writer.writeUE(hrd.cpb_cnt_minus1, "HRD: cpb_cnt_minus1");
-        writer.writeNBit(hrd.bit_rate_scale, 4, "HRD: bit_rate_scale");
-        writer.writeNBit(hrd.cpb_size_scale, 4, "HRD: cpb_size_scale");
-
-        for (int SchedSelIdx = 0; SchedSelIdx <= hrd.cpb_cnt_minus1; SchedSelIdx++) {
-            writer.writeUE(hrd.bit_rate_value_minus1[SchedSelIdx], "HRD: ");
-            writer.writeUE(hrd.cpb_size_value_minus1[SchedSelIdx], "HRD: ");
-            writer.writeBool(hrd.cbr_flag[SchedSelIdx], "HRD: ");
-        }
-        writer.writeNBit(hrd.initial_cpb_removal_delay_length_minus1, 5,
-                "HRD: initial_cpb_removal_delay_length_minus1");
-        writer.writeNBit(hrd.cpb_removal_delay_length_minus1, 5,
-                "HRD: cpb_removal_delay_length_minus1");
-        writer.writeNBit(hrd.dpb_output_delay_length_minus1, 5,
-                "HRD: dpb_output_delay_length_minus1");
-        writer.writeNBit(hrd.time_offset_length, 5, "HRD: time_offset_length");
-    }
-
-    @Override
-    public String toString() {
-        return "SeqParameterSet{ " +
-                "\n        pic_order_cnt_type=" + pic_order_cnt_type +
-                ", \n        field_pic_flag=" + field_pic_flag +
-                ", \n        delta_pic_order_always_zero_flag=" + delta_pic_order_always_zero_flag +
-                ", \n        weighted_pred_flag=" + weighted_pred_flag +
-                ", \n        weighted_bipred_idc=" + weighted_bipred_idc +
-                ", \n        entropy_coding_mode_flag=" + entropy_coding_mode_flag +
-                ", \n        mb_adaptive_frame_field_flag=" + mb_adaptive_frame_field_flag +
-                ", \n        direct_8x8_inference_flag=" + direct_8x8_inference_flag +
-                ", \n        chroma_format_idc=" + chroma_format_idc +
-                ", \n        log2_max_frame_num_minus4=" + log2_max_frame_num_minus4 +
-                ", \n        log2_max_pic_order_cnt_lsb_minus4=" + log2_max_pic_order_cnt_lsb_minus4 +
-                ", \n        pic_height_in_map_units_minus1=" + pic_height_in_map_units_minus1 +
-                ", \n        pic_width_in_mbs_minus1=" + pic_width_in_mbs_minus1 +
-                ", \n        bit_depth_luma_minus8=" + bit_depth_luma_minus8 +
-                ", \n        bit_depth_chroma_minus8=" + bit_depth_chroma_minus8 +
-                ", \n        qpprime_y_zero_transform_bypass_flag=" + qpprime_y_zero_transform_bypass_flag +
-                ", \n        profile_idc=" + profile_idc +
-                ", \n        constraint_set_0_flag=" + constraint_set_0_flag +
-                ", \n        constraint_set_1_flag=" + constraint_set_1_flag +
-                ", \n        constraint_set_2_flag=" + constraint_set_2_flag +
-                ", \n        constraint_set_3_flag=" + constraint_set_3_flag +
-                ", \n        level_idc=" + level_idc +
-                ", \n        seq_parameter_set_id=" + seq_parameter_set_id +
-                ", \n        residual_color_transform_flag=" + residual_color_transform_flag +
-                ", \n        offset_for_non_ref_pic=" + offset_for_non_ref_pic +
-                ", \n        offset_for_top_to_bottom_field=" + offset_for_top_to_bottom_field +
-                ", \n        num_ref_frames=" + num_ref_frames +
-                ", \n        gaps_in_frame_num_value_allowed_flag=" + gaps_in_frame_num_value_allowed_flag +
-                ", \n        frame_mbs_only_flag=" + frame_mbs_only_flag +
-                ", \n        frame_cropping_flag=" + frame_cropping_flag +
-                ", \n        frame_crop_left_offset=" + frame_crop_left_offset +
-                ", \n        frame_crop_right_offset=" + frame_crop_right_offset +
-                ", \n        frame_crop_top_offset=" + frame_crop_top_offset +
-                ", \n        frame_crop_bottom_offset=" + frame_crop_bottom_offset +
-                ", \n        offsetForRefFrame=" + offsetForRefFrame +
-                ", \n        vuiParams=" + vuiParams +
-                ", \n        scalingMatrix=" + scalingMatrix +
-                ", \n        num_ref_frames_in_pic_order_cnt_cycle=" + num_ref_frames_in_pic_order_cnt_cycle +
-                '}';
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/model/VUIParameters.java b/android/src/main/java/com/googlecode/mp4parser/h264/model/VUIParameters.java
deleted file mode 100755
index eec7880..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/model/VUIParameters.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.model;
-
-public class VUIParameters {
-
-    public static class BitstreamRestriction {
-
-        public boolean motion_vectors_over_pic_boundaries_flag;
-        public int max_bytes_per_pic_denom;
-        public int max_bits_per_mb_denom;
-        public int log2_max_mv_length_horizontal;
-        public int log2_max_mv_length_vertical;
-        public int num_reorder_frames;
-        public int max_dec_frame_buffering;
-
-    }
-
-    public boolean aspect_ratio_info_present_flag;
-    public int sar_width;
-    public int sar_height;
-    public boolean overscan_info_present_flag;
-    public boolean overscan_appropriate_flag;
-    public boolean video_signal_type_present_flag;
-    public int video_format;
-    public boolean video_full_range_flag;
-    public boolean colour_description_present_flag;
-    public int colour_primaries;
-    public int transfer_characteristics;
-    public int matrix_coefficients;
-    public boolean chroma_loc_info_present_flag;
-    public int chroma_sample_loc_type_top_field;
-    public int chroma_sample_loc_type_bottom_field;
-    public boolean timing_info_present_flag;
-    public int num_units_in_tick;
-    public int time_scale;
-    public boolean fixed_frame_rate_flag;
-    public boolean low_delay_hrd_flag;
-    public boolean pic_struct_present_flag;
-    public HRDParameters nalHRDParams;
-    public HRDParameters vclHRDParams;
-
-    public BitstreamRestriction bitstreamRestriction;
-    public AspectRatio aspect_ratio;
-
-    @Override
-    public String toString() {
-        return "VUIParameters{" + "\n" +
-                "aspect_ratio_info_present_flag=" + aspect_ratio_info_present_flag + "\n" +
-                ", sar_width=" + sar_width + "\n" +
-                ", sar_height=" + sar_height + "\n" +
-                ", overscan_info_present_flag=" + overscan_info_present_flag + "\n" +
-                ", overscan_appropriate_flag=" + overscan_appropriate_flag + "\n" +
-                ", video_signal_type_present_flag=" + video_signal_type_present_flag + "\n" +
-                ", video_format=" + video_format + "\n" +
-                ", video_full_range_flag=" + video_full_range_flag + "\n" +
-                ", colour_description_present_flag=" + colour_description_present_flag + "\n" +
-                ", colour_primaries=" + colour_primaries + "\n" +
-                ", transfer_characteristics=" + transfer_characteristics + "\n" +
-                ", matrix_coefficients=" + matrix_coefficients + "\n" +
-                ", chroma_loc_info_present_flag=" + chroma_loc_info_present_flag + "\n" +
-                ", chroma_sample_loc_type_top_field=" + chroma_sample_loc_type_top_field + "\n" +
-                ", chroma_sample_loc_type_bottom_field=" + chroma_sample_loc_type_bottom_field + "\n" +
-                ", timing_info_present_flag=" + timing_info_present_flag + "\n" +
-                ", num_units_in_tick=" + num_units_in_tick + "\n" +
-                ", time_scale=" + time_scale + "\n" +
-                ", fixed_frame_rate_flag=" + fixed_frame_rate_flag + "\n" +
-                ", low_delay_hrd_flag=" + low_delay_hrd_flag + "\n" +
-                ", pic_struct_present_flag=" + pic_struct_present_flag + "\n" +
-                ", nalHRDParams=" + nalHRDParams + "\n" +
-                ", vclHRDParams=" + vclHRDParams + "\n" +
-                ", bitstreamRestriction=" + bitstreamRestriction + "\n" +
-                ", aspect_ratio=" + aspect_ratio + "\n" +
-                '}';
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/read/BitstreamReader.java b/android/src/main/java/com/googlecode/mp4parser/h264/read/BitstreamReader.java
deleted file mode 100755
index 816af6a..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/read/BitstreamReader.java
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.read;
-
-import com.googlecode.mp4parser.h264.CharCache;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-/**
- * A dummy implementation of H264 RBSP reading
- *
- * @author Stanislav Vitvitskiy
- */
-public class BitstreamReader {
-    private InputStream is;
-    private int curByte;
-    private int nextByte;
-    int nBit;
-    protected static int bitsRead;
-
-    protected CharCache debugBits = new CharCache(50);
-
-    public BitstreamReader(InputStream is) throws IOException {
-        this.is = is;
-        curByte = is.read();
-        nextByte = is.read();
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#read1Bit()
-      */
-    public int read1Bit() throws IOException {
-        if (nBit == 8) {
-            advance();
-            if (curByte == -1) {
-                return -1;
-            }
-        }
-        int res = (curByte >> (7 - nBit)) & 1;
-        nBit++;
-
-        debugBits.append(res == 0 ? '0' : '1');
-        ++bitsRead;
-
-        return res;
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#readNBit(int)
-      */
-    public long readNBit(int n) throws IOException {
-        if (n > 64)
-            throw new IllegalArgumentException("Can not readByte more then 64 bit");
-
-        long val = 0;
-
-        for (int i = 0; i < n; i++) {
-            val <<= 1;
-            val |= read1Bit();
-        }
-
-        return val;
-    }
-
-    private void advance() throws IOException {
-        curByte = nextByte;
-        nextByte = is.read();
-        nBit = 0;
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#readByte()
-      */
-    public int readByte() throws IOException {
-        if (nBit > 0) {
-            advance();
-        }
-
-        int res = curByte;
-
-        advance();
-
-        return res;
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#moreRBSPData()
-      */
-    public boolean moreRBSPData() throws IOException {
-        if (nBit == 8) {
-            advance();
-        }
-        int tail = 1 << (8 - nBit - 1);
-        int mask = ((tail << 1) - 1);
-        boolean hasTail = (curByte & mask) == tail;
-
-        return !(curByte == -1 || (nextByte == -1 && hasTail));
-    }
-
-    public long getBitPosition() {
-        return (bitsRead * 8 + (nBit % 8));
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#readRemainingByte()
-      */
-    public long readRemainingByte() throws IOException {
-        return readNBit(8 - nBit);
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#next_bits(int)
-      */
-    public int peakNextBits(int n) throws IOException {
-        if (n > 8)
-            throw new IllegalArgumentException("N should be less then 8");
-        if (nBit == 8) {
-            advance();
-            if (curByte == -1) {
-                return -1;
-            }
-        }
-        int[] bits = new int[16 - nBit];
-
-        int cnt = 0;
-        for (int i = nBit; i < 8; i++) {
-            bits[cnt++] = (curByte >> (7 - i)) & 0x1;
-        }
-
-        for (int i = 0; i < 8; i++) {
-            bits[cnt++] = (nextByte >> (7 - i)) & 0x1;
-        }
-
-        int result = 0;
-        for (int i = 0; i < n; i++) {
-            result <<= 1;
-            result |= bits[i];
-        }
-
-        return result;
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#byte_aligned()
-      */
-    public boolean isByteAligned() {
-        return (nBit % 8) == 0;
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see ua.org.jplayer.javcodec.h264.RBSPInputStream#close()
-      */
-    public void close() throws IOException {
-    }
-
-    public int getCurBit() {
-        return nBit;
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/read/CAVLCReader.java b/android/src/main/java/com/googlecode/mp4parser/h264/read/CAVLCReader.java
deleted file mode 100755
index 07c7f71..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/read/CAVLCReader.java
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.read;
-
-
-import com.googlecode.mp4parser.h264.BTree;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import static com.googlecode.mp4parser.h264.Debug.println;
-
-
-public class CAVLCReader extends BitstreamReader {
-
-    public CAVLCReader(InputStream is) throws IOException {
-        super(is);
-    }
-
-    public long readNBit(int n, String message) throws IOException {
-        long val = readNBit(n);
-
-        trace(message, String.valueOf(val));
-
-        return val;
-    }
-
-    /**
-     * Read unsigned exp-golomb code
-     *
-     * @return
-     * @throws java.io.IOException
-     * @throws java.io.IOException
-     */
-    private int readUE() throws IOException {
-        int cnt = 0;
-        while (read1Bit() == 0)
-            cnt++;
-
-        int res = 0;
-        if (cnt > 0) {
-            long val = readNBit(cnt);
-
-            res = (int) ((1 << cnt) - 1 + val);
-        }
-
-        return res;
-    }
-
-    /*
-      * (non-Javadoc)
-      *
-      * @see
-      * ua.org.jplayer.javcodec.h264.H264BitInputStream#readUE(java.lang.String)
-      */
-    public int readUE(String message) throws IOException {
-        int res = readUE();
-
-        trace(message, String.valueOf(res));
-
-        return res;
-    }
-
-    public int readSE(String message) throws IOException {
-        int val = readUE();
-
-        int sign = ((val & 0x1) << 1) - 1;
-        val = ((val >> 1) + (val & 0x1)) * sign;
-
-        trace(message, String.valueOf(val));
-
-        return val;
-    }
-
-    public boolean readBool(String message) throws IOException {
-
-        boolean res = read1Bit() == 0 ? false : true;
-
-        trace(message, res ? "1" : "0");
-
-        return res;
-    }
-
-    public int readU(int i, String string) throws IOException {
-        return (int) readNBit(i, string);
-    }
-
-    public byte[] read(int payloadSize) throws IOException {
-        byte[] result = new byte[payloadSize];
-        for (int i = 0; i < payloadSize; i++) {
-            result[i] = (byte) readByte();
-        }
-        return result;
-    }
-
-    public boolean readAE() {
-        // TODO: do it!!
-        throw new UnsupportedOperationException("Stan");
-    }
-
-    public int readTE(int max) throws IOException {
-        if (max > 1)
-            return readUE();
-        return ~read1Bit() & 0x1;
-    }
-
-    public int readAEI() {
-        // TODO: do it!!
-        throw new UnsupportedOperationException("Stan");
-    }
-
-    public int readME(String string) throws IOException {
-        return readUE(string);
-    }
-
-    public Object readCE(BTree bt, String message) throws IOException {
-        while (true) {
-            int bit = read1Bit();
-            bt = bt.down(bit);
-            if (bt == null) {
-                throw new RuntimeException("Illegal code");
-            }
-            Object i = bt.getValue();
-            if (i != null) {
-                trace(message, i.toString());
-                return i;
-            }
-        }
-    }
-
-    public int readZeroBitCount(String message) throws IOException {
-        int count = 0;
-        while (read1Bit() == 0)
-            count++;
-
-        trace(message, String.valueOf(count));
-
-        return count;
-    }
-
-    public void readTrailingBits() throws IOException {
-        read1Bit();
-        readRemainingByte();
-    }
-
-    private void trace(String message, String val) {
-        StringBuilder traceBuilder = new StringBuilder();
-        int spaces;
-        String pos = String.valueOf(bitsRead - debugBits.length());
-        spaces = 8 - pos.length();
-
-        traceBuilder.append("@" + pos);
-
-        for (int i = 0; i < spaces; i++)
-            traceBuilder.append(' ');
-
-        traceBuilder.append(message);
-        spaces = 100 - traceBuilder.length() - debugBits.length();
-        for (int i = 0; i < spaces; i++)
-            traceBuilder.append(' ');
-        traceBuilder.append(debugBits);
-        traceBuilder.append(" (" + val + ")");
-        debugBits.clear();
-
-        println(traceBuilder.toString());
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/write/BitstreamWriter.java b/android/src/main/java/com/googlecode/mp4parser/h264/write/BitstreamWriter.java
deleted file mode 100755
index b382400..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/write/BitstreamWriter.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.write;
-
-import com.googlecode.mp4parser.h264.Debug;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * A dummy implementation of H264 RBSP output stream
- *
- * @author Stanislav Vitvitskiy
- */
-public class BitstreamWriter {
-
-    private final OutputStream os;
-    private int[] curByte = new int[8];
-    private int curBit;
-
-    public BitstreamWriter(OutputStream out) {
-        this.os = out;
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see ua.org.jplayer.javcodec.h264.H264BitOutputStream#flush()
-     */
-    public void flush() throws IOException {
-        for (int i = curBit; i < 8; i++) {
-            curByte[i] = 0;
-        }
-        curBit = 0;
-        writeCurByte();
-    }
-
-    private void writeCurByte() throws IOException {
-        int toWrite = (curByte[0] << 7) | (curByte[1] << 6) | (curByte[2] << 5)
-                | (curByte[3] << 4) | (curByte[4] << 3) | (curByte[5] << 2)
-                | (curByte[6] << 1) | curByte[7];
-        os.write(toWrite);
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see ua.org.jplayer.javcodec.h264.H264BitOutputStream#write1Bit(int)
-     */
-    public void write1Bit(int value) throws IOException {
-        Debug.print(value);
-        if (curBit == 8) {
-            curBit = 0;
-            writeCurByte();
-        }
-        curByte[curBit++] = value;
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see ua.org.jplayer.javcodec.h264.H264BitOutputStream#writeNBit(long,
-     * int)
-     */
-    public void writeNBit(long value, int n) throws IOException {
-        for (int i = 0; i < n; i++) {
-            write1Bit((int) (value >> (n - i - 1)) & 0x1);
-        }
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see
-     * ua.org.jplayer.javcodec.h264.H264BitOutputStream#writeRemainingZero()
-     */
-    public void writeRemainingZero() throws IOException {
-        writeNBit(0, 8 - curBit);
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see ua.org.jplayer.javcodec.h264.H264BitOutputStream#writeByte(int)
-     */
-    public void writeByte(int b) throws IOException {
-        os.write(b);
-
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/h264/write/CAVLCWriter.java b/android/src/main/java/com/googlecode/mp4parser/h264/write/CAVLCWriter.java
deleted file mode 100755
index c4e0026..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/h264/write/CAVLCWriter.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
-Copyright (c) 2011 Stanislav Vitvitskiy
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or
-substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
-FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-package com.googlecode.mp4parser.h264.write;
-
-import com.googlecode.mp4parser.h264.Debug;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-
-/**
- * A class responsible for outputting exp-Golumb values into binary stream
- *
- * @author Stanislav Vitvitskiy
- */
-public class CAVLCWriter extends BitstreamWriter {
-
-    public CAVLCWriter(OutputStream out) {
-        super(out);
-    }
-
-    public void writeU(int value, int n, String string) throws IOException {
-        Debug.print(string + "\t");
-        writeNBit(value, n);
-        Debug.println("\t" + value);
-    }
-
-    public void writeUE(int value) throws IOException {
-        int bits = 0;
-        int cumul = 0;
-        for (int i = 0; i < 15; i++) {
-            if (value < cumul + (1 << i)) {
-                bits = i;
-                break;
-            }
-            cumul += (1 << i);
-        }
-        writeNBit(0, bits);
-        write1Bit(1);
-        writeNBit(value - cumul, bits);
-    }
-
-    public void writeUE(int value, String string) throws IOException {
-        Debug.print(string + "\t");
-        writeUE(value);
-        Debug.println("\t" + value);
-    }
-
-    public void writeSE(int value, String string) throws IOException {
-        Debug.print(string + "\t");
-        writeUE((value << 1) * (value < 0 ? -1 : 1) + (value > 0 ? 1 : 0));
-        Debug.println("\t" + value);
-    }
-
-    public void writeBool(boolean value, String string) throws IOException {
-        Debug.print(string + "\t");
-        write1Bit(value ? 1 : 0);
-        Debug.println("\t" + value);
-    }
-
-    public void writeU(int i, int n) throws IOException {
-        writeNBit(i, n);
-    }
-
-    public void writeNBit(long value, int n, String string) throws IOException {
-        Debug.print(string + "\t");
-        for (int i = 0; i < n; i++) {
-            write1Bit((int) (value >> (n - i - 1)) & 0x1);
-        }
-        Debug.println("\t" + value);
-    }
-
-    public void writeTrailingBits() throws IOException {
-        write1Bit(1);
-        writeRemainingZero();
-        flush();
-    }
-
-    public void writeSliceTrailingBits() {
-        throw new IllegalStateException("todo");
-    }
-}
\ No newline at end of file
diff --git a/android/src/main/java/com/googlecode/mp4parser/util/ByteBufferByteChannel.java b/android/src/main/java/com/googlecode/mp4parser/util/ByteBufferByteChannel.java
deleted file mode 100755
index 9f3264b..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/util/ByteBufferByteChannel.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.googlecode.mp4parser.util;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.ByteChannel;
-
-/**
- * Creates a <code>ReadableByteChannel</code> that is backed by a <code>ByteBuffer</code>.
- */
-public class ByteBufferByteChannel implements ByteChannel {
-    ByteBuffer byteBuffer;
-
-    public ByteBufferByteChannel(ByteBuffer byteBuffer) {
-        this.byteBuffer = byteBuffer;
-    }
-
-    public int read(ByteBuffer dst) throws IOException {
-        byte[] b = dst.array();
-        int r = dst.remaining();
-        if (byteBuffer.remaining() >= r) {
-            byteBuffer.get(b, dst.position(), r);
-            return r;
-        } else {
-            throw new EOFException("Reading beyond end of stream");
-        }
-    }
-
-    public boolean isOpen() {
-        return true;
-    }
-
-    public void close() throws IOException {
-    }
-
-    public int write(ByteBuffer src) throws IOException {
-        int r = src.remaining();
-        byteBuffer.put(src);
-        return r;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/util/CastUtils.java b/android/src/main/java/com/googlecode/mp4parser/util/CastUtils.java
deleted file mode 100755
index 2dd011a..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/util/CastUtils.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2012 Sebastian Annies, Hamburg
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.googlecode.mp4parser.util;
-
-
-public class CastUtils {
-    /**
-     * Casts a long to an int. In many cases I use a long for a UInt32 but this cannot be used to allocate
-     * ByteBuffers or arrays since they restricted to <code>Integer.MAX_VALUE</code> this cast-method will throw
-     * a RuntimeException if the cast would cause a loss of information.
-     *
-     * @param l the long value
-     * @return the long value as int
-     */
-    public static int l2i(long l) {
-        if (l > Integer.MAX_VALUE || l < Integer.MIN_VALUE) {
-            throw new RuntimeException("A cast to int has gone wrong. Please contact the mp4parser discussion group (" + l + ")");
-        }
-        return (int) l;
-    }
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/util/Math.java b/android/src/main/java/com/googlecode/mp4parser/util/Math.java
deleted file mode 100755
index 27fd4b2..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/util/Math.java
+++ /dev/null
@@ -1,30 +0,0 @@
-package com.googlecode.mp4parser.util;
-
-public class Math {
-    public static long gcd(long a, long b) {
-        while (b > 0) {
-            long temp = b;
-            b = a % b; // % is remainder
-            a = temp;
-        }
-        return a;
-    }
-
-    public static int gcd(int a, int b) {
-        while (b > 0) {
-            int temp = b;
-            b = a % b; // % is remainder
-            a = temp;
-        }
-        return a;
-    }
-
-    public static long lcm(long a, long b) {
-        return a * (b / gcd(a, b));
-    }
-
-    public static int lcm(int a, int b) {
-        return a * (b / gcd(a, b));
-    }
-
-}
diff --git a/android/src/main/java/com/googlecode/mp4parser/util/Matrix.java b/android/src/main/java/com/googlecode/mp4parser/util/Matrix.java
deleted file mode 100755
index 3287800..0000000
--- a/android/src/main/java/com/googlecode/mp4parser/util/Matrix.java
+++ /dev/null
@@ -1,137 +0,0 @@
-package com.googlecode.mp4parser.util;
-
-import com.coremedia.iso.IsoTypeReader;
-import com.coremedia.iso.IsoTypeWriter;
-
-import java.nio.ByteBuffer;
-
-/**
- * Transformation Matrix as used in <code>Track-</code> and <code>MovieHeaderBox</code>.
- */
-public class Matrix {
-    public static final Matrix ROTATE_0 = new Matrix(1, 0, 0, 1, 0, 0, 1, 0, 0);
-    public static final Matrix ROTATE_90 = new Matrix(0, 1, -1, 0, 0, 0, 1, 0, 0);
-    public static final Matrix ROTATE_180 = new Matrix(-1, 0, 0, -1, 0, 0, 1, 0, 0);
-    public static final Matrix ROTATE_270 = new Matrix(0, -1, 1, 0, 0, 0, 1, 0, 0);
-    double u, v, w;
-    double a, b, c, d, tx, ty;
-
-    public Matrix(double a, double b, double c, double d, double u, double v, double w, double tx, double ty) {
-        this.u = u;
-        this.v = v;
-        this.w = w;
-        this.a = a;
-        this.b = b;
-        this.c = c;
-        this.d = d;
-        this.tx = tx;
-        this.ty = ty;
-    }
-
-    public static Matrix fromFileOrder(double a, double b, double u, double c, double d, double v, double tx, double ty, double w) {
-        return new Matrix(a, b, c, d, u, v, w, tx, ty);
-    }
-
-    public static Matrix fromByteBuffer(ByteBuffer byteBuffer) {
-        return fromFileOrder(
-                IsoTypeReader.readFixedPoint1616(byteBuffer),
-                IsoTypeReader.readFixedPoint1616(byteBuffer),
-                IsoTypeReader.readFixedPoint0230(byteBuffer),
-                IsoTypeReader.readFixedPoint1616(byteBuffer),
-                IsoTypeReader.readFixedPoint1616(byteBuffer),
-                IsoTypeReader.readFixedPoint0230(byteBuffer),
-                IsoTypeReader.readFixedPoint1616(byteBuffer),
-                IsoTypeReader.readFixedPoint1616(byteBuffer),
-                IsoTypeReader.readFixedPoint0230(byteBuffer)
-        );
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
-
-        Matrix matrix = (Matrix) o;
-
-        if (Double.compare(matrix.a, a) != 0) return false;
-        if (Double.compare(matrix.b, b) != 0) return false;
-        if (Double.compare(matrix.c, c) != 0) return false;
-        if (Double.compare(matrix.d, d) != 0) return false;
-        if (Double.compare(matrix.tx, tx) != 0) return false;
-        if (Double.compare(matrix.ty, ty) != 0) return false;
-        if (Double.compare(matrix.u, u) != 0) return false;
-        if (Double.compare(matrix.v, v) != 0) return false;
-        if (Double.compare(matrix.w, w) != 0) return false;
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result;
-        long temp;
-        temp = Double.doubleToLongBits(u);
-        result = (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(v);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(w);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(a);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(b);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(c);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(d);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(tx);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        temp = Double.doubleToLongBits(ty);
-        result = 31 * result + (int) (temp ^ (temp >>> 32));
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        if (this.equals(ROTATE_0)) {
-            return "Rotate 0°";
-        }
-        if (this.equals(ROTATE_90)) {
-            return "Rotate 90°";
-        }
-        if (this.equals(ROTATE_180)) {
-            return "Rotate 180°";
-        }
-        if (this.equals(ROTATE_270)) {
-            return "Rotate 270°";
-        }
-        return "Matrix{" +
-                "u=" + u +
-                ", v=" + v +
-                ", w=" + w +
-                ", a=" + a +
-                ", b=" + b +
-                ", c=" + c +
-                ", d=" + d +
-                ", tx=" + tx +
-                ", ty=" + ty +
-                '}';
-    }
-
-    public void getContent(ByteBuffer byteBuffer) {
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, a);
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, b);
-        IsoTypeWriter.writeFixedPoint0230(byteBuffer, u);
-
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, c);
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, d);
-        IsoTypeWriter.writeFixedPoint0230(byteBuffer, v);
-
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, tx);
-        IsoTypeWriter.writeFixedPoint1616(byteBuffer, ty);
-        IsoTypeWriter.writeFixedPoint0230(byteBuffer, w);
-
-    }
-
-
-}
diff --git a/android/src/main/java/com/reactlibrary/RNBroadcastPackage.java b/android/src/main/java/com/pedrolibrary/RNBroadcastPackage.java
similarity index 90%
rename from android/src/main/java/com/reactlibrary/RNBroadcastPackage.java
rename to android/src/main/java/com/pedrolibrary/RNBroadcastPackage.java
index 7495bb9..a64e0b1 100644
--- a/android/src/main/java/com/reactlibrary/RNBroadcastPackage.java
+++ b/android/src/main/java/com/pedrolibrary/RNBroadcastPackage.java
@@ -1,9 +1,6 @@
 
-package com.reactlibrary;
+package com.pedrolibrary;
 
-import android.app.Activity;
-
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
@@ -30,3 +27,4 @@ public List<ViewManager> createViewManagers(ReactApplicationContext reactContext
     }
 }
 
+
diff --git a/android/src/main/java/com/pedrolibrary/RNBroadcastView.java b/android/src/main/java/com/pedrolibrary/RNBroadcastView.java
new file mode 100644
index 0000000..bf949d6
--- /dev/null
+++ b/android/src/main/java/com/pedrolibrary/RNBroadcastView.java
@@ -0,0 +1,218 @@
+package com.pedrolibrary;
+
+import android.content.Context;
+import android.media.AudioManager;
+import android.support.annotation.NonNull;
+import android.util.Log;
+import android.view.SurfaceHolder;
+import android.view.SurfaceView;
+import com.pedro.rtplibrary.view.OpenGlView;
+import android.view.View;
+import android.view.ViewGroup;
+import android.widget.FrameLayout;
+
+import com.facebook.react.uimanager.ThemedReactContext;
+import com.pedro.encoder.input.video.CameraOpenException;
+import com.pedro.rtplibrary.rtmp.RtmpCamera1;
+import com.pedro.encoder.input.video.CameraHelper;
+import net.ossrs.rtmp.ConnectCheckerRtmp;
+
+import static com.facebook.react.common.ReactConstants.TAG;
+
+
+/**
+ * React Native view that shows the camera preview and publishes it to an RTMP endpoint.
+ *
+ * <p>Streaming begins only when two conditions hold: a non-empty publish URL has been supplied via
+ * {@link #setPublish(String)} and the preview surface has been created. Whichever of the two
+ * happens last triggers {@link #startStreaming()}.
+ */
+public class RNBroadcastView extends FrameLayout implements
+        ConnectCheckerRtmp, SurfaceHolder.Callback {
+    // Encoder settings handed to RtmpCamera1#prepareVideo (width, height, fps, bitrate).
+    private static final int CAPTURE_WIDTH = 640;
+    private static final int CAPTURE_HEIGHT = 480;
+    private static final int CAPTURE_FPS = 30;
+    private static final int VIDEO_BITRATE = 600 * 1024;
+
+    private OpenGlView mCameraView;
+    private ThemedReactContext mContext = null;
+    private RtmpCamera1 rtmpCamera1;
+    // True between a successful startStream() call and the next stopStreaming().
+    private boolean isLive = false;
+    // Mirrors surfaceCreated()/surfaceDestroyed() of the preview surface.
+    private boolean surfaceExists = false;
+    // URL to publish to; the empty string means "do not stream".
+    private String streamUrl = "";
+
+    /**
+     * Builds the preview surface, adds it as a child view and wires it to the RTMP camera.
+     *
+     * @param context themed React context; also used later to look up the audio service
+     */
+    public RNBroadcastView(@NonNull ThemedReactContext context) {
+        super(context);
+        mContext = context;
+
+        mCameraView = new OpenGlView(context);
+        mCameraView.setKeepAspectRatio(true);
+        mCameraView.getHolder().addCallback(this);
+
+        addView(mCameraView);
+
+        rtmpCamera1 = new RtmpCamera1(mCameraView, this);
+        rtmpCamera1.setReTries(10);
+
+        Log.d(TAG, "About to return from RNBroadcastView");
+    }
+
+    /**
+     * Turns the speakerphone on and switches the device audio mode.
+     *
+     * @param audioMode one of the {@code AudioManager.MODE_*} constants
+     */
+    private void applyAudioMode(int audioMode) {
+        AudioManager am = (AudioManager) this.mContext.getSystemService(Context.AUDIO_SERVICE);
+        am.setSpeakerphoneOn(true);
+        am.setMode(audioMode);
+    }
+
+    /**
+     * Applies the {@code publish} prop: a non-empty URL starts publishing, an empty or
+     * {@code null} value stops it.
+     *
+     * <p>When publishing is switched off the remembered URL is cleared as well; otherwise the next
+     * {@code surfaceCreated} callback would resume a stream the caller has already cancelled.
+     *
+     * @param publishUrl RTMP URL to publish to, or empty/{@code null} to stop
+     */
+    public void setPublish(String publishUrl) {
+        if (publishUrl == null || publishUrl.isEmpty()) {
+            // The audio mode is changed as soon as a URL is accepted, even if no stream started.
+            boolean audioWasRouted = isLive || !streamUrl.isEmpty();
+            streamUrl = "";
+            if (audioWasRouted) {
+                applyAudioMode(AudioManager.MODE_NORMAL);
+            }
+            if (isLive) {
+                stopStreaming();
+            }
+            return;
+        }
+        applyAudioMode(AudioManager.MODE_IN_COMMUNICATION);
+        streamUrl = publishUrl;
+        startStreaming();
+    }
+
+    /**
+     * Handles the explicit {@code stopPublish} command: restores the normal audio mode, stops the
+     * stream and forgets the publish URL so the stream is not resumed on surface re-creation.
+     */
+    public void handleStopPublish() {
+        streamUrl = "";
+        applyAudioMode(AudioManager.MODE_NORMAL);
+        this.stopStreaming();
+    }
+
+    /**
+     * Starts publishing to the remembered URL; does nothing without a surface or a URL.
+     *
+     * <p>{@code isLive} ends up {@code true} only when the encoders could be prepared and
+     * {@code startStream} was called; it says nothing about whether the server accepted the
+     * connection, because the connection callbacks below do not update it.
+     */
+    public void startStreaming() {
+        if (!surfaceExists || streamUrl.isEmpty()) {
+            return;
+        }
+        // hardwareRotation stays true only if getGlInterface() does not throw.
+        boolean hardwareRotation = true;
+        try {
+            rtmpCamera1.getGlInterface();
+        } catch (Exception e) {
+            hardwareRotation = false;
+        }
+        int cameraOrientation = CameraHelper.getCameraOrientation(getContext());
+
+        // Skip preparing the encoders when a recording is already in progress.
+        boolean ready = rtmpCamera1.isRecording()
+                || (rtmpCamera1.prepareAudio()
+                && rtmpCamera1.prepareVideo(CAPTURE_WIDTH, CAPTURE_HEIGHT, CAPTURE_FPS,
+                        VIDEO_BITRATE, hardwareRotation, 1, cameraOrientation));
+        if (ready) {
+            rtmpCamera1.startStream(streamUrl);
+        }
+        isLive = ready;
+    }
+
+    /** Stops the RTMP stream and marks the view as no longer live; the publish URL is kept. */
+    public void stopStreaming() {
+        rtmpCamera1.stopStream();
+        isLive = false;
+    }
+
+    /** Connection established; intentionally a no-op. */
+    @Override
+    public void onConnectionSuccessRtmp() {
+    }
+
+    /**
+     * Connection failed; currently ignored.
+     *
+     * <p>TODO: {@code setReTries(10)} is configured in the constructor but nothing here asks the
+     * camera to retry, and {@code isLive} is left untouched.
+     */
+    @Override
+    public void onConnectionFailedRtmp(final String reason) {
+    }
+
+    /** Bitrate report; intentionally a no-op. */
+    @Override
+    public void onNewBitrateRtmp(long bitrate) {
+    }
+
+    /** Disconnected; intentionally a no-op. */
+    @Override
+    public void onDisconnectRtmp() {
+    }
+
+    /** Authentication error; intentionally a no-op. */
+    @Override
+    public void onAuthErrorRtmp() {
+    }
+
+    /** Authentication success; intentionally a no-op. */
+    @Override
+    public void onAuthSuccessRtmp() {
+    }
+
+    /** Marks the surface as available and starts streaming in case a URL is already set. */
+    @Override
+    public void surfaceCreated(SurfaceHolder surfaceHolder) {
+        Log.d(TAG, "SurfaceCreated");
+        surfaceExists = true;
+        startStreaming();
+    }
+
+    /** Starts the preview and sets its orientation from the current camera orientation. */
+    @Override
+    public void surfaceChanged(SurfaceHolder surfaceHolder, int i, int i1, int i2) {
+        Log.d(TAG, "SurfaceChanged");
+        rtmpCamera1.startPreview();
+        int cameraOrientation = CameraHelper.getCameraOrientation(getContext());
+        rtmpCamera1.setPreviewOrientation(cameraOrientation);
+    }
+
+    /**
+     * Stops any running stream and the preview. The publish URL is kept so that streaming resumes
+     * when the surface is created again.
+     */
+    @Override
+    public void surfaceDestroyed(SurfaceHolder surfaceHolder) {
+        surfaceExists = false;
+        if (rtmpCamera1.isStreaming()) {
+            stopStreaming();
+        }
+        rtmpCamera1.stopPreview();
+    }
+}
diff --git a/android/src/main/java/com/pedrolibrary/RNBroadcastViewManager.java b/android/src/main/java/com/pedrolibrary/RNBroadcastViewManager.java
new file mode 100644
index 0000000..99af96c
--- /dev/null
+++ b/android/src/main/java/com/pedrolibrary/RNBroadcastViewManager.java
@@ -0,0 +1,87 @@
+
+package com.pedrolibrary;
+
+import java.util.Map;
+
+import android.support.annotation.Nullable;
+import android.util.Log;
+
+import com.facebook.infer.annotation.Assertions;
+import com.facebook.react.bridge.ReadableArray;
+import com.facebook.react.common.MapBuilder;
+import com.facebook.react.uimanager.SimpleViewManager;
+import com.facebook.react.uimanager.ThemedReactContext;
+import com.facebook.react.uimanager.annotations.ReactProp;
+
+import static com.facebook.react.common.ReactConstants.TAG;
+
+
+/**
+ * React Native view manager exposing {@link RNBroadcastView} to JavaScript as "RNBroadcastView".
+ *
+ * <p>The {@code publish} prop and the {@code stopPublish} command are applied to the view instance
+ * passed to each callback, so several broadcast views do not interfere with each other.
+ */
+public class RNBroadcastViewManager extends SimpleViewManager<RNBroadcastView> {
+  /** Name returned by {@link #getName()}. */
+  public static final String REACT_CLASS = "RNBroadcastView";
+  /** Command id paired with the {@code stopPublish} entry of {@link #getCommandsMap()}. */
+  public static final int COMMAND_STOP_PUBLISH = 1;
+
+  @Override
+  public String getName() {
+    return REACT_CLASS;
+  }
+
+  /** Creates a new broadcast view; the manager keeps no reference to it. */
+  @Override
+  public RNBroadcastView createViewInstance(ThemedReactContext context) {
+    RNBroadcastView broadcastView = new RNBroadcastView(context);
+    Log.d(TAG, "About to return from createViewInstance");
+    return broadcastView;
+  }
+
+  /**
+   * Setter for the {@code publish} prop.
+   *
+   * @param view    the view whose prop changed; the value is applied to this instance
+   * @param publish URL to publish to; {@code null} is forwarded as the empty string
+   */
+  @ReactProp(name = "publish")
+  public void started(RNBroadcastView view, @Nullable String publish) {
+    Log.d(TAG, "Starting: " + publish);
+    // A missing prop is normalised to "" so the view never receives a null URL.
+    view.setPublish(publish == null ? "" : publish);
+  }
+
+  /** Maps the JS command name {@code stopPublish} to {@link #COMMAND_STOP_PUBLISH}. */
+  @Override
+  public Map<String,Integer> getCommandsMap() {
+    return MapBuilder.of("stopPublish", COMMAND_STOP_PUBLISH);
+  }
+
+  /**
+   * Handles a command dispatched from JavaScript.
+   *
+   * @param view        the view the command targets; must not be {@code null}
+   * @param commandType id taken from {@link #getCommandsMap()}
+   * @param args        unused, because {@code stopPublish} has no arguments; may be {@code null}
+   * @throws IllegalArgumentException if {@code commandType} is not a known command
+   */
+  @Override
+  public void receiveCommand(RNBroadcastView view, int commandType, @Nullable ReadableArray args) {
+    Assertions.assertNotNull(view);
+    if (commandType != COMMAND_STOP_PUBLISH) {
+      throw new IllegalArgumentException(String.format(
+          "Unsupported command %d received by %s.", commandType, getClass().getSimpleName()));
+    }
+    view.handleStopPublish();
+  }
+
+  /** No custom direct events are exported, so the returned map is empty. */
+  @Nullable @Override
+  public Map<String, Object> getExportedCustomDirectEventTypeConstants() {
+    return MapBuilder.<String, Object>builder().build();
+  }
+}
+
diff --git a/android/src/main/java/com/reactlibrary/RNBroadcastViewManager.java b/android/src/main/java/com/reactlibrary/RNBroadcastViewManager.java
deleted file mode 100644
index adbfc72..0000000
--- a/android/src/main/java/com/reactlibrary/RNBroadcastViewManager.java
+++ /dev/null
@@ -1,225 +0,0 @@
-
-package com.reactlibrary;
-
-import java.io.IOException;
-import java.net.SocketException;
-import java.util.List;
-import java.util.Arrays;
-
-import android.app.Activity;
-import android.graphics.Color;
-import android.hardware.Camera;
-import android.support.annotation.Nullable;
-import android.util.Log;
-import android.widget.LinearLayout;
-import android.widget.Toast;
-
-import com.facebook.react.ReactRootView;
-import com.facebook.react.bridge.ReactApplicationContext;
-import com.facebook.react.bridge.ReactContextBaseJavaModule;
-import com.facebook.react.bridge.ReactMethod;
-import com.facebook.react.bridge.Callback;
-import com.facebook.react.uimanager.SimpleViewManager;
-import com.facebook.react.uimanager.ThemedReactContext;
-import com.facebook.react.uimanager.ViewProps;
-import com.facebook.react.uimanager.annotations.ReactProp;
-import com.facebook.react.views.image.ImageResizeMode;
-import com.facebook.drawee.backends.pipeline.Fresco;
-import com.github.faucamp.simplertmp.RtmpHandler;
-
-import net.ossrs.yasea.SrsCameraView;
-import net.ossrs.yasea.SrsEncodeHandler;
-import net.ossrs.yasea.SrsPublisher;
-import net.ossrs.yasea.SrsRecordHandler;
-
-import static com.facebook.react.common.ReactConstants.TAG;
-
-
-public class RNBroadcastViewManager extends SimpleViewManager<SrsCameraView> implements RtmpHandler.RtmpListener,
-        SrsRecordHandler.SrsRecordListener, SrsEncodeHandler.SrsEncodeListener {
-
-    public static final String REACT_CLASS = "RNBroadcastView";
-    private SrsPublisher mPublisher;
-    private ThemedReactContext mContext = null;
-    private Boolean isLive = false;
-
-
-    @Override
-    public String getName() {
-    return REACT_CLASS;
-  }
-
-    @Override
-    public SrsCameraView createViewInstance(ThemedReactContext context) {
-      this.mContext = context;
-
-      SrsCameraView view = new SrsCameraView(context);
-      this.mPublisher = new SrsPublisher(view);
-      this.mPublisher.setEncodeHandler(new SrsEncodeHandler(this));
-      this.mPublisher.setRtmpHandler(new RtmpHandler(this));
-      this.mPublisher.setRecordHandler(new SrsRecordHandler(this));
-      this.mPublisher.setPreviewResolution(1280, 720);
-      this.mPublisher.setOutputResolution(1280, 720);
-      this.mPublisher.setScreenOrientation(2);
-      this.mPublisher.switchToSoftEncoder();
-      this.mPublisher.setVideoHDMode();
-      this.mPublisher.startCamera();
-      return view;
-    }
-
-  @ReactProp(name = "cameraPosition")
-  public void setCameraPosition(SrsCameraView view, @Nullable String cameraPosition) {
-    mPublisher.switchCameraFace((mPublisher.getCamraId() + 1) % Camera.getNumberOfCameras());
-  }
-
-  private void handleException(Exception e) {
-    try {
-      this.mPublisher.stopPublish();
-      this.mPublisher.stopRecord();
-    } catch (Exception e1) {
-      //
-    }
-  }
-
-  @ReactProp(name = "publish")
-  public void started(SrsCameraView view, @Nullable String publish) {
-    System.out.println("Starting: " + publish + "live: " + this.isLive);
-    if (!publish.isEmpty()) {
-      this.mPublisher.startPublish(publish);
-    }
-    else if (publish.isEmpty() && this.isLive){
-      this.mPublisher.stopPublish();
-    }
-  }
-
-  // Implementation of SrsRtmpListener.
-
-  @Override
-  public void onRtmpConnecting(String msg) {
-//    Toast.makeText(getApplicationContext(), msg, Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRtmpConnected(String msg) {
-//    Toast.makeText(getApplicationContext(), msg, Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRtmpVideoStreaming() {
-      this.isLive = true;
-  }
-
-  @Override
-  public void onRtmpAudioStreaming() {
-    this.isLive = true;
-  }
-
-  @Override
-  public void onRtmpStopped() {
-      this.isLive = false;
-  }
-
-  @Override
-  public void onRtmpDisconnected() {
-//    Toast.makeText(getApplicationContext(), "Disconnected", Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRtmpVideoFpsChanged(double fps) {
-    Log.i(TAG, String.format("Output Fps: %f", fps));
-  }
-
-  @Override
-  public void onRtmpVideoBitrateChanged(double bitrate) {
-    int rate = (int) bitrate;
-    if (rate / 1000 > 0) {
-      Log.i(TAG, String.format("Video bitrate: %f kbps", bitrate / 1000));
-    } else {
-      Log.i(TAG, String.format("Video bitrate: %d bps", rate));
-    }
-  }
-
-  @Override
-  public void onRtmpAudioBitrateChanged(double bitrate) {
-    int rate = (int) bitrate;
-    if (rate / 1000 > 0) {
-      Log.i(TAG, String.format("Audio bitrate: %f kbps", bitrate / 1000));
-    } else {
-      Log.i(TAG, String.format("Audio bitrate: %d bps", rate));
-    }
-  }
-
-  @Override
-  public void onRtmpSocketException(SocketException e) {
-
-    handleException(e);
-  }
-
-  @Override
-  public void onRtmpIOException(IOException e) {
-
-    handleException(e);
-  }
-
-  @Override
-  public void onRtmpIllegalArgumentException(IllegalArgumentException e) {
-
-    handleException(e);
-  }
-
-  @Override
-  public void onRtmpIllegalStateException(IllegalStateException e) {
-
-    handleException(e);
-  }
-
-  // Implementation of SrsRecordHandler.
-
-  @Override
-  public void onRecordPause() {
-//    Toast.makeText(getApplicationContext(), "Record paused", Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRecordResume() {
-//    Toast.makeText(getApplicationContext(), "Record resumed", Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRecordStarted(String msg) {
-//    Toast.makeText(getApplicationContext(), "Recording file: " + msg, Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRecordFinished(String msg) {
-//    Toast.makeText(getApplicationContext(), "MP4 file saved: " + msg, Toast.LENGTH_SHORT).show();
-  }
-
-  @Override
-  public void onRecordIOException(IOException e) {
-
-
-  }
-
-  @Override
-  public void onRecordIllegalArgumentException(IllegalArgumentException e) {
-
-
-  }
-
-  // Implementation of SrsEncodeHandler.
-
-  @Override
-  public void onNetworkWeak() {
-  }
-
-  @Override
-  public void onNetworkResume() {
-  }
-
-  @Override
-  public void onEncodeIllegalArgumentException(IllegalArgumentException e) {
-    handleException(e);
-  }
-
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicAmaroFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicAmaroFilter.java
deleted file mode 100755
index fc0b383..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicAmaroFilter.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicAmaroFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicAmaroFilter(){
-        super(MagicFilterType.AMARO, R.raw.amaro);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for (int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/brannan_blowout.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/overlaymap.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/amaromap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicAntiqueFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicAntiqueFilter.java
deleted file mode 100755
index de0174d..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicAntiqueFilter.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicAntiqueFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-      
-    public MagicAntiqueFilter(){
-        super(MagicFilterType.ANTIQUE, R.raw.antique);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        this.mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (this.mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (this.mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(this.mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63, 64, 66, 67, 68, 69, 71, 72, 73, 74, 76, 77, 78, 79, 81, 82, 83, 85, 86, 87, 89, 90, 91, 93, 94, 95, 96, 98, 99, 100, 102, 103, 104, 106, 107, 108, 110, 111, 112, 114, 115, 116, 118, 119, 120, 122, 123, 124, 126, 127, 128, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, 157, 158, 159, 160, 161, 163, 164, 165, 166, 168, 169, 170, 171, 172, 173, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 211, 212, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 226, 226, 227, 228, 228, 229, 230, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 239, 240, 241, 241, 242, 242, 243, 244, 244, 245, 245, 246, 247, 247, 248, 248, 249, 249, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt2 = { 15, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 55, 56, 57, 57, 58, 59, 61, 62, 63, 64, 66, 67, 68, 69, 71, 72, 72, 73, 74, 76, 77, 78, 79, 81, 82, 83, 85, 86, 87, 87, 89, 90, 91, 93, 94, 95, 96, 98, 99, 100, 102, 102, 103, 104, 106, 107, 108, 110, 111, 112, 114, 115, 116, 118, 118, 119, 120, 122, 123, 124, 126, 127, 128, 130, 131, 132, 134, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 146, 148, 149, 149, 150, 152, 153, 154, 155, 157, 158, 159, 160, 161, 163, 163, 164, 165, 166, 168, 169, 170, 171, 172, 173, 175, 176, 177, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 211, 212, 212, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 222, 223, 223, 224, 225, 226, 226, 227, 228, 228, 229, 230, 230, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 238, 239, 239, 240, 241, 241, 242, 242, 243, 244, 244, 245, 245, 245, 246, 247, 247, 248, 248, 249, 249, 250, 251, 251, 252, 252, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt3 = { 87, 89, 89, 90, 90, 91, 91, 93, 93, 94, 95, 95, 96, 96, 98, 98, 99, 100, 100, 102, 102, 103, 103, 104, 104, 106, 107, 107, 108, 108, 110, 110, 111, 112, 112, 114, 114, 115, 115, 116, 118, 118, 119, 119, 120, 120, 122, 123, 123, 124, 124, 126, 126, 127, 128, 128, 130, 130, 131, 131, 132, 134, 134, 135, 135, 136, 136, 137, 139, 139, 140, 140, 141, 143, 143, 144, 144, 145, 146, 146, 148, 148, 149, 150, 150, 152, 152, 153, 154, 154, 155, 155, 157, 158, 158, 159, 159, 160, 161, 161, 163, 163, 164, 165, 165, 166, 168, 168, 169, 169, 170, 171, 171, 172, 173, 173, 175, 175, 176, 177, 177, 178, 179, 179, 180, 181, 181, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, 202, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 210, 211, 211, 211, 212, 212, 213, 214, 215, 215, 216, 216, 217, 217, 218, 219, 219, 220, 220, 221, 221, 222, 223, 223, 223, 224, 225, 226, 226, 226, 227, 228, 228, 228, 229, 230, 230, 230, 231, 232, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 238, 238, 238, 239, 239, 240, 240, 241, 241, 242, 242, 242, 243, 244, 244, 244, 245, 245, 246, 247, 247, 247, 248, 248, 249, 249, 249, 250, 251, 251, 252, 252, 252, 253, 253, 254, 254, 254, 255 };
-                int[] arrayOfInt4 = { 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 24, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 97, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 113, 114, 115, 117, 118, 119, 121, 122, 123, 125, 126, 127, 129, 130, 131, 133, 134, 136, 137, 138, 140, 141, 142, 144, 145, 146, 148, 149, 150, 152, 153, 154, 156, 157, 158, 159, 161, 162, 163, 165, 166, 167, 168, 170, 171, 172, 173, 175, 176, 177, 178, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 213, 214, 215, 216, 217, 218, 219, 219, 220, 221, 222, 223, 223, 224, 225, 226, 227, 227, 228, 229, 230, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 240, 240, 241, 242, 242, 243, 244, 244, 245, 245, 246, 247, 247, 248, 248, 249, 250, 250, 251, 251, 252, 253, 253, 254, 254, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                int[] arrayOfInt5 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicBeautyFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicBeautyFilter.java
deleted file mode 100755
index f1cec7f..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicBeautyFilter.java
+++ /dev/null
@@ -1,30 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-/**
- * Created by Administrator on 2016/5/22.
- */
-public class MagicBeautyFilter extends GPUImageFilter{
-    private int mSingleStepOffsetLocation;
-
-    public MagicBeautyFilter(){
-        super(MagicFilterType.BEAUTY, R.raw.beauty);
-    }
-
-    protected void onInit() {
-        super.onInit();
-        mSingleStepOffsetLocation = GLES20.glGetUniformLocation(getProgram(), "singleStepOffset");
-    }
-
-    @Override
-    public void onInputSizeChanged(final int width, final int height) {
-        super.onInputSizeChanged(width, height);
-        setFloatVec2(mSingleStepOffsetLocation, new float[] {2.0f / width, 2.0f / height});
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicBlackCatFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicBlackCatFilter.java
deleted file mode 100755
index 5053657..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicBlackCatFilter.java
+++ /dev/null
@@ -1,90 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicBlackCatFilter extends GPUImageFilter{
-
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-
-    public MagicBlackCatFilter(){
-        super(MagicFilterType.BLACKCAT, R.raw.blackcat);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 7, 8, 10, 11, 13, 15, 16, 18, 20, 21, 23, 24, 26, 28, 29, 31, 33, 34, 36, 37, 39, 41, 42, 44, 45, 47, 49, 50, 52, 53, 55, 57, 58, 60, 61, 63, 65, 66, 68, 69, 71, 72, 74, 76, 77, 79, 80, 82, 83, 85, 86, 88, 90, 91, 93, 94, 96, 97, 99, 100, 102, 103, 105, 106, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, 124, 125, 127, 128, 129, 131, 132, 134, 135, 136, 138, 139, 141, 142, 143, 145, 146, 147, 149, 150, 151, 153, 154, 155, 157, 158, 159, 160, 162, 163, 164, 165, 167, 168, 169, 170, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 215, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 224, 225, 226, 227, 227, 228, 229, 229, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 240, 240, 241, 242, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 248, 249, 250, 250, 251, 251, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 7, 8, 10, 11, 13, 15, 16, 16, 18, 20, 21, 23, 24, 26, 28, 29, 31, 33, 34, 36, 37, 39, 41, 41, 42, 44, 45, 47, 49, 50, 52, 53, 55, 57, 58, 60, 61, 63, 65, 66, 68, 69, 71, 72, 74, 76, 77, 79, 80, 82, 83, 85, 86, 86, 88, 90, 91, 93, 94, 96, 97, 99, 100, 102, 103, 105, 106, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, 124, 125, 127, 128, 129, 131, 134, 135, 136, 138, 139, 141, 142, 143, 145, 146, 147, 149, 150, 151, 153, 154, 155, 157, 158, 159, 160, 162, 163, 164, 165, 167, 168, 169, 170, 172, 173, 174, 175, 176, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 203, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 215, 216, 217, 219, 220, 220, 221, 222, 223, 224, 224, 225, 226, 227, 227, 228, 229, 229, 230, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 240, 240, 242, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 248, 249, 250, 251, 251, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt3 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 5, 7, 8, 10, 10, 11, 13, 15, 16, 18, 20, 20, 21, 23, 24, 26, 28, 29, 29, 31, 33, 34, 36, 37, 39, 39, 41, 42, 44, 45, 47, 49, 50, 50, 52, 53, 55, 57, 58, 60, 61, 63, 63, 65, 66, 68, 69, 71, 72, 74, 76, 77, 79, 79, 80, 82, 83, 85, 86, 88, 90, 91, 93, 94, 96, 97, 99, 100, 100, 102, 103, 105, 106, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, 124, 125, 127, 128, 129, 131, 132, 134, 135, 136, 138, 139, 141, 142, 143, 145, 146, 147, 149, 150, 151, 153, 154, 155, 157, 158, 159, 160, 162, 164, 165, 167, 168, 169, 170, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 215, 215, 216, 218, 219, 220, 220, 221, 222, 223, 224, 225, 226, 227, 227, 228, 229, 229, 230, 232, 232, 233, 234, 234, 235, 236, 236, 238, 238, 239, 240, 240, 241, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 251, 252, 253, 254, 254, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = -1;
-                }
-                int[] arrayOfInt4 = { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 12, 13, 13, 14, 14, 15, 16, 17, 18, 18, 19, 19, 20, 21, 21, 23, 24, 25, 26, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 38, 39, 40, 41, 43, 44, 45, 45, 46, 47, 48, 49, 49, 50, 51, 52, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 69, 70, 70, 71, 72, 74, 75, 76, 76, 77, 78, 79, 80, 81, 82, 82, 83, 85, 86, 87, 88, 89, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 101, 102, 103, 104, 105, 107, 107, 108, 109, 111, 112, 113, 114, 114, 115, 116, 117, 119, 120, 120, 121, 122, 123, 124, 125, 126, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 139, 141, 142, 143, 144, 145, 145, 146, 147, 148, 149, 151, 151, 153, 154, 155, 156, 157, 158, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 181, 182, 183, 183, 184, 185, 186, 189, 189, 190, 191, 192, 193, 194, 195, 195, 196, 198, 199, 201, 202, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 213, 214, 214, 215, 216, 218, 219, 220, 221, 221, 222, 223, 225, 227, 227, 228, 229, 230, 230, 230, 230, 230, 230, 230, 230, 230 };
-                int[] arrayOfInt5 = { 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 76, 77, 78, 79, 81, 82, 83, 84, 86, 87, 88, 89, 91, 92, 93, 95, 96, 97, 99, 100, 101, 103, 104, 105, 107, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, 124, 125, 126, 128, 129, 131, 132, 134, 135, 137, 138, 140, 141, 142, 144, 145, 147, 148, 149, 151, 152, 154, 155, 156, 158, 159, 160, 162, 163, 164, 166, 167, 168, 170, 171, 172, 173, 175, 176, 177, 178, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 240, 240, 241, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 252, 253, 253, 254, 254, 254, 255, 255 };
-                int[] arrayOfInt6 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt4[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt6[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicBrannanFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicBrannanFilter.java
deleted file mode 100755
index f4a4581..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicBrannanFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicBrannanFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicBrannanFilter(){
-        super(MagicFilterType.BRANNAN, R.raw.brannan);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for (int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/brannan_process.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/brannan_blowout.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/brannan_contrast.png");
-                inputTextureHandles[3] = OpenGLUtils.loadTexture(getContext(), "filter/brannan_luma.png");
-                inputTextureHandles[4] = OpenGLUtils.loadTexture(getContext(), "filter/brannan_screen.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicBrooklynFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicBrooklynFilter.java
deleted file mode 100755
index 7461d27..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicBrooklynFilter.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicBrooklynFilter extends GPUImageFilter {
-    private int[] inputTextureHandles = {-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicBrooklynFilter(){
-        super(MagicFilterType.BROOKLYN, R.raw.brooklyn);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for (int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for (int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/brooklynCurves1.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/filter_map_first.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/brooklynCurves2.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicCalmFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicCalmFilter.java
deleted file mode 100755
index a03456a..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicCalmFilter.java
+++ /dev/null
@@ -1,127 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicCalmFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    private int mMaskGrey2TextureId = -1;
-    private int mMaskGrey2UniformLocation;
-    
-    public MagicCalmFilter(){
-        super(MagicFilterType.CALM, R.raw.calm);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();      
-        GLES20.glDeleteTextures(3, new int[]{mToneCurveTexture[0], mMaskGrey1TextureId, mMaskGrey2TextureId}, 0);        
-        mToneCurveTexture[0] = -1;
-        mMaskGrey1TextureId = -1;
-        mMaskGrey2TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-            GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey2TextureId);
-             GLES20.glUniform1i(mMaskGrey2UniformLocation, 5);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey1Frame");
-        mMaskGrey2UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey2Frame");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[3072];
-                int[] arrayOfInt1 = { 38, 39, 40, 41, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 52, 53, 54, 55, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 69, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 81, 82, 83, 84, 85, 86, 87, 87, 88, 89, 90, 91, 92, 92, 93, 94, 95, 96, 97, 98, 98, 99, 100, 101, 102, 103, 104, 104, 105, 106, 107, 108, 109, 109, 110, 111, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 121, 121, 122, 123, 124, 125, 126, 127, 127, 128, 129, 130, 131, 132, 132, 133, 134, 135, 136, 137, 138, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160, 161, 161, 162, 163, 164, 165, 166, 166, 167, 168, 169, 170, 171, 172, 172, 173, 174, 175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 184, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 195, 195, 196, 197, 198, 199, 200, 201, 201, 202, 203, 204, 205, 206, 206, 207, 208, 209, 210, 211, 212, 212, 213, 214, 215, 216, 217, 218, 218, 219, 220, 221, 222, 223, 224, 224, 225, 226, 227, 228, 229, 229, 230, 231, 232, 233, 234, 235, 235, 236, 237, 238, 239, 240, 241, 241, 242, 243, 244, 245, 246, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(3 + i * 4)] = -1;
-                }
-                int[] arrayOfInt2 = { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 93, 94, 95, 96, 97, 98, 99, 99, 100, 101, 102, 103, 104, 104, 105, 106, 107, 108, 109, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 118, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 127, 128, 129, 130, 131, 131, 132, 133, 134, 135, 135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 156, 157, 157, 158, 159, 160, 161, 162, 163, 164, 165, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 211, 213, 214, 215, 216, 217, 218, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 232, 233, 234, 235, 237, 238, 239, 240, 241, 243, 244, 245, 246, 248, 249, 250, 251, 253, 254, 255 };
-                int[] arrayOfInt3 = { 0, 1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 42, 43, 44, 46, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 64, 65, 67, 69, 70, 72, 74, 75, 77, 79, 80, 82, 84, 85, 87, 89, 91, 92, 94, 96, 97, 99, 101, 102, 104, 106, 107, 109, 111, 112, 114, 115, 117, 118, 120, 121, 123, 124, 126, 127, 129, 130, 132, 133, 134, 136, 137, 138, 140, 141, 142, 144, 145, 146, 147, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 184, 185, 186, 187, 188, 189, 190, 191, 191, 192, 193, 194, 195, 196, 196, 197, 198, 199, 200, 200, 201, 202, 203, 203, 204, 205, 206, 206, 207, 208, 209, 209, 210, 211, 211, 212, 213, 214, 214, 215, 216, 216, 217, 218, 218, 219, 220, 220, 221, 222, 222, 223, 224, 224, 225, 226, 226, 227, 227, 228, 229, 229, 230, 231, 231, 232, 233, 233, 234, 234, 235, 236, 236, 237, 237, 238, 239, 239, 240, 240, 241, 242, 242, 243, 243, 244, 245, 245, 246, 246, 247, 247, 248, 249, 249, 250, 250, 251, 252, 252, 253, 253, 254, 254, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt3[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt3[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt2[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                int[] arrayOfInt4 = { 0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 24, 26, 27, 28, 30, 31, 33, 34, 35, 37, 38, 39, 41, 42, 43, 45, 46, 47, 49, 50, 51, 53, 54, 55, 57, 58, 59, 61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 74, 75, 76, 77, 79, 80, 81, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 98, 99, 100, 101, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 175, 176, 177, 178, 179, 180, 181, 182, 183, 183, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 194, 195, 196, 197, 198, 198, 199, 200, 201, 202, 202, 203, 204, 205, 205, 206, 207, 208, 208, 209, 210, 211, 211, 212, 213, 214, 214, 215, 216, 216, 217, 218, 218, 219, 220, 221, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 231, 232, 233, 233, 234, 235, 235, 236, 237, 237, 238, 239, 239, 240, 240, 241, 242, 242, 243, 244, 244, 245, 246, 246, 247, 247, 248, 249, 249, 250, 251, 251, 252, 252, 253, 254, 254, 255 };
-                int[] arrayOfInt5 = { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 39, 40, 40, 41, 42, 42, 43, 44, 44, 45, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 96, 97, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 113, 114, 115, 117, 118, 120, 121, 123, 124, 126, 127, 129, 130, 132, 133, 135, 137, 138, 140, 142, 143, 145, 147, 148, 150, 152, 153, 155, 157, 159, 161, 162, 164, 166, 168, 170, 172, 174, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 247, 249, 251, 253, 255 };
-                int[] arrayOfInt6 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int k = 0; k < 256; k++){
-                  arrayOfByte[(2048 + k * 4)] = ((byte)arrayOfInt4[k]);
-                  arrayOfByte[(1 + (2048 + k * 4))] = ((byte)arrayOfInt5[k]);
-                  arrayOfByte[(2 + (2048 + k * 4))] = ((byte)arrayOfInt6[k]);
-                  arrayOfByte[(3 + (2048 + k * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 3, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/calm_mask1.jpg");
-                mMaskGrey2TextureId = OpenGLUtils.loadTexture(getContext(), "filter/calm_mask2.jpg");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicCoolFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicCoolFilter.java
deleted file mode 100755
index c45c0fc..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicCoolFilter.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicCoolFilter extends GPUImageFilter {
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-
-    public MagicCoolFilter(){
-        super(MagicFilterType.COOL, R.raw.cool);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        this.mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (this.mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        if (this.mToneCurveTexture[0] != -1) {
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(this.mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    protected void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 24, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 33, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 97, 98, 99, 100, 102, 103, 104, 105, 107, 108, 109, 111, 112, 113, 115, 116, 117, 119, 120, 121, 123, 124, 126, 127, 128, 130, 131, 133, 134, 136, 137, 139, 140, 142, 143, 145, 146, 148, 149, 151, 152, 154, 155, 157, 158, 160, 161, 163, 165, 166, 168, 169, 171, 173, 174, 176, 177, 179, 181, 182, 184, 185, 187, 189, 190, 192, 194, 195, 197, 199, 200, 202, 204, 205, 207, 209, 210, 212, 214, 216, 217, 219, 221, 222, 224, 226, 228, 229, 231, 233, 234, 236, 238, 240, 241, 243, 245, 246, 248, 250, 252, 253, 255 };
-                int[] arrayOfInt2 = { 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255 };
-                int[] arrayOfInt3 = { 0, 3, 6, 9, 11, 14, 17, 20, 23, 26, 28, 31, 34, 37, 40, 43, 45, 48, 51, 54, 57, 59, 62, 65, 68, 70, 73, 76, 79, 81, 84, 87, 89, 92, 95, 97, 100, 102, 105, 108, 110, 113, 115, 118, 120, 123, 125, 128, 130, 133, 135, 137, 140, 142, 144, 147, 149, 151, 153, 156, 158, 160, 162, 164, 166, 168, 171, 173, 175, 177, 179, 180, 182, 184, 186, 188, 190, 191, 193, 195, 197, 198, 200, 201, 203, 205, 206, 207, 209, 210, 212, 213, 214, 216, 217, 218, 219, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 234, 235, 236, 237, 237, 238, 239, 240, 240, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 251, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt4 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 13, 17, 21, 24, 32, 36, 39, 46, 50, 53, 56, 62, 65, 68, 73, 75, 78, 80, 85, 87, 88, 92, 94, 95, 96, 99, 100, 102, 104, 106, 107, 109, 110, 112, 113, 115, 116, 117, 120, 121, 122, 123, 125, 126, 127, 129, 130, 131, 132, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 146, 147, 148, 149, 150, 151, 152, 154, 154, 155, 156, 158, 159, 159, 161, 162, 163, 163, 165, 166, 166, 168, 169, 169, 170, 172, 172, 173, 175, 175, 176, 177, 178, 179, 180, 181, 182, 182, 184, 184, 185, 186, 187, 188, 188, 190, 190, 191, 192, 193, 194, 194, 196, 196, 197, 197, 199, 199, 200, 201, 202, 202, 203, 204, 205, 205, 207, 207, 208, 208, 210, 210, 211, 212, 213, 213, 214, 215, 215, 216, 217, 218, 218, 219, 220, 221, 221, 222, 223, 223, 224, 225, 226, 226, 227, 228, 228, 229, 230, 230, 231, 232, 232, 233, 234, 235, 235, 236, 237, 237, 238, 239, 239, 240, 240, 241, 242, 242, 243, 244, 244, 245, 246, 246, 247, 248, 248, 249, 249, 250, 251, 251, 252, 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                for (int i = 0; i < 256; i++){
-                    arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                    arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                    arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                    arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                int[] arrayOfInt5 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                    arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                    arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                    arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                    arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicCrayonFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicCrayonFilter.java
deleted file mode 100755
index e111f2f..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicCrayonFilter.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicCrayonFilter extends GPUImageFilter{
-    
-    private int mSingleStepOffsetLocation;
-    //1.0 - 5.0
-    private int mStrengthLocation;
-    
-    public MagicCrayonFilter(){
-        super(MagicFilterType.CRAYON, R.raw.crayon);
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        mSingleStepOffsetLocation = GLES20.glGetUniformLocation(getProgram(), "singleStepOffset");
-        mStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-        setFloat(mStrengthLocation, 2.0f);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mStrengthLocation, 0.5f);
-    }
-
-    @Override
-    public void onInputSizeChanged(final int width, final int height) {
-        setFloatVec2(mSingleStepOffsetLocation, new float[] {1.0f / width, 1.0f / height});
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicEarlyBirdFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicEarlyBirdFilter.java
deleted file mode 100755
index 2666e81..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicEarlyBirdFilter.java
+++ /dev/null
@@ -1,71 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicEarlyBirdFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1,-1,-1};
-    protected int mGLStrengthLocation;
-
-    public MagicEarlyBirdFilter(){
-        super(MagicFilterType.EARLYBIRD, R.raw.earlybird);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++)
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture"+(2+i));
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(),    "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/earlybirdcurves.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/earlybirdoverlaymap_new.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/vignettemap_new.png");
-                inputTextureHandles[3] = OpenGLUtils.loadTexture(getContext(), "filter/earlybirdblowout.png");
-                inputTextureHandles[4] = OpenGLUtils.loadTexture(getContext(), "filter/earlybirdmap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicEmeraldFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicEmeraldFilter.java
deleted file mode 100755
index 5610e5b..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicEmeraldFilter.java
+++ /dev/null
@@ -1,89 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicEmeraldFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-      
-    public MagicEmeraldFilter(){
-        super(MagicFilterType.EMERALD, R.raw.emerald);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);            
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 17, 18, 19, 21, 22, 23, 25, 26, 29, 30, 31, 32, 34, 35, 36, 39, 40, 41, 43, 44, 45, 46, 48, 50, 51, 53, 54, 55, 56, 58, 60, 61, 62, 64, 65, 66, 67, 69, 70, 72, 73, 75, 76, 77, 78, 79, 81, 82, 84, 85, 87, 88, 89, 90, 91, 92, 94, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 184, 185, 186, 187, 188, 189, 190, 191, 191, 192, 193, 194, 195, 196, 197, 197, 198, 199, 200, 201, 201, 202, 203, 204, 205, 206, 206, 207, 208, 209, 210, 210, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 226, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236, 236, 237, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244, 244, 245, 246, 247, 247, 248, 249, 249, 250, 251, 251, 252, 253, 254, 254, 255 };
-                int[] arrayOfInt2 = { 0, 0, 0, 0, 0, 0, 1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 16, 18, 19, 21, 22, 23, 25, 26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 50, 51, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 109, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 184, 185, 186, 188, 189, 190, 191, 191, 192, 193, 194, 195, 196, 197, 197, 198, 199, 200, 201, 201, 202, 203, 204, 205, 206, 206, 207, 209, 210, 210, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 226, 226, 227, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236, 237, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244, 244, 245, 247, 247, 248, 249, 249, 250, 251, 251, 252, 253, 254, 254, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt3 = { 0, 1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 25, 26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 49, 50, 51, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 184, 185, 186, 187, 188, 189, 190, 191, 191, 192, 193, 194, 195, 196, 197, 197, 198, 199, 200, 201, 201, 202, 203, 204, 205, 206, 206, 207, 208, 209, 210, 210, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 226, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236, 237, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244, 244, 245, 246, 247, 247, 248, 249, 249, 250, 251, 251, 252, 253, 254, 254, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = -1;
-                }
-                int[] arrayOfInt4 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 200, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 224, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt5 = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 223, 224, 225, 226, 227, 228, 229, 229, 230, 231, 232, 233, 233, 234, 235, 236, 237, 237, 238, 239, 240, 240, 241, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 249, 249, 250, 250, 251, 252, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt6 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt4[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt6[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicEvergreenFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicEvergreenFilter.java
deleted file mode 100755
index 0a2eed2..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicEvergreenFilter.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicEvergreenFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-      
-    public MagicEvergreenFilter(){
-        super(MagicFilterType.EVERGREEN, R.raw.evergreen);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-    @Override protected void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[1024];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 };
-                int[] arrayOfInt2 = { 0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 24, 25, 26, 27, 28, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 148, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 241, 242, 243, 244, 245, 246, 247, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255 };
-                int[] arrayOfInt3 = { 0, 2, 4, 6, 8, 10, 11, 13, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28, 29, 31, 32, 33, 34, 36, 38, 39, 40, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 56, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144, 145, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 157, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 168, 169, 170, 171, 172, 173, 175, 176, 177, 177, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187, 188, 190, 191, 192, 193, 193, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 205, 206, 207, 207, 208, 209, 210, 211, 212, 213, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 225, 226, 227, 228, 229, 230, 231, 231, 232, 234, 235, 236, 237, 237, 238, 239, 240, 241, 242, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt4 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int i = 0; i < 256; i++)
-                {
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicFreudFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicFreudFilter.java
deleted file mode 100755
index 0d8659e..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicFreudFilter.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicFreudFilter extends GPUImageFilter {
-    private int mTexelHeightUniformLocation;
-    private int mTexelWidthUniformLocation;
-    private int[] inputTextureHandles = {-1};
-    private int[] inputTextureUniformLocations = {-1};
-    private int mGLStrengthLocation;
-
-    public MagicFreudFilter(){
-        super(MagicFilterType.FREUD, R.raw.freud);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for (int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for (int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        inputTextureUniformLocations[0] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture2");
-        
-        mTexelWidthUniformLocation = GLES20.glGetUniformLocation(getProgram(), "inputImageTextureWidth");
-        mTexelHeightUniformLocation = GLES20.glGetUniformLocation(getProgram(), "inputImageTextureHeight");
-
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/freud_rand.png");
-            }
-        });
-    }
-
-    @Override
-    public void onInputSizeChanged(int width, int height) {
-        super.onInputSizeChanged(width, height);
-        GLES20.glUniform1f(mTexelWidthUniformLocation, (float)width);
-        GLES20.glUniform1f(mTexelHeightUniformLocation, (float)height);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicHealthyFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicHealthyFilter.java
deleted file mode 100755
index 1a529b9..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicHealthyFilter.java
+++ /dev/null
@@ -1,113 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicHealthyFilter extends GPUImageFilter{
-
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    private int mTexelHeightUniformLocation;
-    private int mTexelWidthUniformLocation;
-
-    public MagicHealthyFilter(){
-        super(MagicFilterType.HEALTHY, R.raw.healthy);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-        int[] texture = new int[1];
-        texture[0] = mMaskGrey1TextureId;
-        GLES20.glDeleteTextures(1, texture, 0);
-        mMaskGrey1TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-          GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-          GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-          GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "mask");
-        mTexelWidthUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelWidthOffset");
-        mTexelHeightUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelHeightOffset");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[1024];
-                int[] arrayOfInt1 = { 95, 95, 96, 97, 97, 98, 99, 99, 100, 101, 101, 102, 103, 104, 104, 105, 106, 106, 107, 108, 108, 109, 110, 111, 111, 112, 113, 113, 114, 115, 115, 116, 117, 117, 118, 119, 120, 120, 121, 122, 122, 123, 124, 124, 125, 126, 127, 127, 128, 129, 129, 130, 131, 131, 132, 133, 133, 134, 135, 136, 136, 137, 138, 138, 139, 140, 140, 141, 142, 143, 143, 144, 145, 145, 146, 147, 147, 148, 149, 149, 150, 151, 152, 152, 153, 154, 154, 155, 156, 156, 157, 158, 159, 159, 160, 161, 161, 162, 163, 163, 164, 165, 165, 166, 167, 168, 168, 169, 170, 170, 171, 172, 172, 173, 174, 175, 175, 176, 177, 177, 178, 179, 179, 180, 181, 181, 182, 183, 184, 184, 185, 186, 186, 187, 188, 188, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 200, 200, 201, 202, 202, 203, 204, 204, 205, 206, 207, 207, 208, 209, 209, 210, 211, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 239, 239, 240, 241, 241, 242, 243, 243, 244, 245, 245, 246, 247, 248, 248, 249, 250, 250, 251, 252, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 4, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 204, 205, 206, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 219, 220, 221, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 234, 235, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt3 = { 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 249, 250, 251, 252, 253, 254, 255 };
-                for (int i = 0; i < 256; i++)
-                {
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(3 + i * 4)] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/healthy_mask_1.jpg");
-            }
-        });
-    }
-
-    @Override
-    public void onInputSizeChanged(int width, int height){
-        super.onInputSizeChanged(width, height);
-        GLES20.glUniform1f(mTexelWidthUniformLocation, 1.0f / width);
-        GLES20.glUniform1f(mTexelHeightUniformLocation, 1.0f / height);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicHefeFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicHefeFilter.java
deleted file mode 100755
index ab01e47..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicHefeFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicHefeFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicHefeFilter(){
-        super(MagicFilterType.HEFE, R.raw.hefe);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/edgeburn.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/hefemap.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/hefemetal.png");
-                inputTextureHandles[3] = OpenGLUtils.loadTexture(getContext(), "filter/hefesoftlight.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicHudsonFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicHudsonFilter.java
deleted file mode 100755
index cdb7caf..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicHudsonFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicHudsonFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicHudsonFilter(){
-        super(MagicFilterType.HUDSON, R.raw.hudson);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/ohudsonbackground.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/overlaymap.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/hudsonmap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicImageAdjustFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicImageAdjustFilter.java
deleted file mode 100755
index ecd3701..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicImageAdjustFilter.java
+++ /dev/null
@@ -1,55 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import com.seu.magicfilter.base.MagicBaseGroupFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageBrightnessFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageContrastFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageExposureFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageHueFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageSaturationFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageSharpenFilter;
-
-public class MagicImageAdjustFilter extends MagicBaseGroupFilter{
-    
-    public MagicImageAdjustFilter() {
-        super(initFilters());
-    }
-    
-    private static List<GPUImageFilter> initFilters(){
-        List<GPUImageFilter> filters = new ArrayList<GPUImageFilter>();
-        filters.add(new GPUImageContrastFilter());
-        filters.add(new GPUImageBrightnessFilter());
-        filters.add(new GPUImageExposureFilter());
-        filters.add(new GPUImageHueFilter());
-        filters.add(new GPUImageSaturationFilter());
-        filters.add(new GPUImageSharpenFilter());
-        return filters;        
-    }
-    
-    public void setSharpness(final float range){
-        ((GPUImageSharpenFilter) filters.get(5)).setSharpness(range);
-    }
-    
-    public void setHue(final float range){
-        ((GPUImageHueFilter) filters.get(3)).setHue(range);
-    }
-    
-    public void setBrightness(final float range){
-        ((GPUImageBrightnessFilter) filters.get(1)).setBrightness(range);
-    }
-    
-    public void setContrast(final float range){
-        ((GPUImageContrastFilter) filters.get(0)).setContrast(range);
-    }
-    
-    public void setSaturation(final float range){
-        ((GPUImageSaturationFilter) filters.get(4)).setSaturation(range);
-    }
-    
-    public void setExposure(final float range){
-        ((GPUImageExposureFilter) filters.get(2)).setExposure(range);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicInkwellFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicInkwellFilter.java
deleted file mode 100755
index b09b5e0..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicInkwellFilter.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicInkwellFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1};
-    private int[] inputTextureUniformLocations = {-1};
-    private int mGLStrengthLocation;
-
-    public MagicInkwellFilter(){
-        super(MagicFilterType.INKWELL, R.raw.inkwell);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/inkwellmap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicKevinFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicKevinFilter.java
deleted file mode 100755
index 94d74ff..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicKevinFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicKevinFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1};
-    private int[] inputTextureUniformLocations = {-1};
-    private int mGLStrengthLocation;
-
-    public MagicKevinFilter(){
-        super(MagicFilterType.KEVIN, R.raw.kevin_new);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, inputTextureHandles, 0);
-        for (int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();        
-        for(int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/kelvinmap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicLatteFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicLatteFilter.java
deleted file mode 100755
index da0e9d9..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicLatteFilter.java
+++ /dev/null
@@ -1,89 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicLatteFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-      
-    public MagicLatteFilter(){
-        super(MagicFilterType.LATTE, R.raw.latte);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 5, 6, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 49, 50, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 120, 121, 122, 123, 124, 125, 125, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 134, 135, 136, 137, 137, 138, 139, 140, 141, 141, 142, 143, 144, 145, 145, 146, 147, 148, 148, 149, 150, 151, 151, 152, 153, 154, 155, 155, 156, 157, 158, 158, 159, 160, 161, 162, 162, 163, 164, 165, 166, 166, 167, 168, 169, 170, 170, 171, 172, 173, 174, 174, 175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 183, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 };
-                int[] arrayOfInt2 = { 5, 6, 8, 11, 12, 14, 15, 18, 19, 21, 22, 25, 26, 28, 29, 32, 33, 34, 36, 39, 40, 41, 43, 44, 46, 48, 49, 50, 52, 54, 55, 56, 58, 59, 61, 62, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 108, 108, 109, 110, 111, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 120, 121, 122, 123, 125, 125, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 134, 135, 136, 137, 137, 138, 139, 140, 141, 141, 142, 143, 144, 145, 145, 146, 147, 148, 148, 149, 149, 150, 151, 151, 152, 153, 154, 155, 155, 156, 157, 158, 158, 159, 160, 161, 162, 162, 163, 164, 165, 165, 166, 166, 167, 168, 169, 170, 170, 171, 172, 173, 174, 174, 175, 175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 183, 184, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 192, 193, 194, 195, 196, 197, 198, 198, 199, 199, 200, 201, 202, 203, 204, 205, 206, 206, 207, 208, 209, 210, 211, 212, 213, 213, 214, 215, 215, 216, 217, 218, 219, 219, 220, 221, 222, 223, 224, 225, 226, 226, 227, 228, 229, 230, 231, 232, 232, 233, 234, 235, 237, 238, 239, 240, 240, 241, 242, 243, 244, 245, 246, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255 };
-                int[] arrayOfInt3 = { 5, 6, 8, 11, 12, 14, 15, 16, 18, 21, 22, 23, 25, 26, 28, 30, 32, 33, 34, 36, 37, 40, 41, 43, 44, 45, 46, 49, 50, 52, 53, 54, 55, 58, 59, 60, 61, 62, 64, 66, 67, 68, 69, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 83, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 108, 109, 111, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 120, 121, 123, 124, 125, 125, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 134, 135, 136, 137, 137, 138, 139, 140, 141, 141, 142, 143, 144, 145, 145, 146, 147, 148, 148, 149, 150, 151, 151, 152, 153, 154, 155, 155, 156, 156, 157, 158, 158, 159, 160, 161, 162, 162, 163, 164, 165, 165, 166, 166, 167, 168, 169, 170, 170, 170, 171, 172, 173, 174, 174, 175, 176, 176, 177, 178, 178, 179, 180, 180, 181, 182, 183, 183, 184, 184, 185, 186, 187, 188, 189, 189, 189, 190, 191, 192, 192, 193, 194, 195, 196, 196, 197, 198, 198, 199, 199, 200, 201, 202, 202, 203, 204, 205, 206, 206, 207, 208, 209, 209, 210, 211, 212, 213, 213, 214, 215, 215, 215, 216, 217, 218, 218, 219, 220, 221, 221, 222, 223, 224, 224, 225, 226, 227, 227, 228, 229, 230, 230, 231, 232, 233, 233, 234, 235, 237, 237, 238, 239, 240, 240, 241, 242, 243, 243, 244 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = -1;
-                }
-                int[] arrayOfInt4 = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 83, 84, 85, 86, 87, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 179, 180, 181, 182, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 206, 208, 209, 210, 211, 212, 214, 215, 216, 217, 218, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 232, 233, 234, 235, 236, 238, 239, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240 };
-                int[] arrayOfInt5 = { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63, 65, 66, 67, 69, 70, 71, 73, 74, 75, 77, 78, 79, 81, 82, 83, 85, 86, 87, 89, 90, 92, 93, 94, 96, 97, 99, 100, 101, 103, 104, 106, 107, 108, 110, 111, 113, 114, 116, 117, 119, 120, 121, 123, 124, 126, 127, 129, 130, 132, 133, 135, 136, 138, 139, 140, 142, 143, 145, 146, 147, 149, 150, 151, 153, 154, 155, 157, 158, 159, 160, 162, 163, 164, 165, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 210, 211, 212, 213, 214, 215, 215, 216, 217, 218, 219, 219, 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 231, 232, 233, 233, 234, 234, 235, 236, 236, 237, 237, 238, 238, 239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 244, 245, 245, 246, 246, 247, 247, 247, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252, 252, 253, 253, 253, 253, 254, 254, 254, 254, 255, 255, 255 };
-                int[] arrayOfInt6 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt4[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt6[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicLomoFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicLomoFilter.java
deleted file mode 100755
index e874d9e..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicLomoFilter.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicLomoFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicLomoFilter(){
-        super(MagicFilterType.LOMO, R.raw.lomo);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/vlomomap_new.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/vignette_map.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicN1977Filter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicN1977Filter.java
deleted file mode 100755
index fecd45b..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicN1977Filter.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicN1977Filter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicN1977Filter(){
-        super(MagicFilterType.N1977, R.raw.n1977);
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/n1977map.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/n1977blowout.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicNashvilleFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicNashvilleFilter.java
deleted file mode 100755
index 47a75cd..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicNashvilleFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicNashvilleFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1};
-    private int[] inputTextureUniformLocations = {-1};
-    private int mGLStrengthLocation;
-
-    public MagicNashvilleFilter(){
-        super(MagicFilterType.NASHVILLE, R.raw.nashville);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/nashvillemap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicNostalgiaFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicNostalgiaFilter.java
deleted file mode 100755
index 5d292f3..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicNostalgiaFilter.java
+++ /dev/null
@@ -1,141 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicNostalgiaFilter extends GPUImageFilter{
-    private int mBlurSizeUniformLocation;
-    private int mTexelWidthUniformLocation;
-    private int mTexelHeightUniformLocation;
-    private int[] mToneCurveTexture = { -1 };
-    private int[] mToneCurveTexture2 = { -1 };
-    private int mToneCurveTextureUniformLocation;
-    private int mToneCurveTextureUniformLocation2;
-      
-    public MagicNostalgiaFilter(){
-        super(MagicFilterType.NOSTALGIA, R.raw.nostalgia);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-        GLES20.glDeleteTextures(1, mToneCurveTexture2, 0);
-        mToneCurveTexture2[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mToneCurveTexture2[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mToneCurveTexture2[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation2, 4);
-        }
-        GLES20.glUniform1f(mBlurSizeUniformLocation, 1.0F);
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mToneCurveTextureUniformLocation2 = GLES20.glGetUniformLocation(getProgram(), "curve2");
-        mTexelWidthUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelWidthOffset");
-        mTexelHeightUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelHeightOffset");
-        mBlurSizeUniformLocation = GLES20.glGetUniformLocation(getProgram(), "blurSize");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte1 = new byte[2048];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 6, 8, 9, 11, 13, 15, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 39, 41, 43, 45, 47, 49, 50, 52, 54, 56, 57, 59, 61, 62, 64, 66, 68, 69, 71, 72, 74, 76, 77, 79, 80, 82, 84, 85, 87, 88, 90, 91, 93, 94, 96, 97, 98, 100, 101, 103, 104, 106, 107, 108, 110, 111, 112, 114, 115, 116, 118, 119, 120, 122, 123, 124, 125, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 140, 141, 142, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 174, 175, 176, 177, 178, 179, 180, 181, 182, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193, 193, 194, 195, 196, 197, 197, 198, 199, 200, 201, 201, 202, 203, 204, 204, 205, 206, 207, 207, 208, 209, 210, 210, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 219, 219, 220, 221, 221, 222, 223, 224, 224, 225, 226, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 233, 234, 235, 235, 236, 237, 237, 238, 239, 240, 240, 241, 242, 242, 243, 244, 244, 245, 246, 246, 247, 248, 248, 249, 250, 250, 251, 252, 252, 253, 254, 254, 255 };
-                int[] arrayOfInt2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 14, 16, 17, 19, 20, 22, 23, 24, 26, 27, 29, 30, 32, 33, 34, 36, 37, 39, 40, 42, 43, 44, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 69, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 89, 91, 92, 93, 95, 96, 97, 99, 100, 101, 103, 104, 105, 107, 108, 109, 111, 112, 113, 115, 116, 117, 119, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 158, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200, 201, 202, 203, 204, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 219, 219, 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 231, 232, 233, 233, 234, 235, 235, 236, 237, 237, 238, 239, 239, 240, 241, 241, 242, 243, 243, 244, 244, 245, 246, 246, 247, 248, 248, 249, 249, 250, 251, 251, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt3 = { 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 31, 33, 35, 37, 39, 41, 43, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 82, 84, 85, 87, 89, 91, 92, 94, 96, 98, 99, 101, 102, 104, 106, 107, 109, 110, 112, 114, 115, 117, 118, 119, 121, 122, 124, 125, 126, 128, 129, 130, 132, 133, 134, 135, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 175, 176, 177, 178, 178, 179, 180, 181, 181, 182, 183, 184, 184, 185, 186, 186, 187, 188, 189, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 200, 200, 201, 202, 202, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 210, 210, 211, 212, 212, 213, 214, 214, 215, 216, 216, 217, 218, 218, 219, 220, 220, 221, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 231, 232, 233, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 240, 240, 241, 242, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 249, 249, 250, 251, 251, 252, 252, 253, 254, 254, 255 };
-                int[] arrayOfInt4 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte1[(0 + i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte1[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte1[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte1[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                int[] arrayOfInt5 = { 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 32, 33, 34, 35, 37, 38, 39, 41, 42, 43, 44, 46, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 60, 61, 62, 64, 65, 66, 67, 69, 70, 71, 72, 74, 75, 76, 77, 79, 80, 81, 82, 84, 85, 86, 87, 88, 90, 91, 92, 93, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 174, 175, 176, 177, 178, 179, 180, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 204, 205, 206, 207, 209, 210, 211, 213, 214, 215, 217, 218, 220, 221, 222, 224, 225, 227, 228, 230, 231, 233, 234, 235, 237, 238, 240, 241, 243, 244, 246, 247, 249, 250, 252, 253, 255 };
-                int[] arrayOfInt6 = { 0, 3, 6, 8, 11, 14, 16, 18, 21, 24, 26, 29, 30, 33, 35, 37, 39, 41, 43, 45, 47, 49, 50, 52, 53, 54, 56, 58, 59, 61, 62, 63, 65, 65, 66, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 127, 128, 129, 130, 131, 132, 133, 134, 135, 135, 135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 146, 147, 147, 148, 149, 149, 150, 151, 152, 153, 154, 154, 155, 156, 157, 158, 158, 159, 159, 160, 161, 161, 162, 163, 164, 164, 165, 166, 167, 167, 168, 169, 170, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 179, 179, 180, 181, 181, 182, 182, 183, 183, 184, 185, 186, 186, 187, 188, 188, 189, 190, 191, 191, 192, 193, 193, 194, 194, 194, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 205, 205, 206, 206, 207, 207, 208, 209, 209, 210, 211, 211, 212, 213, 213, 214, 215, 215, 216, 217, 217, 217, 217, 218, 219, 219, 220, 221, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 228, 229, 229, 229, 230, 231, 231, 232, 232, 233, 234, 234, 235 };
-                int[] arrayOfInt7 = { 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 102, 103, 104, 106, 107, 108, 109, 111, 112, 113, 115, 116, 118, 119, 120, 122, 123, 125, 126, 128, 129, 131, 132, 134, 135, 137, 139, 140, 142, 143, 145, 147, 148, 150, 152, 153, 155, 156, 158, 160, 161, 163, 164, 166, 167, 169, 170, 172, 173, 175, 176, 178, 179, 180, 182, 183, 185, 186, 188, 189, 190, 192, 193, 194, 196, 197, 199, 200, 201, 203, 204, 205, 206, 208, 209, 210, 212, 213, 214, 216, 217, 218, 219, 221, 222, 223, 224, 226, 227, 228, 229, 231, 232, 233, 234, 236, 237, 238, 239, 240, 242, 243, 244, 245, 247, 248, 249, 250, 251, 253, 254, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte1[(0 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte1[(1 + (1024 + j * 4))] = ((byte)arrayOfInt6[j]);
-                  arrayOfByte1[(2 + (1024 + j * 4))] = ((byte)arrayOfInt7[j]);
-                  arrayOfByte1[(3 + (1024 + j * 4))] = ((byte)arrayOfInt4[j]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte1));
-                GLES20.glGenTextures(1, mToneCurveTexture2, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture2[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-                byte[] arrayOfByte2 = new byte[1024];
-                int[] arrayOfInt8 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                int[] arrayOfInt9 = { 42, 43, 43, 44, 45, 45, 46, 47, 48, 48, 49, 50, 50, 51, 52, 52, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 62, 63, 64, 65, 65, 66, 67, 67, 68, 69, 70, 70, 71, 72, 72, 73, 74, 75, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 84, 85, 86, 87, 87, 88, 89, 90, 91, 91, 92, 93, 94, 94, 95, 96, 97, 98, 98, 99, 100, 101, 102, 103, 103, 104, 105, 106, 107, 108, 108, 109, 110, 111, 112, 113, 113, 114, 115, 116, 117, 118, 119, 120, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 201, 202, 203, 204, 205, 206, 207, 207, 208, 209, 210, 211, 212, 212, 213, 214, 215, 216, 217, 217, 218, 219, 220, 221, 221, 222, 223, 224, 224, 225, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236, 237, 237, 238, 239, 240, 240, 241, 242, 243, 243, 244, 245, 246, 246, 247, 248, 248, 249, 250, 251, 251, 252, 253, 254, 254, 255 };
-                int[] arrayOfInt10 = { 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200, 201, 202, 203, 204, 205, 205, 206, 207, 208, 209, 210, 211, 211, 212, 213, 214, 215, 215, 216, 217, 218, 219, 219, 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 227, 228, 229, 230, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 238, 239, 240, 240, 241, 241, 242, 243, 243, 244, 245, 245, 246, 246, 247, 248, 248, 249, 250, 250, 251, 251, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt11 = { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 174, 175, 176, 177, 178, 179, 180, 181, 182, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 193, 194, 195, 196, 197, 197, 198, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 219, 219, 220, 221, 222, 223, 224, 224, 225, 226, 227, 228, 229, 229, 230, 231, 232, 233, 234, 234, 235, 236, 237, 238, 239, 239, 240, 241, 242, 243, 244, 244, 245, 246, 247, 248, 248, 249, 250, 251, 252, 253, 253, 254, 255 };
-                for (int k = 0; k < 256; k++){
-                  arrayOfByte2[(0 + k * 4)] = ((byte)arrayOfInt9[k]);
-                  arrayOfByte2[(1 + k * 4)] = ((byte)arrayOfInt10[k]);
-                  arrayOfByte2[(2 + k * 4)] = ((byte)arrayOfInt11[k]);
-                  arrayOfByte2[(3 + k * 4)] = ((byte)arrayOfInt8[k]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte2));
-            }
-        });
-    }
-
-    @Override
-    public void onInputSizeChanged(int width, int height) {
-        super.onInputSizeChanged(width, height);
-        GLES20.glUniform1f(mTexelWidthUniformLocation, (1.0f / (float)width));
-        GLES20.glUniform1f(mTexelHeightUniformLocation, (1.0f / (float)height));
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicPixarFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicPixarFilter.java
deleted file mode 100755
index 9f7d396..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicPixarFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicPixarFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1};
-    private int[] inputTextureUniformLocations = {-1};
-    private int mGLStrengthLocation;
-
-    public MagicPixarFilter(){
-        super(MagicFilterType.PIXAR, R.raw.pixar);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, inputTextureHandles, 0);
-        for (int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for (int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for (int i=0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/pixar_curves.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicRiseFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicRiseFilter.java
deleted file mode 100755
index b6585c5..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicRiseFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicRiseFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1};
-    
-    public MagicRiseFilter(){
-        super(MagicFilterType.RISE, R.raw.rise);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i=0; i < inputTextureUniformLocations.length; i++){
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture"+(2+i));
-        }
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/blackboard1024.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/overlaymap.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/risemap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicRomanceFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicRomanceFilter.java
deleted file mode 100755
index 4d1c322..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicRomanceFilter.java
+++ /dev/null
@@ -1,82 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicRomanceFilter extends GPUImageFilter{
-    
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-      
-    public MagicRomanceFilter(){
-        super(MagicFilterType.ROMANCE, R.raw.romance);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                
-                byte[] romance_arrayOfByte = new byte[1024];
-                int[] romance_arrayOfInt1 = { 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 66, 67, 67, 68, 69, 69, 70, 71, 72, 73, 73, 74, 75, 76, 77, 78, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110, 111, 112, 113, 114, 116, 117, 118, 119, 120, 122, 123, 124, 125, 127, 128, 129, 130, 131, 133, 134, 135, 136, 138, 139, 140, 141, 143, 144, 145, 146, 148, 149, 150, 151, 153, 154, 155, 156, 158, 159, 160, 161, 162, 164, 165, 166, 167, 169, 170, 171, 172, 173, 175, 176, 177, 178, 179, 180, 182, 183, 184, 185, 186, 187, 188, 189, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 211, 212, 213, 214, 215, 216, 217, 218, 218, 219, 220, 221, 222, 222, 223, 224, 225, 226, 226, 227, 228, 229, 229, 230, 231, 232, 232, 233, 234, 234, 235, 236, 237, 237, 238, 239, 239, 240, 241, 241, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 249, 249, 250, 251, 251, 252, 252, 253, 254, 254, 255 };
-                int[] romance_arrayOfInt2 = { 0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 49, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 };
-                int[] romance_arrayOfInt3 = { 0, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 20, 21, 23, 25, 26, 28, 30, 31, 33, 34, 36, 38, 39, 41, 42, 44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 65, 66, 67, 69, 70, 72, 73, 74, 76, 77, 78, 79, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112, 112, 113, 114, 115, 116, 116, 117, 118, 119, 119, 120, 121, 122, 122, 123, 124, 124, 125, 126, 126, 127, 128, 128, 129, 130, 130, 131, 131, 132, 133, 133, 134, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140, 141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147, 148, 149, 149, 150, 150, 151, 152, 152, 153, 154, 154, 155, 155, 156, 157, 157, 158, 159, 159, 160, 161, 162, 162, 163, 164, 164, 165, 166, 167, 168, 168, 169, 170, 171, 172, 172, 173, 174, 175, 176, 177, 177, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 251, 252, 253, 254, 255 };
-                int[] romance_arrayOfInt4 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int i = 0; i < 256; i++){
-                    romance_arrayOfByte[(i * 4)] = ((byte)romance_arrayOfInt1[i]);
-                    romance_arrayOfByte[(1 + i * 4)] = ((byte)romance_arrayOfInt2[i]);
-                    romance_arrayOfByte[(2 + i * 4)] = ((byte)romance_arrayOfInt3[i]);
-                    romance_arrayOfByte[(3 + i * 4)] = ((byte)romance_arrayOfInt4[i]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(romance_arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSakuraFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSakuraFilter.java
deleted file mode 100755
index 92a2b0d..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSakuraFilter.java
+++ /dev/null
@@ -1,90 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicSakuraFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-    private int mTexelHeightUniformLocation;
-    private int mTexelWidthUniformLocation;
-    
-    public MagicSakuraFilter(){
-        super(MagicFilterType.SAKURA, R.raw.romance);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mTexelWidthUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelWidthOffset");
-        mTexelHeightUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelHeightOffset");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[1024];
-                int[] arrayOfInt = { 95, 95, 96, 97, 97, 98, 99, 99, 100, 101, 101, 102, 103, 104, 104, 105, 106, 106, 107, 108, 108, 109, 110, 111, 111, 112, 113, 113, 114, 115, 115, 116, 117, 117, 118, 119, 120, 120, 121, 122, 122, 123, 124, 124, 125, 126, 127, 127, 128, 129, 129, 130, 131, 131, 132, 133, 133, 134, 135, 136, 136, 137, 138, 138, 139, 140, 140, 141, 142, 143, 143, 144, 145, 145, 146, 147, 147, 148, 149, 149, 150, 151, 152, 152, 153, 154, 154, 155, 156, 156, 157, 158, 159, 159, 160, 161, 161, 162, 163, 163, 164, 165, 165, 166, 167, 168, 168, 169, 170, 170, 171, 172, 172, 173, 174, 175, 175, 176, 177, 177, 178, 179, 179, 180, 181, 181, 182, 183, 184, 184, 185, 186, 186, 187, 188, 188, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 200, 200, 201, 202, 202, 203, 204, 204, 205, 206, 207, 207, 208, 209, 209, 210, 211, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 239, 239, 240, 241, 241, 242, 243, 243, 244, 245, 245, 246, 247, 248, 248, 249, 250, 250, 251, 252, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                for (int i = 0; i < 256; i++)
-                {
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt[i]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-
-    @Override
-    public void onInputSizeChanged(int width, int height) {
-        super.onInputSizeChanged(width, height);
-        GLES20.glUniform1f(mTexelWidthUniformLocation, (1.0f / (float)width));
-        GLES20.glUniform1f(mTexelHeightUniformLocation, (1.0f / (float)height));
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSierraFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSierraFilter.java
deleted file mode 100755
index 3655ea1..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSierraFilter.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicSierraFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicSierraFilter(){
-        super(MagicFilterType.SIERRA, R.raw.sierra);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++)
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture"+(2+i));
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/sierravignette.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/overlaymap.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/sierramap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSketchFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSketchFilter.java
deleted file mode 100755
index 8d60c66..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSketchFilter.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicSketchFilter extends GPUImageFilter{
-    
-    private int mSingleStepOffsetLocation;
-    //0.0 - 1.0
-    private int mStrengthLocation;
-    
-    public MagicSketchFilter(){
-        super(MagicFilterType.SKETCH, R.raw.sketch);
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        mSingleStepOffsetLocation = GLES20.glGetUniformLocation(getProgram(), "singleStepOffset");
-        mStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        setFloat(mStrengthLocation, 0.5f);
-    }
-
-    @Override
-    public void onInputSizeChanged(final int width, final int height) {
-        super.onInputSizeChanged(width, height);
-        setFloatVec2(mSingleStepOffsetLocation, new float[] {1.0f / width, 1.0f / height});
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSkinWhitenFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSkinWhitenFilter.java
deleted file mode 100755
index 7778cf7..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSkinWhitenFilter.java
+++ /dev/null
@@ -1,94 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicSkinWhitenFilter extends GPUImageFilter {
-    private int mTexelHeightUniformLocation;
-    private int mTexelWidthUniformLocation;
-    private int mToneCurveTextureUniformLocation;
-    private int[] mToneCurveTexture = new int[] {-1};
-    
-    public MagicSkinWhitenFilter() {
-        super(MagicFilterType.SKINWHITEN, R.raw.skinwhiten);
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mTexelWidthUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelWidthOffset");
-        mTexelHeightUniformLocation = GLES20.glGetUniformLocation(getProgram(), "texelHeightOffset");
-
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable() {
-            public void run() {
-                GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                
-                byte[] arrayOfByte = new byte[1024];
-                int[] arrayOfInt1 = { 95, 95, 96, 97, 97, 98, 99, 99, 100, 101, 101, 102, 103, 104, 104, 105, 106, 106, 107, 108, 108, 109, 110, 111, 111, 112, 113, 113, 114, 115, 115, 116, 117, 117, 118, 119, 120, 120, 121, 122, 122, 123, 124, 124, 125, 126, 127, 127, 128, 129, 129, 130, 131, 131, 132, 133, 133, 134, 135, 136, 136, 137, 138, 138, 139, 140, 140, 141, 142, 143, 143, 144, 145, 145, 146, 147, 147, 148, 149, 149, 150, 151, 152, 152, 153, 154, 154, 155, 156, 156, 157, 158, 159, 159, 160, 161, 161, 162, 163, 163, 164, 165, 165, 166, 167, 168, 168, 169, 170, 170, 171, 172, 172, 173, 174, 175, 175, 176, 177, 177, 178, 179, 179, 180, 181, 181, 182, 183, 184, 184, 185, 186, 186, 187, 188, 188, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 200, 200, 201, 202, 202, 203, 204, 204, 205, 206, 207, 207, 208, 209, 209, 210, 211, 211, 212, 213, 213, 214, 215, 216, 216, 217, 218, 218, 219, 220, 220, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 229, 229, 230, 231, 232, 232, 233, 234, 234, 235, 236, 236, 237, 238, 239, 239, 240, 241, 241, 242, 243, 243, 244, 245, 245, 246, 247, 248, 248, 249, 250, 250, 251, 252, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt2 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(3 + i * 4)] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        super.onDrawArraysPre();
-        if(mToneCurveTexture[0] != -1) {
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(this.mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysAfter() {
-        super.onDrawArraysAfter();
-        if (mToneCurveTexture[0] != -1) {
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    public void onInputSizeChanged(int width, int height) {
-        super.onInputSizeChanged(width, height);
-        GLES20.glUniform1f(mTexelWidthUniformLocation, (1.0f / (float)width));
-        GLES20.glUniform1f(mTexelHeightUniformLocation, (1.0f / (float)height));
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSunriseFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSunriseFilter.java
deleted file mode 100755
index 30fa62f..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSunriseFilter.java
+++ /dev/null
@@ -1,145 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicSunriseFilter extends GPUImageFilter{
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    private int mMaskGrey2TextureId = -1;
-    private int mMaskGrey2UniformLocation;
-    private int mMaskGrey3TextureId = -1;
-    private int mMaskGrey3UniformLocation;
-    private int[] mToneCurveTexture = { -1 };
-    private int mToneCurveTextureUniformLocation;
-    
-    public MagicSunriseFilter(){
-        super(MagicFilterType.SUNRISE, R.raw.sunrise);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-        GLES20.glDeleteTextures(1, new int[]{mMaskGrey1TextureId}, 0);
-        mMaskGrey1TextureId = -1;
-        GLES20.glDeleteTextures(1, new int[]{mMaskGrey2TextureId}, 0);
-        mMaskGrey2TextureId = -1;
-        GLES20.glDeleteTextures(1, new int[]{mMaskGrey3TextureId}, 0);
-        mMaskGrey3TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey3TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE6);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-            GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey2TextureId);
-             GLES20.glUniform1i(mMaskGrey2UniformLocation, 5);
-        }
-        if (mMaskGrey3TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE6);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey3TextureId);
-             GLES20.glUniform1i(mMaskGrey3UniformLocation, 6);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey1Frame");
-        mMaskGrey2UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey2Frame");
-        mMaskGrey3UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey3Frame");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 34, 35, 36, 38, 39, 41, 42, 44, 45, 47, 49, 50, 52, 54, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 92, 94, 96, 98, 101, 103, 105, 107, 110, 111, 113, 115, 118, 120, 122, 124, 126, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 150, 152, 154, 156, 158, 159, 161, 162, 164, 166, 167, 169, 170, 172, 173, 174, 176, 177, 178, 180, 181, 182, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 203, 204, 205, 205, 207, 208, 208, 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 213, 214, 214, 215, 215, 215, 216, 216, 216, 216, 217, 217, 217, 218, 218, 218, 218, 219, 219, 219, 219, 219, 220, 220, 220, 220, 220, 220, 221, 221, 221, 221, 221, 222, 222, 222, 222, 222, 223, 223, 223, 223, 223, 224, 224, 224, 224, 224, 225, 225, 225, 225, 226, 226, 226, 227, 227, 227, 228, 228, 228, 229, 229, 230, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 238, 238, 239, 239, 241, 241, 242, 243, 243, 244, 245, 245, 246, 246, 247, 248, 248, 249, 250, 250, 251, 252, 252, 253, 254, 254, 255 };
-                int[] arrayOfInt2 = { 0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 17, 18, 19, 21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 44, 45, 47, 48, 50, 52, 54, 55, 57, 59, 61, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 83, 85, 87, 90, 92, 94, 96, 98, 101, 103, 105, 107, 110, 111, 113, 115, 117, 119, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 143, 145, 147, 149, 150, 152, 154, 155, 157, 158, 160, 161, 163, 164, 165, 167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 183, 184, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 198, 199, 199, 200, 200, 201, 201, 202, 202, 203, 203, 204, 204, 205, 205, 205, 207, 207, 208, 208, 208, 209, 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 222, 222, 222, 223, 223, 224, 224, 225, 225, 225, 226, 226, 227, 227, 228, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 236, 237, 237, 238, 238, 239, 239, 241, 241, 242, 242, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt3 = { 0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 53, 54, 56, 58, 60, 62, 64, 66, 68, 69, 71, 73, 76, 78, 80, 82, 83, 85, 87, 89, 91, 93, 95, 97, 100, 102, 104, 106, 108, 110, 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 130, 132, 134, 136, 138, 139, 141, 143, 145, 146, 148, 150, 151, 153, 155, 156, 158, 159, 161, 162, 164, 165, 167, 168, 169, 171, 172, 173, 175, 176, 177, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 204, 205, 207, 207, 208, 209, 209, 210, 210, 211, 212, 212, 213, 213, 214, 214, 215, 215, 215, 216, 216, 217, 217, 217, 218, 218, 218, 219, 219, 219, 220, 220, 220, 220, 221, 221, 221, 221, 222, 222, 222, 222, 223, 223, 223, 223, 224, 224, 224, 224, 224, 225, 225, 225, 225, 225, 226, 226, 226, 226, 227, 227, 227, 227, 228, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231, 231, 231, 232, 232, 233, 233, 233, 234, 234, 235, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 252, 252, 253, 253, 254, 254, 255 };
-                int[] arrayOfInt4 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                int[] arrayOfInt5 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 172, 173, 174, 175, 176, 177, 178, 180, 181, 182, 183, 184, 185, 187, 188, 189, 190, 191, 192, 194, 195, 196, 197, 198, 199, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226, 227, 228, 230, 231, 232, 233, 234, 235, 236, 238, 239, 240, 241, 242, 243, 245, 246, 247, 248, 249, 250, 252, 253, 254, 255 };
-                int[] arrayOfInt6 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 53, 55, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 68, 70, 71, 72, 73, 75, 76, 77, 78, 80, 81, 82, 83, 85, 86, 87, 88, 90, 91, 92, 93, 95, 96, 97, 98, 100, 101, 102, 103, 104, 106, 107, 108, 109, 111, 112, 113, 114, 116, 117, 118, 119, 121, 122, 123, 124, 126, 127, 128, 129, 131, 132, 133, 134, 136, 137, 138, 139, 141, 142, 143, 144, 146, 147, 148, 149, 151, 152, 153, 154, 155, 157, 158, 159, 160, 162, 163, 164, 165, 167, 168, 169, 170, 172, 173, 174, 175, 177, 178, 179, 180, 182, 183, 184, 185, 187, 188, 189, 190, 192, 193, 194, 195, 197, 198, 199, 200, 202, 203, 204, 205, 206, 208, 209, 210, 211, 213, 214, 215, 216, 218, 219, 220, 221, 223, 224, 225, 226, 228, 229, 230, 231, 233, 234, 235, 236, 238, 239, 240, 241, 243, 244, 245, 246, 248, 249, 250, 251, 253, 254, 255 };
-                int[] arrayOfInt7 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 152, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 211, 213, 214, 215, 216, 217, 218, 219, 221, 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 237, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 249, 250, 252, 253, 254, 255 };
-                int[] arrayOfInt8 = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21, 22, 22, 23, 24, 24, 25, 26, 27, 27, 28, 29, 30, 31, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 44, 44, 45, 46, 47, 48, 49, 50, 51, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 126, 127, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 162, 163, 164, 165, 166, 168, 169, 170, 171, 173, 174, 175, 176, 177, 179, 180, 181, 182, 184, 185, 186, 187, 189, 190, 191, 192, 194, 195, 196, 197, 199, 200, 201, 202, 204, 205, 206, 208, 209, 210, 211, 213, 214, 215, 217, 218, 219, 221, 222, 223, 224, 226, 227, 228, 230, 231, 232, 234, 235, 236, 238, 239, 240, 242, 243, 244, 246, 247, 248, 250, 251, 252, 254, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt6[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt7[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = ((byte)arrayOfInt8[j]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/amaro_mask1.jpg");
-                GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-                mMaskGrey2TextureId = OpenGLUtils.loadTexture(getContext(), "filter/amaro_mask2.jpg");
-                GLES20.glActiveTexture(GLES20.GL_TEXTURE6);
-                mMaskGrey3TextureId = OpenGLUtils.loadTexture(getContext(), "filter/toy_mask1.jpg");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSunsetFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSunsetFilter.java
deleted file mode 100755
index 8bd7439..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSunsetFilter.java
+++ /dev/null
@@ -1,122 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicSunsetFilter extends GPUImageFilter{
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    private int mMaskGrey2TextureId = -1;
-    private int mMaskGrey2UniformLocation;
-    private int[] mToneCurveTexture = { -1 };
-    private int mToneCurveTextureUniformLocation;
-    
-    public MagicSunsetFilter(){
-        super(MagicFilterType.SUNSET, R.raw.sunset);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(3, new int[]{mToneCurveTexture[0], mMaskGrey1TextureId, mMaskGrey2TextureId}, 0);        
-        mToneCurveTexture[0] = -1;
-        mMaskGrey1TextureId = -1;
-        mMaskGrey2TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-            GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey2TextureId);
-             GLES20.glUniform1i(mMaskGrey2UniformLocation, 5);
-        }
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey1Frame");
-        mMaskGrey2UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey2Frame");
-    }
-
-    @Override
-    protected void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 1, 2, 3, 5, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 16, 18, 19, 20, 21, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 51, 52, 54, 55, 56, 59, 60, 62, 63, 64, 66, 67, 70, 71, 72, 74, 76, 78, 79, 80, 83, 84, 85, 88, 89, 90, 93, 94, 95, 98, 99, 100, 102, 104, 106, 107, 108, 109, 112, 113, 114, 116, 117, 118, 119, 120, 122, 124, 125, 126, 128, 129, 130, 131, 132, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 142, 143, 145, 146, 147, 148, 148, 149, 150, 151, 151, 152, 153, 154, 155, 155, 156, 157, 157, 158, 159, 160, 160, 161, 162, 162, 163, 164, 165, 165, 166, 167, 167, 168, 169, 169, 170, 171, 171, 172, 173, 173, 174, 175, 175, 176, 177, 177, 178, 178, 179, 179, 180, 181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 188, 188, 189, 190, 190, 191, 192, 193, 193, 194, 194, 194, 195, 196, 197, 197, 198, 199, 200, 201, 201, 202, 203, 204, 204, 205, 206, 207, 208, 208, 208, 209, 210, 211, 212, 212, 213, 214, 215, 216, 217, 218, 218, 219, 220, 221, 222, 222, 223, 224, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 234, 235, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 247, 248, 248, 249, 250, 251, 252, 253, 254, 255 };
-                int[] arrayOfInt2 = { 0, 1, 2, 3, 4, 5, 6, 7, 9, 9, 10, 12, 12, 13, 14, 16, 16, 17, 19, 20, 20, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 39, 40, 41, 42, 43, 44, 46, 47, 49, 50, 51, 53, 54, 56, 57, 59, 61, 62, 64, 65, 66, 69, 70, 72, 73, 76, 77, 78, 80, 82, 84, 85, 87, 89, 90, 93, 94, 95, 98, 99, 100, 103, 104, 106, 108, 109, 111, 112, 114, 116, 117, 118, 120, 122, 123, 124, 125, 126, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 158, 159, 160, 161, 162, 162, 163, 164, 165, 165, 166, 167, 167, 168, 169, 170, 170, 171, 172, 172, 173, 173, 174, 175, 175, 176, 177, 177, 178, 178, 178, 179, 179, 180, 181, 181, 182, 182, 183, 184, 184, 185, 185, 186, 187, 187, 188, 188, 189, 190, 190, 191, 191, 192, 193, 193, 194, 194, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 202, 202, 203, 204, 204, 205, 206, 207, 208, 208, 208, 209, 210, 210, 211, 212, 213, 214, 215, 215, 216, 217, 218, 219, 220, 221, 222, 222, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 234, 235, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 249, 250, 251, 252, 253, 254, 255 };
-                int[] arrayOfInt3 = { 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 9, 11, 12, 12, 13, 14, 16, 16, 17, 18, 20, 20, 21, 22, 23, 25, 25, 26, 27, 29, 30, 31, 31, 32, 34, 35, 36, 37, 39, 40, 41, 41, 42, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 76, 78, 79, 80, 82, 83, 84, 85, 88, 89, 90, 92, 93, 94, 95, 98, 99, 100, 102, 103, 104, 106, 107, 108, 111, 112, 113, 114, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 147, 148, 149, 150, 151, 152, 153, 154, 154, 155, 156, 157, 158, 159, 159, 160, 161, 162, 162, 163, 164, 165, 166, 166, 167, 168, 169, 169, 170, 171, 172, 172, 173, 174, 175, 175, 176, 177, 178, 178, 178, 179, 179, 180, 181, 182, 182, 183, 184, 185, 185, 186, 187, 188, 188, 189, 190, 191, 191, 192, 193, 194, 194, 194, 195, 196, 197, 198, 198, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 235, 236, 237, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 249, 250, 251, 252, 253, 254, 255 };
-                int[] arrayOfInt4 = { 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 14, 16, 17, 19, 20, 21, 23, 24, 26, 27, 28, 30, 31, 32, 34, 35, 36, 38, 39, 40, 42, 43, 44, 45, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 121, 121, 122, 123, 124, 125, 126, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 135, 135, 136, 137, 138, 139, 140, 141, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 230, 231, 232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 249, 249, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 };
-                for (int i = 0; i < 256; i++)
-                {
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                int[] arrayOfInt5 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++)
-                {
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/rise_mask1.jpg");
-                mMaskGrey2TextureId = OpenGLUtils.loadTexture(getContext(), "filter/rise_mask2.jpg");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSutroFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSutroFilter.java
deleted file mode 100755
index d5d70d8..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSutroFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicSutroFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicSutroFilter(){
-        super(MagicFilterType.SUTRO, R.raw.sutro);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for (int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-    
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized() {
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/vignette_map.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/sutrometal.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/softlight.png");
-                inputTextureHandles[3] = OpenGLUtils.loadTexture(getContext(), "filter/sutroedgeburn.png");
-                inputTextureHandles[4] = OpenGLUtils.loadTexture(getContext(), "filter/sutrocurves.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicSweetsFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicSweetsFilter.java
deleted file mode 100755
index a7464eb..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicSweetsFilter.java
+++ /dev/null
@@ -1,98 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicSweetsFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    private int mLowPerformanceUniformLocation;
-    
-    public MagicSweetsFilter(){
-        super(MagicFilterType.SWEETS, R.raw.sweets);
-    }
-
-    @Override
-    protected void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(2, new int[]{mToneCurveTexture[0], mMaskGrey1TextureId}, 0);
-        mToneCurveTexture[0] = -1;
-        mMaskGrey1TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-            GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey1Frame");
-        mLowPerformanceUniformLocation = GLES20.glGetUniformLocation(getProgram(), "lowPerformance");
-        setInteger(mLowPerformanceUniformLocation, 1);
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                
-                byte[] arrayOfByte = new byte[1024];
-                int[] arrayOfInt = { 0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 24, 25, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 98, 99, 100, 101, 103, 104, 105, 106, 108, 109, 110, 111, 113, 114, 115, 116, 118, 119, 120, 121, 123, 124, 125, 126, 128, 129, 130, 132, 133, 134, 135, 137, 138, 139, 140, 142, 143, 144, 145, 147, 148, 149, 150, 152, 153, 154, 155, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 214, 215, 216, 217, 217, 218, 219, 220, 221, 222, 222, 223, 224, 225, 226, 227, 227, 228, 229, 230, 230, 231, 232, 233, 234, 234, 235, 236, 237, 237, 238, 239, 240, 240, 241, 242, 243, 243, 244, 245, 246, 246, 247, 248, 248, 249, 250, 251, 251, 252, 253, 254, 254, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)i);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/rise_mask2.jpg");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicTenderFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicTenderFilter.java
deleted file mode 100755
index ddc1112..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicTenderFilter.java
+++ /dev/null
@@ -1,98 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicTenderFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    
-    public MagicTenderFilter(){
-        super(MagicFilterType.TENDER, R.raw.tender);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(2, new int[]{mToneCurveTexture[0], mMaskGrey1TextureId}, 0);
-        mToneCurveTexture[0] = -1;
-        mMaskGrey1TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-            GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "grey1Frame");
-    }
-
-    @Override
-    protected void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                    
-                byte[] arrayOfByte = new byte[1024];
-                int[] arrayOfInt1 = { 10, 12, 14, 15, 17, 19, 21, 22, 24, 26, 28, 29, 31, 33, 35, 38, 40, 41, 43, 45, 47, 48, 50, 52, 53, 55, 57, 58, 60, 61, 63, 65, 66, 68, 69, 71, 72, 74, 75, 77, 79, 80, 81, 83, 84, 86, 87, 89, 92, 93, 94, 96, 97, 99, 100, 101, 103, 104, 105, 107, 108, 109, 110, 112, 113, 114, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 158, 159, 160, 161, 162, 163, 164, 165, 166, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 175, 176, 177, 178, 179, 179, 180, 181, 182, 182, 183, 184, 184, 185, 186, 187, 187, 188, 189, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 196, 197, 198, 198, 199, 200, 200, 201, 201, 202, 202, 203, 204, 204, 205, 205, 206, 206, 207, 207, 208, 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, 216, 216, 217, 217, 218, 218, 219, 219, 219, 220, 220, 221, 221, 222, 222, 223, 223, 224, 224, 224, 225, 225, 226, 226, 227, 227, 227, 228, 228, 229, 229, 230, 230, 230, 231, 231, 232, 232, 232, 233, 233, 234, 234, 234, 234, 235, 235, 236, 236, 236, 237, 237, 238, 238, 238, 239, 239, 240, 240, 240, 241, 241, 242, 242 };
-                int[] arrayOfInt2 = { 10, 12, 14, 15, 17, 19, 19, 21, 22, 24, 26, 28, 29, 31, 33, 35, 36, 36, 38, 40, 41, 43, 45, 47, 48, 50, 52, 52, 53, 55, 57, 58, 60, 61, 63, 65, 66, 68, 69, 69, 71, 72, 74, 75, 77, 79, 80, 81, 83, 84, 86, 86, 87, 89, 90, 92, 93, 94, 96, 97, 99, 100, 101, 103, 103, 104, 105, 107, 108, 109, 110, 112, 113, 114, 116, 117, 118, 119, 120, 122, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 158, 159, 160, 161, 162, 163, 164, 165, 166, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 175, 176, 177, 178, 179, 179, 180, 181, 182, 182, 183, 184, 184, 185, 186, 187, 187, 188, 189, 190, 191, 191, 192, 193, 193, 194, 195, 195, 196, 196, 197, 198, 198, 199, 200, 200, 201, 201, 202, 202, 204, 204, 205, 205, 206, 206, 207, 207, 208, 209, 209, 210, 210, 211, 211, 212, 213, 213, 214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 219, 219, 220, 220, 221, 221, 222, 222, 223, 223, 224, 224, 224, 225, 226, 226, 227, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 232, 233, 233, 234, 234, 234, 235, 236, 236, 236, 237, 237, 238, 238, 238, 239, 239, 240, 240, 241, 241, 242, 242 };
-                int[] arrayOfInt3 = { 10, 12, 12, 14, 15, 15, 17, 17, 19, 21, 21, 22, 24, 24, 26, 28, 28, 29, 31, 31, 33, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 45, 47, 47, 48, 50, 52, 52, 53, 55, 55, 57, 58, 58, 60, 61, 63, 63, 65, 66, 68, 68, 69, 71, 71, 72, 74, 75, 77, 77, 79, 80, 81, 81, 83, 84, 86, 87, 87, 89, 90, 92, 93, 94, 94, 96, 97, 99, 100, 101, 103, 103, 104, 105, 107, 108, 109, 110, 112, 113, 113, 114, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 129, 130, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 158, 159, 160, 161, 162, 163, 164, 165, 166, 166, 167, 169, 170, 171, 171, 172, 173, 174, 175, 175, 176, 178, 179, 179, 180, 181, 182, 182, 183, 184, 185, 186, 187, 187, 188, 189, 190, 191, 191, 192, 193, 193, 195, 195, 196, 196, 197, 198, 199, 200, 200, 201, 201, 202, 203, 204, 204, 205, 206, 206, 207, 207, 209, 209, 210, 210, 211, 212, 212, 213, 213, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 220, 221, 222, 222, 223, 223, 224, 224, 225, 225, 226, 227, 227, 227, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 238, 238, 238, 239, 240, 240, 240, 241, 242, 242 };
-                int[] arrayOfInt4 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 1, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/bluevintage_mask1.jpg");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicToasterFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicToasterFilter.java
deleted file mode 100755
index 32de4f2..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicToasterFilter.java
+++ /dev/null
@@ -1,72 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicToasterFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1,-1,-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1,-1,-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicToasterFilter(){
-        super(MagicFilterType.TOASTER2, R.raw.toaster2_filter_shader);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter() {
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    protected void onInit() {
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    protected void onInitialized() {
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/toastermetal.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/toastersoftlight.png");
-                inputTextureHandles[2] = OpenGLUtils.loadTexture(getContext(), "filter/toastercurves.png");
-                inputTextureHandles[3] = OpenGLUtils.loadTexture(getContext(), "filter/toasteroverlaymapwarm.png");
-                inputTextureHandles[4] = OpenGLUtils.loadTexture(getContext(), "filter/toastercolorshift.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicValenciaFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicValenciaFilter.java
deleted file mode 100755
index 6cda63d..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicValenciaFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicValenciaFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicValenciaFilter(){
-        super(MagicFilterType.VALENCIA, R.raw.valencia);
-    }
-
-    @Override
-    protected void onDrawArraysAfter() {
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        for(int i = 0; i < inputTextureHandles.length 
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++) {
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture" + (2 + i));
-        }
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    public void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++) {
-            inputTextureHandles[i] = -1;
-        }
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/valenciamap.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/valenciagradientmap.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicWaldenFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicWaldenFilter.java
deleted file mode 100755
index 3e5b539..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicWaldenFilter.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicWaldenFilter extends GPUImageFilter {
-
-    private int[] inputTextureHandles = {-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicWaldenFilter(){
-        super(MagicFilterType.WALDEN, R.raw.walden);
-    }
-
-    @Override
-    protected void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i+3));
-        }
-    }
-
-    @Override
-    public void onInit(){
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++)
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture"+(2+i));
-        mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    public void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/walden_map.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/vignette_map.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicWarmFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicWarmFilter.java
deleted file mode 100755
index 2677f43..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicWarmFilter.java
+++ /dev/null
@@ -1,123 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicWarmFilter extends GPUImageFilter{
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-    private int mMaskGrey1TextureId = -1;
-    private int mMaskGrey1UniformLocation;
-    private int mMaskGrey2TextureId = -1;
-    private int mMaskGrey2UniformLocation;
-    
-    public MagicWarmFilter(){
-        super(MagicFilterType.WARM, R.raw.warm);
-    }
-
-    @Override
-    public void onDestroy(){
-        super.onDestroy();
-        GLES20.glDeleteTextures(3, new int[]{mToneCurveTexture[0], mMaskGrey1TextureId, mMaskGrey2TextureId}, 0);
-        mToneCurveTexture[0] = -1;
-        mMaskGrey1TextureId = -1;
-        mMaskGrey2TextureId = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-        if (mMaskGrey1TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE4);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey1TextureId);
-            GLES20.glUniform1i(mMaskGrey1UniformLocation, 4);
-        }
-        if (mMaskGrey2TextureId != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE5);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskGrey2TextureId);
-             GLES20.glUniform1i(mMaskGrey2UniformLocation, 5);
-        }
-    }
-
-    @Override
-    protected void onInit(){
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-        mMaskGrey1UniformLocation = GLES20.glGetUniformLocation(getProgram(), "layerImage");
-        mMaskGrey2UniformLocation = GLES20.glGetUniformLocation(getProgram(), "greyFrame");
-    }
-
-    @Override
-    protected void onInitialized(){
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);                
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 6, 9, 12, 14, 17, 20, 23, 25, 28, 31, 33, 35, 38, 40, 42, 44, 46, 48, 50, 52, 53, 55, 57, 58, 60, 61, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80, 81, 82, 83, 83, 84, 85, 85, 86, 87, 87, 88, 88, 89, 90, 90, 91, 91, 92, 93, 93, 94, 94, 95, 96, 96, 97, 98, 99, 99, 100, 101, 102, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 119, 120, 121, 123, 124, 126, 127, 128, 130, 131, 133, 135, 136, 138, 139, 141, 143, 144, 146, 148, 149, 151, 153, 154, 156, 158, 159, 161, 163, 165, 166, 168, 170, 172, 173, 175, 177, 179, 180, 182, 184, 185, 187, 189, 190, 192, 194, 195, 197, 199, 200, 202, 203, 205, 207, 208, 210, 211, 213, 214, 215, 217, 218, 219, 221, 222, 223, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 238, 239, 240, 241, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 251, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt2 = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 87, 88, 89, 90, 91, 92, 93, 94, 95, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 235, 236, 237, 238, 239, 240, 241, 242 };
-                int[] arrayOfInt3 = { 9, 10, 11, 11, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 31, 32, 33, 34, 35, 36, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 45, 45, 46, 47, 48, 49, 50, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 71, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, 80, 81, 81, 82, 83, 84, 85, 86, 87, 87, 88, 89, 90, 91, 92, 93, 93, 94, 95, 96, 97, 98, 98, 99, 100, 101, 102, 103, 104, 104, 105, 106, 107, 108, 109, 110, 110, 111, 112, 113, 114, 115, 116, 116, 117, 118, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 135, 136, 137, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 160, 161, 162, 163, 164, 165, 166, 167, 168, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 225, 226, 227, 228, 229, 230 };
-                int[] arrayOfInt4 = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 33, 34, 35, 37, 38, 40, 41, 42, 44, 45, 47, 48, 50, 51, 53, 54, 56, 58, 59, 61, 62, 64, 65, 67, 69, 70, 72, 73, 75, 77, 78, 80, 82, 83, 85, 86, 88, 90, 91, 93, 94, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, 111, 113, 114, 116, 117, 119, 120, 122, 123, 124, 126, 127, 129, 130, 131, 133, 134, 136, 137, 138, 140, 141, 142, 144, 145, 146, 147, 149, 150, 151, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 212, 213, 214, 215, 216, 216, 217, 218, 218, 219, 220, 221, 221, 222, 223, 223, 224, 225, 225, 226, 227, 227, 228, 228, 229, 230, 230, 231, 231, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 238, 238, 239, 239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255 };
-                for (int i = 0; i < 256; i++){
-                  arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                  arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt2[i]);
-                  arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt3[i]);
-                  arrayOfByte[(3 + i * 4)] = ((byte)arrayOfInt4[i]);
-                }
-                int[] arrayOfInt5 = { 0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 24, 24, 25, 26, 27, 28, 28, 28, 29, 30, 31, 31, 32, 33, 34, 35, 36, 36, 37, 38, 39, 39, 40, 41, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 65, 66, 68, 69, 70, 71, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 92, 93, 94, 95, 95, 97, 98, 99, 100, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 141, 142, 143, 144, 145, 146, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 179, 180, 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 203, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 217, 219, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 233, 234, 235, 236, 237, 239, 240, 241, 242, 243, 243, 244, 246, 247, 248, 249, 250, 251, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt6 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 69, 70, 71, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184, 185, 186, 187, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 203, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 217, 220, 221, 222, 223, 224, 226, 227, 228, 229, 230, 231, 233, 234, 235, 236, 237, 239, 240, 241, 242, 243, 244, 246, 247, 249, 250, 251, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt7 = { 45, 45, 46, 46, 47, 47, 47, 47, 48, 48, 49, 49, 50, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 73, 73, 74, 75, 75, 76, 76, 77, 78, 78, 79, 79, 80, 80, 80, 81, 82, 82, 83, 84, 84, 85, 86, 87, 87, 88, 89, 89, 90, 91, 92, 92, 93, 94, 95, 95, 95, 96, 97, 98, 98, 99, 100, 101, 102, 103, 103, 104, 105, 106, 107, 108, 109, 110, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 142, 143, 144, 146, 147, 148, 149, 150, 151, 152, 154, 156, 157, 158, 159, 160, 161, 162, 165, 166, 167, 168, 169, 170, 171, 173, 175, 176, 177, 178, 179, 180, 182, 183, 184, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 203, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 217, 217, 219, 220, 221, 222, 223, 224, 226, 227, 227, 228, 229, 230, 231, 233, 234, 235, 235, 236, 237, 239, 240, 241, 241, 242, 243, 244, 246, 246, 247, 248, 249, 250, 251, 251, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt8 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                  arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt5[j]);
-                  arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt6[j]);
-                  arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt7[j]);
-                  arrayOfByte[(3 + (1024 + j * 4))] = ((byte)arrayOfInt8[j]);
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-                mMaskGrey1TextureId = OpenGLUtils.loadTexture(getContext(), "filter/warm_layer1.jpg");
-                mMaskGrey2TextureId = OpenGLUtils.loadTexture(getContext(), "filter/bluevintage_mask1.jpg");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicWhiteCatFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicWhiteCatFilter.java
deleted file mode 100755
index cc9bc4f..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicWhiteCatFilter.java
+++ /dev/null
@@ -1,89 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import java.nio.ByteBuffer;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-
-public class MagicWhiteCatFilter extends GPUImageFilter{
-
-    private int[] mToneCurveTexture = {-1};
-    private int mToneCurveTextureUniformLocation;
-      
-    public MagicWhiteCatFilter() {
-        super(MagicFilterType.WHITECAT, R.raw.whitecat);
-    }
-
-    @Override
-    public void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(1, mToneCurveTexture, 0);
-        mToneCurveTexture[0] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre() {
-        if (mToneCurveTexture[0] != -1){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-            GLES20.glUniform1i(mToneCurveTextureUniformLocation, 3);
-        }
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mToneCurveTextureUniformLocation = GLES20.glGetUniformLocation(getProgram(), "curve");
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable(){
-            public void run(){
-                GLES20.glGenTextures(1, mToneCurveTexture, 0);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mToneCurveTexture[0]);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                        GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-                byte[] arrayOfByte = new byte[2048];
-                int[] arrayOfInt1 = { 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt2 = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 4, 5, 6, 7, 8, 10, 11, 12, 12, 13, 14, 16, 17, 18, 19, 19, 20, 22, 23, 24, 25, 26, 26, 28, 29, 30, 31, 32, 33, 35, 35, 36, 37, 38, 39, 41, 42, 42, 43, 44, 45, 46, 48, 49, 50, 50, 51, 52, 54, 55, 56, 57, 58, 58, 59, 61, 62, 63, 64, 65, 66, 66, 67, 69, 70, 71, 72, 73, 74, 75, 75, 77, 78, 79, 80, 81, 82, 83, 85, 85, 86, 87, 88, 89, 90, 91, 92, 93, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 182, 183, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 217, 218, 219, 220, 221, 222, 223, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 236, 237, 238, 239, 240, 240 };
-                for (int i = 0; i < 256; i++){
-                    arrayOfByte[(i * 4)] = ((byte)arrayOfInt1[i]);
-                    arrayOfByte[(1 + i * 4)] = ((byte)arrayOfInt1[i]);
-                    arrayOfByte[(2 + i * 4)] = ((byte)arrayOfInt2[i]);
-                    arrayOfByte[(3 + i * 4)] = -1;
-                }
-                int[] arrayOfInt3 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 6, 9, 14, 17, 19, 22, 25, 27, 30, 34, 36, 39, 41, 43, 45, 49, 51, 52, 54, 55, 57, 58, 61, 63, 64, 65, 67, 68, 69, 72, 73, 75, 76, 77, 78, 81, 82, 83, 84, 86, 87, 88, 90, 91, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 105, 106, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 138, 138, 139, 140, 141, 142, 144, 145, 146, 146, 147, 148, 149, 151, 152, 153, 153, 154, 155, 156, 158, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 168, 170, 171, 172, 172, 173, 174, 175, 176, 177, 178, 179, 180, 180, 181, 183, 183, 184, 185, 186, 186, 188, 189, 190, 190, 191, 192, 193, 194, 195, 196, 196, 197, 198, 199, 200, 201, 201, 202, 203, 204, 204, 206, 207, 207, 208, 209, 209, 211, 212, 212, 213, 214, 214, 215, 217, 217, 218, 219, 219, 220, 221, 222, 223, 224, 224, 225, 226, 227, 228, 228, 229, 230, 230, 231, 233, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 241, 242, 243, 244, 245, 245, 246, 247, 248, 249, 249, 250, 250, 251, 252, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 };
-                int[] arrayOfInt4 = { 0, 2, 4, 6, 8, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 32, 34, 36, 38, 40, 42, 44, 46, 47, 49, 51, 53, 54, 56, 58, 60, 61, 63, 65, 66, 68, 70, 71, 73, 74, 76, 77, 79, 80, 82, 83, 85, 86, 88, 89, 91, 92, 93, 95, 96, 98, 99, 100, 101, 103, 104, 105, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200, 201, 202, 203, 204, 205, 206, 206, 207, 208, 209, 210, 211, 212, 212, 213, 214, 215, 216, 216, 217, 218, 219, 219, 220, 221, 222, 222, 223, 224, 224, 225, 226, 226, 227, 228, 228, 229, 230, 230, 231, 232, 232, 233, 233, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 245, 246, 246, 247, 247, 247, 248, 248, 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252, 252, 253, 253, 253, 253, 254, 254, 254, 254, 254, 255, 255, 255 };
-                int[] arrayOfInt5 = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 30, 29, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 48, 48, 49, 50, 51, 51, 52, 52, 53, 53, 54, 55, 55, 56, 57, 57, 58, 59, 60, 60, 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 110, 111, 113, 115, 116, 118, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 135, 137, 139, 141, 143, 144, 146, 148, 150, 152, 154, 156, 158, 160, 163, 165, 167, 169, 171, 173, 175, 178, 180, 182, 185, 187, 189, 192, 194, 197, 199, 201, 204, 206, 209, 211, 214, 216, 219, 221, 224, 226, 229, 232, 234, 236, 239, 241, 245, 247, 250, 252, 255 };
-                for (int j = 0; j < 256; j++){
-                    arrayOfByte[(1024 + j * 4)] = ((byte)arrayOfInt4[j]);
-                    arrayOfByte[(1 + (1024 + j * 4))] = ((byte)arrayOfInt3[j]);
-                    arrayOfByte[(2 + (1024 + j * 4))] = ((byte)arrayOfInt5[j]);
-                    arrayOfByte[(3 + (1024 + j * 4))] = -1;
-                }
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, 256, 2, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, ByteBuffer.wrap(arrayOfByte));
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/advanced/MagicXproIIFilter.java b/android/src/main/java/com/seu/magicfilter/advanced/MagicXproIIFilter.java
deleted file mode 100755
index e39f8ef..0000000
--- a/android/src/main/java/com/seu/magicfilter/advanced/MagicXproIIFilter.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package com.seu.magicfilter.advanced;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.reactlibrary.R;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-public class MagicXproIIFilter extends GPUImageFilter{
-    private int[] inputTextureHandles = {-1,-1};
-    private int[] inputTextureUniformLocations = {-1,-1};
-    private int mGLStrengthLocation;
-
-    public MagicXproIIFilter(){
-        super(MagicFilterType.XPROII, R.raw.xproii_filter_shader);
-    }
-
-    @Override
-    public void onDestroy() {
-        super.onDestroy();
-        GLES20.glDeleteTextures(inputTextureHandles.length, inputTextureHandles, 0);
-        for(int i = 0; i < inputTextureHandles.length; i++)
-            inputTextureHandles[i] = -1;
-    }
-
-    @Override
-    protected void onDrawArraysAfter(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3));
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    @Override
-    protected void onDrawArraysPre(){
-        for(int i = 0; i < inputTextureHandles.length
-                && inputTextureHandles[i] != OpenGLUtils.NO_TEXTURE; i++){
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + (i+3) );
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, inputTextureHandles[i]);
-            GLES20.glUniform1i(inputTextureUniformLocations[i], (i + 3));
-        }
-    }
-
-    @Override
-    public void onInit(){
-        super.onInit();
-        for(int i = 0; i < inputTextureUniformLocations.length; i++)
-            inputTextureUniformLocations[i] = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture"+(2+i));
-            mGLStrengthLocation = GLES20.glGetUniformLocation(getProgram(), "strength");
-    }
-
-    @Override
-    public void onInitialized(){
-        super.onInitialized();
-        setFloat(mGLStrengthLocation, 1.0f);
-        runOnDraw(new Runnable(){
-            public void run(){
-                inputTextureHandles[0] = OpenGLUtils.loadTexture(getContext(), "filter/xpromap.png");
-                inputTextureHandles[1] = OpenGLUtils.loadTexture(getContext(), "filter/vignettemap_new.png");
-            }
-        });
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/MagicBaseGroupFilter.java b/android/src/main/java/com/seu/magicfilter/base/MagicBaseGroupFilter.java
deleted file mode 100755
index a7d8d50..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/MagicBaseGroupFilter.java
+++ /dev/null
@@ -1,144 +0,0 @@
-package com.seu.magicfilter.base;
-
-
-import java.nio.FloatBuffer;
-import java.util.List;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-import android.content.Context;
-import android.opengl.GLES20;
-
-
-public class MagicBaseGroupFilter extends GPUImageFilter {
-    private static int[] frameBuffers = null;
-    private static int[] frameBufferTextures = null;
-    private int frameWidth = -1;
-    private int frameHeight = -1;
-    protected List<GPUImageFilter> filters;
-
-    public MagicBaseGroupFilter(List<GPUImageFilter> filters) {
-        this.filters = filters;
-    }
-
-    @Override
-    public void onDestroy() {
-        for (GPUImageFilter filter : filters) {
-            filter.destroy();
-        }
-        destroyFramebuffers();
-    }
-
-    @Override
-    public void init(Context context) {
-        for (GPUImageFilter filter : filters) {
-            filter.init(context);
-        }
-    }
-
-    @Override
-    public void onInputSizeChanged(final int width, final int height) {
-        super.onInputSizeChanged(width, height);
-        int size = filters.size();
-        for (int i = 0; i < size; i++) {
-            filters.get(i).onInputSizeChanged(width, height);
-        }
-        if (frameBuffers != null && (frameWidth != width || frameHeight != height || frameBuffers.length != size - 1)) {
-            destroyFramebuffers();
-            frameWidth = width;
-            frameHeight = height;
-        }
-        if (frameBuffers == null) {
-            frameBuffers = new int[size - 1];
-            frameBufferTextures = new int[size - 1];
-
-            for (int i = 0; i < size - 1; i++) {
-                GLES20.glGenFramebuffers(1, frameBuffers, i);
-
-                GLES20.glGenTextures(1, frameBufferTextures, i);
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, frameBufferTextures[i]);
-                GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, width, height, 0,
-                    GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, null);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-
-                GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, frameBuffers[i]);
-                GLES20.glFramebufferTexture2D(GLES20.GL_FRAMEBUFFER, GLES20.GL_COLOR_ATTACHMENT0,
-                    GLES20.GL_TEXTURE_2D, frameBufferTextures[i], 0);
-
-                GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-                GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
-            }
-        }
-    }
-
-    @Override
-    public int onDrawFrame(final int textureId, final FloatBuffer cubeBuffer,
-                           final FloatBuffer textureBuffer) {
-        if (frameBuffers == null || frameBufferTextures == null) {
-            return OpenGLUtils.NOT_INIT;
-        }
-        int size = filters.size();
-        int previousTexture = textureId;
-        for (int i = 0; i < size; i++) {
-            GPUImageFilter filter = filters.get(i);
-            boolean isNotLast = i < size - 1;
-            if (isNotLast) {
-                GLES20.glViewport(0, 0, mInputWidth, mInputHeight);
-                GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, frameBuffers[i]);
-                GLES20.glClearColor(0, 0, 0, 0);
-                filter.onDrawFrame(previousTexture, mGLCubeBuffer, mGLTextureBuffer);
-                GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
-                previousTexture = frameBufferTextures[i];
-            } else {
-                GLES20.glViewport(0, 0, mOutputWidth, mOutputHeight);
-                filter.onDrawFrame(previousTexture, cubeBuffer, textureBuffer);
-            }
-        }
-        return OpenGLUtils.ON_DRAWN;
-    }
-
-    public int onDrawFrame(int textureId) {
-        if (frameBuffers == null || frameBufferTextures == null) {
-            return OpenGLUtils.NOT_INIT;
-        }
-        int size = filters.size();
-        int previousTexture = textureId;
-        for (int i = 0; i < size; i++) {
-            GPUImageFilter filter = filters.get(i);
-            boolean isNotLast = i < size - 1;
-            if (isNotLast) {
-                GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, frameBuffers[i]);
-                GLES20.glClearColor(0, 0, 0, 0);
-                filter.onDrawFrame(previousTexture, mGLCubeBuffer, mGLTextureBuffer);
-                GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
-                previousTexture = frameBufferTextures[i];
-            } else {
-                filter.onDrawFrame(previousTexture, mGLCubeBuffer, mGLTextureBuffer);
-            }
-        }
-        return OpenGLUtils.ON_DRAWN;
-    }
-
-    private void destroyFramebuffers() {
-        if (frameBufferTextures != null) {
-            GLES20.glDeleteTextures(frameBufferTextures.length, frameBufferTextures, 0);
-            frameBufferTextures = null;
-        }
-        if (frameBuffers != null) {
-            GLES20.glDeleteFramebuffers(frameBuffers.length, frameBuffers, 0);
-            frameBuffers = null;
-        }
-    }
-
-    public int getSize() {
-        return filters.size();
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/MagicLookupFilter.java b/android/src/main/java/com/seu/magicfilter/base/MagicLookupFilter.java
deleted file mode 100755
index 69f867b..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/MagicLookupFilter.java
+++ /dev/null
@@ -1,60 +0,0 @@
-package com.seu.magicfilter.base;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-import com.reactlibrary.R;
-
-public class MagicLookupFilter extends GPUImageFilter {
-
-    protected String table;
-
-    public MagicLookupFilter(String table) {
-        super(MagicFilterType.LOCKUP, R.raw.lookup);
-        this.table = table;
-    }
-
-    private int mLookupTextureUniform;
-    private int mLookupSourceTexture = OpenGLUtils.NO_TEXTURE;
-
-    protected void onInit() {
-        super.onInit();
-        mLookupTextureUniform = GLES20.glGetUniformLocation(getProgram(), "inputImageTexture2");
-    }
-
-    protected void onInitialized() {
-        super.onInitialized();
-        runOnDraw(new Runnable() {
-            public void run() {
-                mLookupSourceTexture = OpenGLUtils.loadTexture(getContext(), table);
-            }
-        });
-    }
-
-    protected void onDestroy() {
-        super.onDestroy();
-        int[] texture = new int[]{mLookupSourceTexture};
-        GLES20.glDeleteTextures(1, texture, 0);
-        mLookupSourceTexture = -1;
-    }
-
-    protected void onDrawArraysAfter() {
-        if (mLookupSourceTexture != -1) {
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        }
-    }
-
-    protected void onDrawArraysPre() {
-        if (mLookupSourceTexture != -1) {
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE3);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mLookupSourceTexture);
-            GLES20.glUniform1i(mLookupTextureUniform, 3);
-        }
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageBrightnessFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageBrightnessFilter.java
deleted file mode 100755
index e4892ec..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageBrightnessFilter.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import com.reactlibrary.R;
-
-/**
- * brightness value ranges from -1.0 to 1.0, with 0.0 as the normal level
- */
-public class GPUImageBrightnessFilter extends GPUImageFilter {
-
-    private int mBrightnessLocation;
-    private float mBrightness;
-
-    public GPUImageBrightnessFilter() {
-        this(0.0f);
-    }
-
-    public GPUImageBrightnessFilter(final float brightness) {
-        super(MagicFilterType.BRIGHTNESS, R.raw.brightness);
-        mBrightness = brightness;
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mBrightnessLocation = GLES20.glGetUniformLocation(getProgram(), "brightness");
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        setBrightness(mBrightness);
-    }
-
-    public void setBrightness(final float brightness) {
-        mBrightness = brightness;
-        setFloat(mBrightnessLocation, mBrightness);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageContrastFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageContrastFilter.java
deleted file mode 100755
index e36b07d..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageContrastFilter.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import com.reactlibrary.R;
-
-/**
- * Changes the contrast of the image.<br>
- * <br>
- * contrast value ranges from 0.0 to 4.0, with 1.0 as the normal level
- */
-public class GPUImageContrastFilter extends GPUImageFilter {
-
-    private int mContrastLocation;
-    private float mContrast;
-
-    public GPUImageContrastFilter() {
-        this(1.0f);
-    }
-    
-    public GPUImageContrastFilter(float contrast) {
-        super(MagicFilterType.CONTRAST, R.raw.constrast);
-        mContrast = contrast;
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mContrastLocation = GLES20.glGetUniformLocation(getProgram(), "contrast");
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        setContrast(mContrast);
-    }
-
-    public void setContrast(final float contrast) {
-        mContrast = contrast;
-        setFloat(mContrastLocation, mContrast);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageExposureFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageExposureFilter.java
deleted file mode 100755
index 00e6653..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageExposureFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import com.reactlibrary.R;
-
-/**
- * exposure: The adjusted exposure (-10.0 - 10.0, with 0.0 as the default)
- */
-public class GPUImageExposureFilter extends GPUImageFilter {
-    public static final String EXPOSURE_FRAGMENT_SHADER = "" +
-            " varying highp vec2 textureCoordinate;\n" +
-            " \n" +
-            " uniform sampler2D inputImageTexture;\n" +
-            " uniform highp float exposure;\n" +
-            " \n" +
-            " void main()\n" +
-            " {\n" +
-            "     highp vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);\n" +
-            "     \n" +
-            "     gl_FragColor = vec4(textureColor.rgb * pow(2.0, exposure), textureColor.w);\n" +
-            " } ";
-
-    private int mExposureLocation;
-    private float mExposure;
-
-    public GPUImageExposureFilter() {
-        this(0.0f);
-    }
-
-    public GPUImageExposureFilter(final float exposure) {
-        super(MagicFilterType.EXPOSURE, R.raw.exposure);
-        mExposure = exposure;
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mExposureLocation = GLES20.glGetUniformLocation(getProgram(), "exposure");
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        setExposure(mExposure);
-    }
-
-    public void setExposure(final float exposure) {
-        mExposure = exposure;
-        setFloat(mExposureLocation, mExposure);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageFilter.java
deleted file mode 100755
index d2c2c9c..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageFilter.java
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.content.Context;
-import android.graphics.PointF;
-import android.opengl.GLES11Ext;
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.seu.magicfilter.utils.OpenGLUtils;
-import com.seu.magicfilter.utils.Rotation;
-import com.seu.magicfilter.utils.TextureRotationUtil;
-
-import com.reactlibrary.R;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.FloatBuffer;
-import java.nio.IntBuffer;
-import java.util.LinkedList;
-
-public class GPUImageFilter {
-
-    private boolean mIsInitialized;
-    private Context mContext;
-    private MagicFilterType mType = MagicFilterType.NONE;
-    private final LinkedList<Runnable> mRunOnDraw;
-    private final int mVertexShaderId;
-    private final int mFragmentShaderId;
-
-    private int mGLProgId;
-    private int mGLPositionIndex;
-    private int mGLInputImageTextureIndex;
-    private int mGLTextureCoordinateIndex;
-    private int mGLTextureTransformIndex;
-
-    protected int mInputWidth;
-    protected int mInputHeight;
-    protected int mOutputWidth;
-    protected int mOutputHeight;
-    protected FloatBuffer mGLCubeBuffer;
-    protected FloatBuffer mGLTextureBuffer;
-
-    private int[] mGLCubeId;
-    private int[] mGLTextureCoordinateId;
-    private float[] mGLTextureTransformMatrix;
-
-    private int[] mGLFboId;
-    private int[] mGLFboTexId;
-    private IntBuffer mGLFboBuffer;
-
-    public GPUImageFilter() {
-        this(MagicFilterType.NONE);
-    }
-
-    public GPUImageFilter(MagicFilterType type) {
-        this(type, R.raw.vertex, R.raw.fragment);
-    }
-
-    public GPUImageFilter(MagicFilterType type, int fragmentShaderId) {
-        this(type, R.raw.vertex, fragmentShaderId);
-    }
-
-    public GPUImageFilter(MagicFilterType type, int vertexShaderId, int fragmentShaderId) {
-        mType = type;
-        mRunOnDraw = new LinkedList<>();
-        mVertexShaderId = vertexShaderId;
-        mFragmentShaderId = fragmentShaderId;
-
-        mGLCubeBuffer = ByteBuffer.allocateDirect(TextureRotationUtil.CUBE.length * 4)
-                .order(ByteOrder.nativeOrder())
-                .asFloatBuffer();
-        mGLCubeBuffer.put(TextureRotationUtil.CUBE).position(0);
-
-        mGLTextureBuffer = ByteBuffer.allocateDirect(TextureRotationUtil.TEXTURE_NO_ROTATION.length * 4)
-                .order(ByteOrder.nativeOrder())
-                .asFloatBuffer();
-        mGLTextureBuffer.put(TextureRotationUtil.getRotation(Rotation.NORMAL, false, true)).position(0);
-    }
-
-    public void init(Context context) {
-        mContext = context;
-        onInit();
-        onInitialized();
-    }
-
-    protected void onInit() {
-        initVbo();
-        loadSamplerShader();
-    }
-
-    protected void onInitialized() {
-        mIsInitialized = true;
-    }
-
-    public final void destroy() {
-        mIsInitialized = false;
-        destroyFboTexture();
-        destoryVbo();
-        GLES20.glDeleteProgram(mGLProgId);
-        onDestroy();
-    }
-
-    protected void onDestroy() {
-    }
-
-    public void onInputSizeChanged(final int width, final int height) {
-        mInputWidth = width;
-        mInputHeight = height;
-        initFboTexture(width, height);
-    }
-
-    public void onDisplaySizeChanged(final int width, final int height) {
-        mOutputWidth = width;
-        mOutputHeight = height;
-    }
-
-    private void loadSamplerShader() {
-        mGLProgId = OpenGLUtils.loadProgram(OpenGLUtils.readShaderFromRawResource(getContext(), mVertexShaderId),
-            OpenGLUtils.readShaderFromRawResource(getContext(), mFragmentShaderId));
-        mGLPositionIndex = GLES20.glGetAttribLocation(mGLProgId, "position");
-        mGLTextureCoordinateIndex = GLES20.glGetAttribLocation(mGLProgId,"inputTextureCoordinate");
-        mGLTextureTransformIndex = GLES20.glGetUniformLocation(mGLProgId, "textureTransform");
-        mGLInputImageTextureIndex = GLES20.glGetUniformLocation(mGLProgId, "inputImageTexture");
-    }
-
-    private void initVbo() {
-        mGLCubeId = new int[1];
-        mGLTextureCoordinateId = new int[1];
-
-        GLES20.glGenBuffers(1, mGLCubeId, 0);
-        GLES20.glBindBuffer(GLES20.GL_ARRAY_BUFFER, mGLCubeId[0]);
-        GLES20.glBufferData(GLES20.GL_ARRAY_BUFFER, mGLCubeBuffer.capacity() * 4, mGLCubeBuffer, GLES20.GL_STATIC_DRAW);
-
-        GLES20.glGenBuffers(1, mGLTextureCoordinateId, 0);
-        GLES20.glBindBuffer(GLES20.GL_ARRAY_BUFFER, mGLTextureCoordinateId[0]);
-        GLES20.glBufferData(GLES20.GL_ARRAY_BUFFER, mGLTextureBuffer.capacity() * 4, mGLTextureBuffer, GLES20.GL_STATIC_DRAW);
-    }
-
-    private void destoryVbo() {
-        if (mGLCubeId != null) {
-            GLES20.glDeleteBuffers(1, mGLCubeId, 0);
-            mGLCubeId = null;
-        }
-        if (mGLTextureCoordinateId != null) {
-            GLES20.glDeleteBuffers(1, mGLTextureCoordinateId, 0);
-            mGLTextureCoordinateId = null;
-        }
-    }
-
-    private void initFboTexture(int width, int height) {
-        if (mGLFboId != null && (mInputWidth != width || mInputHeight != height)) {
-            destroyFboTexture();
-        }
-
-        mGLFboId = new int[1];
-        mGLFboTexId = new int[1];
-        mGLFboBuffer = IntBuffer.allocate(width * height);
-
-        GLES20.glGenFramebuffers(1, mGLFboId, 0);
-        GLES20.glGenTextures(1, mGLFboTexId, 0);
-        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mGLFboTexId[0]);
-        GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, width, height, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, null);
-        GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-        GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-        GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-        GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-        GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, mGLFboId[0]);
-        GLES20.glFramebufferTexture2D(GLES20.GL_FRAMEBUFFER, GLES20.GL_COLOR_ATTACHMENT0, GLES20.GL_TEXTURE_2D, mGLFboTexId[0], 0);
-        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-        GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
-    }
-
-    private void destroyFboTexture() {
-        if (mGLFboTexId != null) {
-            GLES20.glDeleteTextures(1, mGLFboTexId, 0);
-            mGLFboTexId = null;
-        }
-        if (mGLFboId != null) {
-            GLES20.glDeleteFramebuffers(1, mGLFboId, 0);
-            mGLFboId = null;
-        }
-    }
-
-    public int onDrawFrame(final int textureId, final FloatBuffer cubeBuffer, final FloatBuffer textureBuffer) {
-        if (!mIsInitialized) {
-            return OpenGLUtils.NOT_INIT;
-        }
-
-        GLES20.glUseProgram(mGLProgId);
-        runPendingOnDrawTasks();
-
-        GLES20.glEnableVertexAttribArray(mGLPositionIndex);
-        GLES20.glVertexAttribPointer(mGLPositionIndex, 2, GLES20.GL_FLOAT, false, 4 * 2, cubeBuffer);
-
-        GLES20.glEnableVertexAttribArray(mGLTextureCoordinateIndex);
-        GLES20.glVertexAttribPointer(mGLTextureCoordinateIndex, 2, GLES20.GL_FLOAT, false, 4 * 2, textureBuffer);
-
-        if (textureId != OpenGLUtils.NO_TEXTURE) {
-            GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textureId);
-            GLES20.glUniform1i(mGLInputImageTextureIndex, 0);
-        }
-
-        onDrawArraysPre();
-        GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4);
-        onDrawArraysAfter();
-
-        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
-
-        GLES20.glDisableVertexAttribArray(mGLPositionIndex);
-        GLES20.glDisableVertexAttribArray(mGLTextureCoordinateIndex);
-
-        return OpenGLUtils.ON_DRAWN;
-    }
-
-    public int onDrawFrame(int cameraTextureId) {
-        if (!mIsInitialized) {
-            return OpenGLUtils.NOT_INIT;
-        }
-
-        if (mGLFboId == null) {
-            return OpenGLUtils.NO_TEXTURE;
-        }
-
-        GLES20.glUseProgram(mGLProgId);
-        runPendingOnDrawTasks();
-
-        GLES20.glBindBuffer(GLES20.GL_ARRAY_BUFFER, mGLCubeId[0]);
-        GLES20.glEnableVertexAttribArray(mGLPositionIndex);
-        GLES20.glVertexAttribPointer(mGLPositionIndex, 2, GLES20.GL_FLOAT, false, 4 * 2, 0);
-
-        GLES20.glBindBuffer(GLES20.GL_ARRAY_BUFFER, mGLTextureCoordinateId[0]);
-        GLES20.glEnableVertexAttribArray(mGLTextureCoordinateIndex);
-        GLES20.glVertexAttribPointer(mGLTextureCoordinateIndex, 2, GLES20.GL_FLOAT, false, 4 * 2, 0);
-
-        GLES20.glUniformMatrix4fv(mGLTextureTransformIndex, 1, false, mGLTextureTransformMatrix, 0);
-
-        GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
-        GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, cameraTextureId);
-        GLES20.glUniform1i(mGLInputImageTextureIndex, 0);
-
-        onDrawArraysPre();
-
-        GLES20.glViewport(0, 0, mInputWidth, mInputHeight);
-        GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, mGLFboId[0]);
-        GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4);
-        GLES20.glReadPixels(0, 0, mInputWidth, mInputHeight, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, mGLFboBuffer);
-        GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
-        GLES20.glViewport(0, 0, mOutputWidth, mOutputHeight);
-
-        GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4);
-
-        onDrawArraysAfter();
-
-        GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, 0);
-
-        GLES20.glDisableVertexAttribArray(mGLPositionIndex);
-        GLES20.glDisableVertexAttribArray(mGLTextureCoordinateIndex);
-
-        GLES20.glBindBuffer(GLES20.GL_ARRAY_BUFFER, 0);
-
-        return mGLFboTexId[0];
-    }
-
-    protected void onDrawArraysPre() {}
-
-    protected void onDrawArraysAfter() {}
-    
-    private void runPendingOnDrawTasks() {
-        while (!mRunOnDraw.isEmpty()) {
-            mRunOnDraw.removeFirst().run();
-        }
-    }
-    
-    public int getProgram() {
-        return mGLProgId;
-    }
-
-    public IntBuffer getGLFboBuffer() {
-        return mGLFboBuffer;
-    }
-
-    protected Context getContext() {
-        return mContext;
-    }
-
-    protected MagicFilterType getFilterType() {
-        return mType;
-    }
-    
-    public void setTextureTransformMatrix(float[] mtx){
-        mGLTextureTransformMatrix = mtx;
-    }
-
-    protected void setInteger(final int location, final int intValue) {
-        runOnDraw(new Runnable() {
-            @Override
-            public void run() {
-                GLES20.glUniform1i(location, intValue);
-            }
-        });
-    }
-
-    protected void setFloat(final int location, final float floatValue) {
-        runOnDraw(new Runnable() {
-            @Override
-            public void run() {
-                GLES20.glUniform1f(location, floatValue);
-            }
-        });
-    }
-
-    protected void setFloatVec2(final int location, final float[] arrayValue) {
-        runOnDraw(new Runnable() {
-            @Override
-            public void run() {
-                GLES20.glUniform2fv(location, 1, FloatBuffer.wrap(arrayValue));
-            }
-        });
-    }
-
-    protected void setFloatVec3(final int location, final float[] arrayValue) {
-        runOnDraw(new Runnable() {
-            @Override
-            public void run() {
-                GLES20.glUniform3fv(location, 1, FloatBuffer.wrap(arrayValue));
-            }
-        });
-    }
-
-    protected void setFloatVec4(final int location, final float[] arrayValue) {
-        runOnDraw(new Runnable() {
-            @Override
-            public void run() {
-                GLES20.glUniform4fv(location, 1, FloatBuffer.wrap(arrayValue));
-            }
-        });
-    }
-
-    protected void setFloatArray(final int location, final float[] arrayValue) {
-        runOnDraw(new Runnable() {
-            @Override
-            public void run() {
-                GLES20.glUniform1fv(location, arrayValue.length, FloatBuffer.wrap(arrayValue));
-            }
-        });
-    }
-
-    protected void setPoint(final int location, final PointF point) {
-        runOnDraw(new Runnable() {
-
-            @Override
-            public void run() {
-                float[] vec2 = new float[2];
-                vec2[0] = point.x;
-                vec2[1] = point.y;
-                GLES20.glUniform2fv(location, 1, vec2, 0);
-            }
-        });
-    }
-
-    protected void setUniformMatrix3f(final int location, final float[] matrix) {
-        runOnDraw(new Runnable() {
-
-            @Override
-            public void run() {
-                GLES20.glUniformMatrix3fv(location, 1, false, matrix, 0);
-            }
-        });
-    }
-
-    protected void setUniformMatrix4f(final int location, final float[] matrix) {
-        runOnDraw(new Runnable() {
-
-            @Override
-            public void run() {
-                GLES20.glUniformMatrix4fv(location, 1, false, matrix, 0);
-            }
-        });
-    }
-
-    protected void runOnDraw(final Runnable runnable) {
-        synchronized (mRunOnDraw) {
-            mRunOnDraw.addLast(runnable);
-        }
-    }
-}
-
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageHueFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageHueFilter.java
deleted file mode 100755
index b2d9361..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageHueFilter.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import com.reactlibrary.R;
-
-public class GPUImageHueFilter extends GPUImageFilter {
-
-    private float mHue;
-    private int mHueLocation;
-
-    public GPUImageHueFilter() {
-        this(0.0f);
-    }
-
-    public GPUImageHueFilter(final float hue) {
-        super(MagicFilterType.HUE, R.raw.hue);
-        mHue = hue;
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mHueLocation = GLES20.glGetUniformLocation(getProgram(), "hueAdjust");
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        setHue(mHue);
-    }
-
-    public void setHue(final float hue) {
-        mHue = hue;
-        float hueAdjust = (mHue % 360.0f) * (float) Math.PI / 180.0f;
-        setFloat(mHueLocation, hueAdjust);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageSaturationFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageSaturationFilter.java
deleted file mode 100755
index 63fe898..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageSaturationFilter.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import com.reactlibrary.R;
-
-/**
- * saturation: The degree of saturation or desaturation to apply to the image (0.0 - 2.0, with 1.0 as the default)
- */
-public class GPUImageSaturationFilter extends GPUImageFilter {
-
-    private int mSaturationLocation;
-    private float mSaturation;
-
-    public GPUImageSaturationFilter() {
-        this(1.0f);
-    }
-
-    public GPUImageSaturationFilter(final float saturation) {
-        super(MagicFilterType.SATURATION, R.raw.saturation);
-        mSaturation = saturation;
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mSaturationLocation = GLES20.glGetUniformLocation(getProgram(), "saturation");
-    }
-
-    @Override
-    public void onInitialized() {
-        super.onInitialized();
-        setSaturation(mSaturation);
-    }
-
-    public void setSaturation(final float saturation) {
-        mSaturation = saturation;
-        setFloat(mSaturationLocation, mSaturation);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageSharpenFilter.java b/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageSharpenFilter.java
deleted file mode 100755
index 563faa9..0000000
--- a/android/src/main/java/com/seu/magicfilter/base/gpuimage/GPUImageSharpenFilter.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.base.gpuimage;
-
-import android.opengl.GLES20;
-
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import com.reactlibrary.R;
-
-/**
- * Sharpens the picture. <br>
- * <br>
- * sharpness: from -4.0 to 4.0, with 0.0 as the normal level
- */
-public class GPUImageSharpenFilter extends GPUImageFilter {
-
-    private int mSharpnessLocation;
-    private float mSharpness;
-    private int mImageWidthFactorLocation;
-    private int mImageHeightFactorLocation;
-
-    public GPUImageSharpenFilter() {
-        this(0.0f);
-    }
-    
-    public GPUImageSharpenFilter(final float sharpness) {
-        super(MagicFilterType.SHARPEN, R.raw.vertex_sharpen, R.raw.sharpen);
-        mSharpness = sharpness;
-    }
-
-    @Override
-    public void onInit() {
-        super.onInit();
-        mSharpnessLocation = GLES20.glGetUniformLocation(getProgram(), "sharpness");
-        mImageWidthFactorLocation = GLES20.glGetUniformLocation(getProgram(), "imageWidthFactor");
-        mImageHeightFactorLocation = GLES20.glGetUniformLocation(getProgram(), "imageHeightFactor");
-        setSharpness(mSharpness);
-    }
-
-    @Override
-    public void onInputSizeChanged(final int width, final int height) {
-        super.onInputSizeChanged(width, height);
-        setFloat(mImageWidthFactorLocation, 1.0f / width);
-        setFloat(mImageHeightFactorLocation, 1.0f / height);
-    }
-
-    public void setSharpness(final float sharpness) {
-        mSharpness = sharpness;
-        setFloat(mSharpnessLocation, mSharpness);
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/utils/MagicFilterFactory.java b/android/src/main/java/com/seu/magicfilter/utils/MagicFilterFactory.java
deleted file mode 100755
index 8daffcc..0000000
--- a/android/src/main/java/com/seu/magicfilter/utils/MagicFilterFactory.java
+++ /dev/null
@@ -1,165 +0,0 @@
-package com.seu.magicfilter.utils;
-
-import android.content.Context;
-
-import com.seu.magicfilter.advanced.MagicAmaroFilter;
-import com.seu.magicfilter.advanced.MagicAntiqueFilter;
-import com.seu.magicfilter.advanced.MagicBeautyFilter;
-import com.seu.magicfilter.advanced.MagicBlackCatFilter;
-import com.seu.magicfilter.advanced.MagicBrannanFilter;
-import com.seu.magicfilter.advanced.MagicBrooklynFilter;
-import com.seu.magicfilter.advanced.MagicCalmFilter;
-import com.seu.magicfilter.advanced.MagicCoolFilter;
-import com.seu.magicfilter.advanced.MagicCrayonFilter;
-import com.seu.magicfilter.advanced.MagicEarlyBirdFilter;
-import com.seu.magicfilter.advanced.MagicEmeraldFilter;
-import com.seu.magicfilter.advanced.MagicEvergreenFilter;
-import com.seu.magicfilter.advanced.MagicFreudFilter;
-import com.seu.magicfilter.advanced.MagicHealthyFilter;
-import com.seu.magicfilter.advanced.MagicHefeFilter;
-import com.seu.magicfilter.advanced.MagicHudsonFilter;
-import com.seu.magicfilter.advanced.MagicImageAdjustFilter;
-import com.seu.magicfilter.advanced.MagicInkwellFilter;
-import com.seu.magicfilter.advanced.MagicKevinFilter;
-import com.seu.magicfilter.advanced.MagicLatteFilter;
-import com.seu.magicfilter.advanced.MagicLomoFilter;
-import com.seu.magicfilter.advanced.MagicN1977Filter;
-import com.seu.magicfilter.advanced.MagicNashvilleFilter;
-import com.seu.magicfilter.advanced.MagicNostalgiaFilter;
-import com.seu.magicfilter.advanced.MagicPixarFilter;
-import com.seu.magicfilter.advanced.MagicRiseFilter;
-import com.seu.magicfilter.advanced.MagicRomanceFilter;
-import com.seu.magicfilter.advanced.MagicSakuraFilter;
-import com.seu.magicfilter.advanced.MagicSierraFilter;
-import com.seu.magicfilter.advanced.MagicSketchFilter;
-import com.seu.magicfilter.advanced.MagicSkinWhitenFilter;
-import com.seu.magicfilter.advanced.MagicSunriseFilter;
-import com.seu.magicfilter.advanced.MagicSunsetFilter;
-import com.seu.magicfilter.advanced.MagicSutroFilter;
-import com.seu.magicfilter.advanced.MagicSweetsFilter;
-import com.seu.magicfilter.advanced.MagicTenderFilter;
-import com.seu.magicfilter.advanced.MagicToasterFilter;
-import com.seu.magicfilter.advanced.MagicValenciaFilter;
-import com.seu.magicfilter.advanced.MagicWaldenFilter;
-import com.seu.magicfilter.advanced.MagicWarmFilter;
-import com.seu.magicfilter.advanced.MagicWhiteCatFilter;
-import com.seu.magicfilter.advanced.MagicXproIIFilter;
-import com.seu.magicfilter.base.MagicLookupFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageBrightnessFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageContrastFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageExposureFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageHueFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageSaturationFilter;
-import com.seu.magicfilter.base.gpuimage.GPUImageSharpenFilter;
-
-public class MagicFilterFactory{
-
-    public static GPUImageFilter initFilters(MagicFilterType type) {
-        switch (type) {
-            case NONE:
-                return new GPUImageFilter();
-            case WHITECAT:
-                return new MagicWhiteCatFilter();
-            case BLACKCAT:
-                return new MagicBlackCatFilter();
-            case SKINWHITEN:
-                return new MagicSkinWhitenFilter();
-            case BEAUTY:
-                return new MagicBeautyFilter();
-            case ROMANCE:
-                return new MagicRomanceFilter();
-            case SAKURA:
-                return new MagicSakuraFilter();
-            case AMARO:
-                return new MagicAmaroFilter();
-            case WALDEN:
-                return new MagicWaldenFilter();
-            case ANTIQUE:
-                return new MagicAntiqueFilter();
-            case CALM:
-                return new MagicCalmFilter();
-            case BRANNAN:
-                return new MagicBrannanFilter();
-            case BROOKLYN:
-                return new MagicBrooklynFilter();
-            case EARLYBIRD:
-                return new MagicEarlyBirdFilter();
-            case FREUD:
-                return new MagicFreudFilter();
-            case HEFE:
-                return new MagicHefeFilter();
-            case HUDSON:
-                return new MagicHudsonFilter();
-            case INKWELL:
-                return new MagicInkwellFilter();
-            case KEVIN:
-                return new MagicKevinFilter();
-            case LOCKUP:
-                return new MagicLookupFilter("");
-            case LOMO:
-                return new MagicLomoFilter();
-            case N1977:
-                return new MagicN1977Filter();
-            case NASHVILLE:
-                return new MagicNashvilleFilter();
-            case PIXAR:
-                return new MagicPixarFilter();
-            case RISE:
-                return new MagicRiseFilter();
-            case SIERRA:
-                return new MagicSierraFilter();
-            case SUTRO:
-                return new MagicSutroFilter();
-            case TOASTER2:
-                return new MagicToasterFilter();
-            case VALENCIA:
-                return new MagicValenciaFilter();
-            case XPROII:
-                return new MagicXproIIFilter();
-            case EVERGREEN:
-                return new MagicEvergreenFilter();
-            case HEALTHY:
-                return new MagicHealthyFilter();
-            case COOL:
-                return new MagicCoolFilter();
-            case EMERALD:
-                return new MagicEmeraldFilter();
-            case LATTE:
-                return new MagicLatteFilter();
-            case WARM:
-                return new MagicWarmFilter();
-            case TENDER:
-                return new MagicTenderFilter();
-            case SWEETS:
-                return new MagicSweetsFilter();
-            case NOSTALGIA:
-                return new MagicNostalgiaFilter();
-            case SUNRISE:
-                return new MagicSunriseFilter();
-            case SUNSET:
-                return new MagicSunsetFilter();
-            case CRAYON:
-                return new MagicCrayonFilter();
-            case SKETCH:
-                return new MagicSketchFilter();
-            //image adjust
-            case BRIGHTNESS:
-                return new GPUImageBrightnessFilter();
-            case CONTRAST:
-                return new GPUImageContrastFilter();
-            case EXPOSURE:
-                return new GPUImageExposureFilter();
-            case HUE:
-                return new GPUImageHueFilter();
-            case SATURATION:
-                return new GPUImageSaturationFilter();
-            case SHARPEN:
-                return new GPUImageSharpenFilter();
-            case IMAGE_ADJUST:
-                return new MagicImageAdjustFilter();
-            default:
-                return null;
-        }
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/utils/MagicFilterType.java b/android/src/main/java/com/seu/magicfilter/utils/MagicFilterType.java
deleted file mode 100755
index 419c716..0000000
--- a/android/src/main/java/com/seu/magicfilter/utils/MagicFilterType.java
+++ /dev/null
@@ -1,59 +0,0 @@
-package com.seu.magicfilter.utils;
-
-/**
- * Created by why8222 on 2016/2/25.
- */
-public enum  MagicFilterType {
-    NONE,
-    FAIRYTALE,
-    SUNRISE,
-    SUNSET,
-    WHITECAT,
-    BLACKCAT,
-    SKINWHITEN,
-    BEAUTY,
-    HEALTHY,
-    SWEETS,
-    ROMANCE,
-    SAKURA,
-    WARM,
-    ANTIQUE,
-    NOSTALGIA,
-    CALM,
-    LATTE,
-    TENDER,
-    COOL,
-    EMERALD,
-    EVERGREEN,
-    CRAYON,
-    SKETCH,
-    AMARO,
-    BRANNAN,
-    BROOKLYN,
-    EARLYBIRD,
-    FREUD,
-    HEFE,
-    HUDSON,
-    INKWELL,
-    KEVIN,
-    LOCKUP,
-    LOMO,
-    N1977,
-    NASHVILLE,
-    PIXAR,
-    RISE,
-    SIERRA,
-    SUTRO,
-    TOASTER2,
-    VALENCIA,
-    WALDEN,
-    XPROII,
-    //image adjust
-    CONTRAST,
-    BRIGHTNESS,
-    EXPOSURE,
-    HUE,
-    SATURATION,
-    SHARPEN,
-    IMAGE_ADJUST
-}
diff --git a/android/src/main/java/com/seu/magicfilter/utils/OpenGLUtils.java b/android/src/main/java/com/seu/magicfilter/utils/OpenGLUtils.java
deleted file mode 100755
index b84e54b..0000000
--- a/android/src/main/java/com/seu/magicfilter/utils/OpenGLUtils.java
+++ /dev/null
@@ -1,230 +0,0 @@
-package com.seu.magicfilter.utils;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.Buffer;
-
-import javax.microedition.khronos.opengles.GL10;
-
-import android.content.Context;
-import android.content.res.AssetManager;
-import android.graphics.Bitmap;
-import android.graphics.BitmapFactory;
-import android.opengl.GLES11Ext;
-import android.opengl.GLES20;
-import android.opengl.GLUtils;
-import android.util.Log;
-
-public class OpenGLUtils {
-    public static final int NO_TEXTURE = -1;
-    public static final int NOT_INIT = -1;
-    public static final int ON_DRAWN = 1;
-
-    public static int loadTexture(Bitmap img, int usedTexId) {
-        return loadTexture(img, usedTexId, false);
-    }
-
-    public static int loadTexture(Bitmap img, int usedTexId, boolean recyled) {
-        if(img == null)
-            return NO_TEXTURE;
-        int textures[] = new int[1];
-        if (usedTexId == NO_TEXTURE) {
-            GLES20.glGenTextures(1, textures, 0);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textures[0]);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-
-            GLUtils.texImage2D(GLES20.GL_TEXTURE_2D, 0, img, 0);
-        } else {
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, usedTexId);
-            GLUtils.texSubImage2D(GLES20.GL_TEXTURE_2D, 0, 0, 0, img);
-            textures[0] = usedTexId;
-        }
-        if(recyled)
-            img.recycle();
-        return textures[0];
-    }
-
-    public static int loadTexture(Buffer data, int width, int height, int usedTexId) {
-        if(data == null)
-            return NO_TEXTURE;
-        int textures[] = new int[1];
-        if (usedTexId == NO_TEXTURE) {
-            GLES20.glGenTextures(1, textures, 0);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textures[0]);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-            GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, width, height,
-                    0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, data);
-        } else {
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, usedTexId);
-            GLES20.glTexSubImage2D(GLES20.GL_TEXTURE_2D, 0, 0, 0, width,
-                    height, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, data);
-            textures[0] = usedTexId;
-        }
-        return textures[0];
-    }
-
-    public static int loadTexture(Buffer data, int width, int height, int usedTexId, int type) {
-        if(data == null)
-            return NO_TEXTURE;
-        int textures[] = new int[1];
-        if (usedTexId == NO_TEXTURE) {
-            GLES20.glGenTextures(1, textures, 0);
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textures[0]);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D,
-                    GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-            GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, width, height,
-                    0, GLES20.GL_RGBA, type, data);
-        } else {
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, usedTexId);
-            GLES20.glTexSubImage2D(GLES20.GL_TEXTURE_2D, 0, 0, 0, width,
-                    height, GLES20.GL_RGBA, type, data);
-            textures[0] = usedTexId;
-        }
-        return textures[0];
-    }
-
-    public static int loadTexture(final Context context, final String name){
-        final int[] textureHandle = new int[1];
-
-        GLES20.glGenTextures(1, textureHandle, 0);
-
-        if (textureHandle[0] != 0){
-
-            // Read in the resource
-            final Bitmap bitmap = getImageFromAssetsFile(context,name);
-
-            // Bind to the texture in OpenGL
-            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textureHandle[0]);
-
-            // Set filtering
-            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
-            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
-            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
-            // Load the bitmap into the bound texture.
-            GLUtils.texImage2D(GLES20.GL_TEXTURE_2D, 0, bitmap, 0);
-
-            // Recycle the bitmap, since its data has been loaded into OpenGL.
-            bitmap.recycle();
-        }
-
-        if (textureHandle[0] == 0){
-            throw new RuntimeException("Error loading texture.");
-        }
-
-        return textureHandle[0];
-    }
-
-    private static Bitmap getImageFromAssetsFile(Context context,String fileName){
-        Bitmap image = null;
-        AssetManager am = context.getResources().getAssets();
-        try{
-            InputStream is = am.open(fileName);
-            image = BitmapFactory.decodeStream(is);
-            is.close();
-              }catch (IOException e){
-              e.printStackTrace();
-          }
-          return image;
-    }
-
-    public static int loadProgram(String strVSource, String strFSource) {
-        int iVShader;
-        int iFShader;
-        int iProgId;
-        int[] link = new int[1];
-        iVShader = loadShader(strVSource, GLES20.GL_VERTEX_SHADER);
-        if (iVShader == 0) {
-            Log.d("Load Program", "Vertex Shader Failed");
-            return 0;
-        }
-        iFShader = loadShader(strFSource, GLES20.GL_FRAGMENT_SHADER);
-        if (iFShader == 0) {
-            Log.d("Load Program", "Fragment Shader Failed");
-            return 0;
-        }
-
-        iProgId = GLES20.glCreateProgram();
-        GLES20.glAttachShader(iProgId, iVShader);
-        GLES20.glAttachShader(iProgId, iFShader);
-        GLES20.glLinkProgram(iProgId);
-        GLES20.glGetProgramiv(iProgId, GLES20.GL_LINK_STATUS, link, 0);
-        if (link[0] <= 0) {
-            Log.d("Load Program", "Linking Failed");
-            return 0;
-        }
-        GLES20.glDeleteShader(iVShader);
-        GLES20.glDeleteShader(iFShader);
-        return iProgId;
-    }
-
-    private static int loadShader(String strSource, int iType) {
-        int[] compiled = new int[1];
-        int iShader = GLES20.glCreateShader(iType);
-        GLES20.glShaderSource(iShader, strSource);
-        GLES20.glCompileShader(iShader);
-        GLES20.glGetShaderiv(iShader, GLES20.GL_COMPILE_STATUS, compiled, 0);
-        if (compiled[0] == 0) {
-            Log.e("Load Shader Failed", "Compilation\n" + GLES20.glGetShaderInfoLog(iShader));
-            return 0;
-        }
-        return iShader;
-    }
-
-    public static int getExternalOESTextureID(){
-        int[] texture = new int[1];
-        GLES20.glGenTextures(1, texture, 0);
-        GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, texture[0]);
-        GLES20.glTexParameterf(GLES11Ext.GL_TEXTURE_EXTERNAL_OES,
-                GL10.GL_TEXTURE_MIN_FILTER,GL10.GL_LINEAR);
-        GLES20.glTexParameterf(GLES11Ext.GL_TEXTURE_EXTERNAL_OES,
-                GL10.GL_TEXTURE_MAG_FILTER, GL10.GL_LINEAR);
-        GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES,
-                GL10.GL_TEXTURE_WRAP_S, GL10.GL_CLAMP_TO_EDGE);
-        GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES,
-                GL10.GL_TEXTURE_WRAP_T, GL10.GL_CLAMP_TO_EDGE);
-        return texture[0];
-    }
-
-    public static String readShaderFromRawResource(Context context, int resourceId){
-        final InputStream inputStream = context.getResources().openRawResource(resourceId);
-        final InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
-        final BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
-
-        String nextLine;
-        final StringBuilder body = new StringBuilder();
-
-        try{
-            while ((nextLine = bufferedReader.readLine()) != null){
-                body.append(nextLine);
-                body.append('\n');
-            }
-        }
-        catch (IOException e){
-            return null;
-        }
-        return body.toString();
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/utils/Rotation.java b/android/src/main/java/com/seu/magicfilter/utils/Rotation.java
deleted file mode 100755
index 105cc78..0000000
--- a/android/src/main/java/com/seu/magicfilter/utils/Rotation.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.utils;
-
-public enum Rotation {
-    NORMAL, ROTATION_90, ROTATION_180, ROTATION_270;
-
-    /**
-     * Retrieves the int representation of the Rotation.
-     *
-     * @return 0, 90, 180 or 270
-     */
-    public int asInt() {
-        switch (this) {
-            case NORMAL: return 0;
-            case ROTATION_90: return 90;
-            case ROTATION_180: return 180;
-            case ROTATION_270: return 270;
-            default: throw new IllegalStateException("Unknown Rotation!");
-        }
-    }
-
-    /**
-     * Create a Rotation from an integer. Needs to be either 0, 90, 180 or 270.
-     *
-     * @param rotation 0, 90, 180 or 270
-     * @return Rotation object
-     */
-    public static Rotation fromInt(int rotation) {
-        switch (rotation) {
-            case 0: return NORMAL;
-            case 90: return ROTATION_90;
-            case 180: return ROTATION_180;
-            case 270: return ROTATION_270;
-            case 360: return NORMAL;
-            default: throw new IllegalStateException(
-                    rotation + " is an unknown rotation. Needs to be either 0, 90, 180 or 270!");
-        }
-    }
-}
diff --git a/android/src/main/java/com/seu/magicfilter/utils/TextureRotationUtil.java b/android/src/main/java/com/seu/magicfilter/utils/TextureRotationUtil.java
deleted file mode 100755
index 4b28fea..0000000
--- a/android/src/main/java/com/seu/magicfilter/utils/TextureRotationUtil.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2012 CyberAgent
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.seu.magicfilter.utils;
-
-public class TextureRotationUtil {
-
-    public static final float TEXTURE_NO_ROTATION[] = {
-        0.0f, 1.0f,
-        1.0f, 1.0f,
-        0.0f, 0.0f,
-        1.0f, 0.0f,
-    };
-
-    public static final float TEXTURE_ROTATED_90[] = {
-        1.0f, 1.0f,
-        1.0f, 0.0f,
-        0.0f, 1.0f,
-        0.0f, 0.0f,
-    };
-    public static final float TEXTURE_ROTATED_180[] = {
-        1.0f, 0.0f,
-        0.0f, 0.0f,
-        1.0f, 1.0f,
-        0.0f, 1.0f,
-    };
-    public static final float TEXTURE_ROTATED_270[] = {
-        0.0f, 0.0f,
-        0.0f, 1.0f,
-        1.0f, 0.0f,
-        1.0f, 1.0f,
-    };
-    
-    public static final float CUBE[] = {
-        -1.0f, -1.0f,
-        1.0f, -1.0f,
-        -1.0f, 1.0f,
-        1.0f, 1.0f,
-    };
-    
-    private TextureRotationUtil() {}
-
-    public static float[] getRotation(final Rotation rotation, final boolean flipHorizontal,
-                                                         final boolean flipVertical) {
-        float[] rotatedTex;
-        switch (rotation) {
-            case ROTATION_90:
-                rotatedTex = TEXTURE_ROTATED_90;
-                break;
-            case ROTATION_180:
-                rotatedTex = TEXTURE_ROTATED_180;
-                break;
-            case ROTATION_270:
-                rotatedTex = TEXTURE_ROTATED_270;
-                break;
-            case NORMAL:
-            default:
-                rotatedTex = TEXTURE_NO_ROTATION;
-                break;
-        }
-        if (flipHorizontal) {
-            rotatedTex = new float[]{
-                flip(rotatedTex[0]), rotatedTex[1],
-                flip(rotatedTex[2]), rotatedTex[3],
-                flip(rotatedTex[4]), rotatedTex[5],
-                flip(rotatedTex[6]), rotatedTex[7],
-            };
-        }
-        if (flipVertical) {
-            rotatedTex = new float[]{
-                rotatedTex[0], flip(rotatedTex[1]),
-                rotatedTex[2], flip(rotatedTex[3]),
-                rotatedTex[4], flip(rotatedTex[5]),
-                rotatedTex[6], flip(rotatedTex[7]),
-            };
-        }
-        return rotatedTex;
-    }
-
-    private static float flip(final float i) {
-        return i == 0.0f ? 1.0f : 0.0f;
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsAllocator.java b/android/src/main/java/net/ossrs/yasea/SrsAllocator.java
deleted file mode 100755
index d50dd57..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsAllocator.java
+++ /dev/null
@@ -1,116 +0,0 @@
-package net.ossrs.yasea;
-
-import java.util.Arrays;
-
-public final class SrsAllocator {
-
-    public class Allocation {
-
-        private byte[] data;
-        private int size;
-
-        public Allocation(int size) {
-            this.data = new byte[size];
-            this.size = 0;
-        }
-
-        public byte[] array() {
-            return data;
-        }
-
-        public int size() {
-            return size;
-        }
-
-        public void appendOffset(int offset) {
-            size += offset;
-        }
-
-        public void clear() {
-            size = 0;
-        }
-
-        public void put(byte b) {
-            data[size++] = b;
-        }
-
-        public void put(byte b, int pos) {
-            data[pos++] = b;
-            size = pos > size ? pos : size;
-        }
-
-        public void put(short s) {
-            put((byte) s);
-            put((byte) (s >>> 8));
-        }
-
-        public void put(int i) {
-            put((byte) i);
-            put((byte) (i >>> 8));
-            put((byte) (i >>> 16));
-            put((byte) (i >>> 24));
-        }
-
-        public void put(byte[] bs) {
-            System.arraycopy(bs, 0, data, size, bs.length);
-            size += bs.length;
-        }
-    }
-
-    private final int individualAllocationSize;
-    private volatile int availableSentinel;
-    private Allocation[] availableAllocations;
-
-    /**
-     * Constructs an instance without creating any {@link Allocation}s up front.
-     *
-     * @param individualAllocationSize The length of each individual {@link Allocation}.
-     */
-    public SrsAllocator(int individualAllocationSize) {
-      this(individualAllocationSize, 0);
-    }
-
-    /**
-     * Constructs an instance with some {@link Allocation}s created up front.
-     * <p>
-     *
-     * @param individualAllocationSize The length of each individual {@link Allocation}.
-     * @param initialAllocationCount The number of allocations to create up front.
-     */
-    public SrsAllocator(int individualAllocationSize, int initialAllocationCount) {
-        this.individualAllocationSize = individualAllocationSize;
-        this.availableSentinel = initialAllocationCount + 10;
-        this.availableAllocations = new Allocation[availableSentinel];
-        for (int i = 0; i < availableSentinel; i++) {
-            availableAllocations[i] = new Allocation(individualAllocationSize);
-        }
-    }
-
-    public synchronized Allocation allocate(int size) {
-        for (int i = 0; i < availableSentinel; i++) {
-            if (availableAllocations[i].size() >= size) {
-                Allocation ret = availableAllocations[i];
-                availableAllocations[i] = null;
-                return ret;
-            }
-        }
-
-        return new Allocation(size > individualAllocationSize ? size : individualAllocationSize);
-    }
-
-    public synchronized void release(Allocation allocation) {
-        allocation.clear();
-
-        for (int i = 0; i < availableSentinel; i++) {
-            if (availableAllocations[i].size() == 0) {
-                availableAllocations[i] = allocation;
-                return;
-            }
-        }
-
-        if (availableSentinel + 1 > availableAllocations.length) {
-            availableAllocations = Arrays.copyOf(availableAllocations, availableAllocations.length * 2);
-        }
-        availableAllocations[availableSentinel++] = allocation;
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsCameraView.java b/android/src/main/java/net/ossrs/yasea/SrsCameraView.java
deleted file mode 100755
index ca38afb..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsCameraView.java
+++ /dev/null
@@ -1,391 +0,0 @@
-package net.ossrs.yasea;
-
-import android.content.Context;
-import android.content.res.Configuration;
-import android.graphics.ImageFormat;
-import android.graphics.SurfaceTexture;
-import android.hardware.Camera;
-import android.opengl.GLES20;
-import android.opengl.GLSurfaceView;
-import android.opengl.Matrix;
-import android.os.Build;
-import android.util.AttributeSet;
-
-import com.seu.magicfilter.base.gpuimage.GPUImageFilter;
-import com.seu.magicfilter.utils.MagicFilterFactory;
-import com.seu.magicfilter.utils.MagicFilterType;
-import com.seu.magicfilter.utils.OpenGLUtils;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.IntBuffer;
-import java.util.List;
-import java.util.concurrent.ConcurrentLinkedQueue;
-
-import javax.microedition.khronos.egl.EGLConfig;
-import javax.microedition.khronos.opengles.GL10;
-
-/**
- * Created by Leo Ma on 2016/2/25.
- */
-public class SrsCameraView extends GLSurfaceView implements GLSurfaceView.Renderer {
-
-    private GPUImageFilter magicFilter;
-    private SurfaceTexture surfaceTexture;
-    private int mOESTextureId = OpenGLUtils.NO_TEXTURE;
-    private int mSurfaceWidth;
-    private int mSurfaceHeight;
-    private int mPreviewWidth;
-    private int mPreviewHeight;
-    private boolean mIsEncoding;
-    private float mInputAspectRatio;
-    private float mOutputAspectRatio;
-    private float[] mProjectionMatrix = new float[16];
-    private float[] mSurfaceMatrix = new float[16];
-    private float[] mTransformMatrix = new float[16];
-
-    private Camera mCamera;
-    private ByteBuffer mGLPreviewBuffer;
-    private int mCamId = -1;
-    private int mPreviewRotation = 90;
-    private int mPreviewOrientation = Configuration.ORIENTATION_PORTRAIT;
-
-    private Thread worker;
-    private final Object writeLock = new Object();
-    private ConcurrentLinkedQueue<IntBuffer> mGLIntBufferCache = new ConcurrentLinkedQueue<>();
-    private PreviewCallback mPrevCb;
-
-    public SrsCameraView(Context context) {
-        this(context, null);
-    }
-
-    public SrsCameraView(Context context, AttributeSet attrs) {
-        super(context, attrs);
-
-        setEGLContextClientVersion(2);
-        setRenderer(this);
-        setRenderMode(GLSurfaceView.RENDERMODE_WHEN_DIRTY);
-    }
-
-    @Override
-    public void onSurfaceCreated(GL10 gl, EGLConfig config) {
-        GLES20.glDisable(GL10.GL_DITHER);
-        GLES20.glClearColor(0, 0, 0, 0);
-
-        magicFilter = new GPUImageFilter(MagicFilterType.NONE);
-        magicFilter.init(getContext().getApplicationContext());
-        magicFilter.onInputSizeChanged(mPreviewWidth, mPreviewHeight);
-
-        mOESTextureId = OpenGLUtils.getExternalOESTextureID();
-        surfaceTexture = new SurfaceTexture(mOESTextureId);
-        surfaceTexture.setOnFrameAvailableListener(new SurfaceTexture.OnFrameAvailableListener() {
-            @Override
-            public void onFrameAvailable(SurfaceTexture surfaceTexture) {
-                requestRender();
-            }
-        });
-
-        // For camera preview on activity creation
-        if (mCamera != null) {
-            try {
-                mCamera.setPreviewTexture(surfaceTexture);
-            } catch (IOException ioe) {
-                ioe.printStackTrace();
-            }
-        }
-    }
-
-    @Override
-    public void onSurfaceChanged(GL10 gl, int width, int height) {
-        GLES20.glViewport(0, 0, width, height);
-        mSurfaceWidth = width;
-        mSurfaceHeight = height;
-        magicFilter.onDisplaySizeChanged(width, height);
-
-        mOutputAspectRatio = width > height ? (float) width / height : (float) height / width;
-        float aspectRatio = mOutputAspectRatio / mInputAspectRatio;
-        if (width > height) {
-            Matrix.orthoM(mProjectionMatrix, 0, -1.0f, 1.0f, -aspectRatio, aspectRatio, -1.0f, 1.0f);
-        } else {
-            Matrix.orthoM(mProjectionMatrix, 0, -aspectRatio, aspectRatio, -1.0f, 1.0f, -1.0f, 1.0f);
-        }
-    }
-
-    @Override
-    public void onDrawFrame(GL10 gl) {
-        GLES20.glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
-        GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT | GLES20.GL_DEPTH_BUFFER_BIT);
-
-        surfaceTexture.updateTexImage();
-
-        surfaceTexture.getTransformMatrix(mSurfaceMatrix);
-        Matrix.multiplyMM(mTransformMatrix, 0, mSurfaceMatrix, 0, mProjectionMatrix, 0);
-        magicFilter.setTextureTransformMatrix(mTransformMatrix);
-        magicFilter.onDrawFrame(mOESTextureId);
-
-        if (mIsEncoding) {
-            mGLIntBufferCache.add(magicFilter.getGLFboBuffer());
-            synchronized (writeLock) {
-                writeLock.notifyAll();
-            }
-        }
-    }
-
-    public void setPreviewCallback(PreviewCallback cb) {
-        mPrevCb = cb;
-    }
-
-    public int[] setPreviewResolution(int width, int height) {
-        getHolder().setFixedSize(width, height);
-
-        mCamera = openCamera();
-        mPreviewWidth = width;
-        mPreviewHeight = height;
-        Camera.Size rs = adaptPreviewResolution(mCamera.new Size(width, height));
-        if (rs != null) {
-            mPreviewWidth = rs.width;
-            mPreviewHeight = rs.height;
-        }
-        mCamera.getParameters().setPreviewSize(mPreviewWidth, mPreviewHeight);
-
-        mGLPreviewBuffer = ByteBuffer.allocateDirect(mPreviewWidth * mPreviewHeight * 4);
-        mInputAspectRatio = mPreviewWidth > mPreviewHeight ?
-            (float) mPreviewWidth / mPreviewHeight : (float) mPreviewHeight / mPreviewWidth;
-
-        return new int[] { mPreviewWidth, mPreviewHeight };
-    }
-
-    public boolean setFilter(final MagicFilterType type) {
-        if (mCamera == null) {
-            return false;
-        }
-
-        queueEvent(new Runnable() {
-            @Override
-            public void run() {
-                if (magicFilter != null) {
-                    magicFilter.destroy();
-                }
-                magicFilter = MagicFilterFactory.initFilters(type);
-                if (magicFilter != null) {
-                    magicFilter.init(getContext().getApplicationContext());
-                    magicFilter.onInputSizeChanged(mPreviewWidth, mPreviewHeight);
-                    magicFilter.onDisplaySizeChanged(mSurfaceWidth, mSurfaceHeight);
-                }
-            }
-        });
-        requestRender();
-        return true;
-    }
-
-    private void deleteTextures() {
-        if (mOESTextureId != OpenGLUtils.NO_TEXTURE) {
-            queueEvent(new Runnable() {
-                @Override
-                public void run() {
-                    GLES20.glDeleteTextures(1, new int[]{ mOESTextureId }, 0);
-                    mOESTextureId = OpenGLUtils.NO_TEXTURE;
-                }
-            });
-        }
-    }
-
-    public void setCameraId(int id) {
-        mCamId = id;
-        setPreviewOrientation(mPreviewOrientation);
-    }
-
-    public void setPreviewOrientation(int orientation) {
-        mPreviewOrientation = orientation;
-        Camera.CameraInfo info = new Camera.CameraInfo();
-        Camera.getCameraInfo(mCamId, info);
-        if (info.facing == Camera.CameraInfo.CAMERA_FACING_BACK) {
-            if (orientation == Configuration.ORIENTATION_PORTRAIT) {
-                mPreviewRotation = Build.VERSION.SDK_INT >= Build.VERSION_CODES.N ? 270 : 90;
-            } else if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
-                mPreviewRotation = Build.VERSION.SDK_INT >= Build.VERSION_CODES.N ? 180 : 0;
-            }
-        } else if (info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) {
-            if (orientation == Configuration.ORIENTATION_PORTRAIT) {
-                mPreviewRotation = 90;
-            } else if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
-                mPreviewRotation = 0;
-            }
-        }
-    }
-
-    public int getCameraId() {
-        return mCamId;
-    }
-
-    public void enableEncoding() {
-        worker = new Thread(new Runnable() {
-            @Override
-            public void run() {
-                while (!Thread.interrupted()) {
-                    while (!mGLIntBufferCache.isEmpty()) {
-                        IntBuffer picture = mGLIntBufferCache.poll();
-                        mGLPreviewBuffer.asIntBuffer().put(picture.array());
-                        mPrevCb.onGetRgbaFrame(mGLPreviewBuffer.array(), mPreviewWidth, mPreviewHeight);
-                    }
-                    // Waiting for next frame
-                    synchronized (writeLock) {
-                        try {
-                            // isEmpty() may take some time, so we set timeout to detect next frame
-                            writeLock.wait(500);
-                        } catch (InterruptedException ie) {
-                            worker.interrupt();
-                        }
-                    }
-                }
-            }
-        });
-        worker.start();
-        mIsEncoding = true;
-    }
-
-    public void disableEncoding() {
-        mIsEncoding = false;
-        mGLIntBufferCache.clear();
-
-        if (worker != null) {
-            worker.interrupt();
-            try {
-                worker.join();
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-                worker.interrupt();
-            }
-            worker = null;
-        }
-    }
-
-    public boolean startCamera() {
-        if (mCamera == null) {
-            mCamera = openCamera();
-            if (mCamera == null) {
-                return false;
-            }
-        }
-
-        Camera.Parameters params = mCamera.getParameters();
-        params.setPictureSize(mPreviewWidth, mPreviewHeight);
-        params.setPreviewSize(mPreviewWidth, mPreviewHeight);
-        int[] range = adaptFpsRange(SrsEncoder.VFPS, params.getSupportedPreviewFpsRange());
-        params.setPreviewFpsRange(range[0], range[1]);
-        params.setPreviewFormat(ImageFormat.NV21);
-        params.setFlashMode(Camera.Parameters.FLASH_MODE_OFF);
-        params.setWhiteBalance(Camera.Parameters.WHITE_BALANCE_AUTO);
-        params.setSceneMode(Camera.Parameters.SCENE_MODE_AUTO);
-
-        List<String> supportedFocusModes = params.getSupportedFocusModes();
-        if (supportedFocusModes != null && !supportedFocusModes.isEmpty()) {
-            if (supportedFocusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
-                params.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
-            } else if (supportedFocusModes.contains(Camera.Parameters.FOCUS_MODE_AUTO)) {
-                params.setFocusMode(Camera.Parameters.FOCUS_MODE_AUTO);
-                mCamera.autoFocus(null);
-            } else {
-                params.setFocusMode(supportedFocusModes.get(0));
-            }
-        }
-
-        List<String> supportedFlashModes = params.getSupportedFlashModes();
-        if (supportedFlashModes != null && !supportedFlashModes.isEmpty()) {
-            if (supportedFlashModes.contains(Camera.Parameters.FLASH_MODE_TORCH)) {
-                params.setFlashMode(Camera.Parameters.FLASH_MODE_TORCH);
-            } else {
-                params.setFlashMode(supportedFlashModes.get(0));
-            }
-        }
-
-        mCamera.setParameters(params);
-
-        mCamera.setDisplayOrientation(mPreviewRotation);
-
-        try {
-            mCamera.setPreviewTexture(surfaceTexture);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-        mCamera.startPreview();
-
-        return true;
-    }
-
-    public void stopCamera() {
-        disableEncoding();
-
-        if (mCamera != null) {
-            mCamera.stopPreview();
-            mCamera.release();
-            mCamera = null;
-        }
-    }
-
-    private Camera openCamera() {
-        Camera camera;
-        if (mCamId < 0) {
-            Camera.CameraInfo info = new Camera.CameraInfo();
-            int numCameras = Camera.getNumberOfCameras();
-            int frontCamId = -1;
-            int backCamId = -1;
-            for (int i = 0; i < numCameras; i++) {
-                Camera.getCameraInfo(i, info);
-                if (info.facing == Camera.CameraInfo.CAMERA_FACING_BACK) {
-                    backCamId = i;
-                } else if (info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) {
-                    frontCamId = i;
-                    break;
-                }
-            }
-            if (frontCamId != -1) {
-                mCamId = frontCamId;
-            } else if (backCamId != -1) {
-                mCamId = backCamId;
-            } else {
-                mCamId = 0;
-            }
-        }
-        camera = Camera.open(mCamId);
-        return camera;
-    }
-
-    private Camera.Size adaptPreviewResolution(Camera.Size resolution) {
-        float diff = 100f;
-        float xdy = (float) resolution.width / (float) resolution.height;
-        Camera.Size best = null;
-        for (Camera.Size size : mCamera.getParameters().getSupportedPreviewSizes()) {
-            if (size.equals(resolution)) {
-                return size;
-            }
-            float tmp = Math.abs(((float) size.width / (float) size.height) - xdy);
-            if (tmp < diff) {
-                diff = tmp;
-                best = size;
-            }
-        }
-        return best;
-    }
-
-    private int[] adaptFpsRange(int expectedFps, List<int[]> fpsRanges) {
-        expectedFps *= 1000;
-        int[] closestRange = fpsRanges.get(0);
-        int measure = Math.abs(closestRange[0] - expectedFps) + Math.abs(closestRange[1] - expectedFps);
-        for (int[] range : fpsRanges) {
-            if (range[0] <= expectedFps && range[1] >= expectedFps) {
-                int curMeasure = Math.abs(range[0] - expectedFps) + Math.abs(range[1] - expectedFps);
-                if (curMeasure < measure) {
-                    closestRange = range;
-                    measure = curMeasure;
-                }
-            }
-        }
-        return closestRange;
-    }
-
-    public interface PreviewCallback {
-
-        void onGetRgbaFrame(byte[] data, int width, int height);
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsEncodeHandler.java b/android/src/main/java/net/ossrs/yasea/SrsEncodeHandler.java
deleted file mode 100755
index e33a202..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsEncodeHandler.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package net.ossrs.yasea;
-
-import android.os.Handler;
-import android.os.Message;
-
-import com.facebook.react.uimanager.ThemedReactContext;
-
-import java.lang.ref.WeakReference;
-
-/**
- * Created by leo.ma on 2016/11/4.
- */
-
-public class SrsEncodeHandler extends Handler {
-
-    private static final int MSG_ENCODE_NETWORK_WEAK = 0;
-    private static final int MSG_ENCODE_NETWORK_RESUME = 1;
-    private static final int MSG_ENCODE_ILLEGAL_ARGUMENT_EXCEPTION = 2;
-
-    private WeakReference<SrsEncodeListener> mWeakListener;
-
-    public SrsEncodeHandler(SrsEncodeListener listener) {
-        mWeakListener = new WeakReference<>(listener);
-    }
-
-
-    public void notifyNetworkWeak() {
-        sendEmptyMessage(MSG_ENCODE_NETWORK_WEAK);
-    }
-
-    public void notifyNetworkResume() {
-        sendEmptyMessage(MSG_ENCODE_NETWORK_RESUME);
-    }
-
-    public void notifyEncodeIllegalArgumentException(IllegalArgumentException e) {
-        obtainMessage(MSG_ENCODE_ILLEGAL_ARGUMENT_EXCEPTION, e).sendToTarget();
-    }
-    
-    @Override  // runs on UI thread
-    public void handleMessage(Message msg) {
-        SrsEncodeListener listener = mWeakListener.get();
-        if (listener == null) {
-            return;
-        }
-
-        switch (msg.what) {
-            case MSG_ENCODE_NETWORK_WEAK:
-                listener.onNetworkWeak();
-                break;
-            case MSG_ENCODE_NETWORK_RESUME:
-                listener.onNetworkResume();
-                break;
-            case MSG_ENCODE_ILLEGAL_ARGUMENT_EXCEPTION:
-                listener.onEncodeIllegalArgumentException((IllegalArgumentException) msg.obj);
-            default:
-                throw new RuntimeException("unknown msg " + msg.what);
-        }
-    }
-
-    public interface SrsEncodeListener {
-
-        void onNetworkWeak();
-
-        void onNetworkResume();
-
-        void onEncodeIllegalArgumentException(IllegalArgumentException e);
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsEncoder.java b/android/src/main/java/net/ossrs/yasea/SrsEncoder.java
deleted file mode 100755
index 71128b8..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsEncoder.java
+++ /dev/null
@@ -1,508 +0,0 @@
-package net.ossrs.yasea;
-
-import android.content.res.Configuration;
-import android.media.AudioFormat;
-import android.media.AudioRecord;
-import android.media.MediaCodec;
-import android.media.MediaCodecInfo;
-import android.media.MediaCodecList;
-import android.media.MediaFormat;
-import android.media.MediaRecorder;
-import android.util.Log;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Created by Leo Ma on 4/1/2016.
- */
-public class SrsEncoder {
-    private static final String TAG = "SrsEncoder";
-
-    public static final String VCODEC = "video/avc";
-    public static final String ACODEC = "audio/mp4a-latm";
-    public static String x264Preset = "veryfast";
-    public static int vPrevWidth = 640;
-    public static int vPrevHeight = 360;
-    public static int vPortraitWidth = 720;
-    public static int vPortraitHeight = 1280;
-    public static int vLandscapeWidth = 1280;
-    public static int vLandscapeHeight = 720;
-    public static int vOutWidth = 720;   // Note: the stride of resolution must be set as 16x for hard encoding with some chip like MTK
-    public static int vOutHeight = 1280;  // Since Y component is quadruple size as U and V component, the stride must be set as 32x
-    public static int vBitrate = 1200 * 1024;  // 1200 kbps
-    public static final int VFPS = 24;
-    public static final int VGOP = 48;
-    public static final int ASAMPLERATE = 44100;
-    public static int aChannelConfig = AudioFormat.CHANNEL_IN_STEREO;
-    public static final int ABITRATE = 128 * 1024;  // 128 kbps
-
-    private SrsEncodeHandler mHandler;
-
-    private SrsFlvMuxer flvMuxer;
-    private SrsMp4Muxer mp4Muxer;
-
-    private MediaCodecInfo vmci;
-    private MediaCodec vencoder;
-    private MediaCodec aencoder;
-    private MediaCodec.BufferInfo vebi = new MediaCodec.BufferInfo();
-    private MediaCodec.BufferInfo aebi = new MediaCodec.BufferInfo();
-
-    private boolean networkWeakTriggered = false;
-    private boolean mCameraFaceFront = true;
-    private boolean useSoftEncoder = false;
-    private boolean canSoftEncode = false;
-
-    private long mPresentTimeUs;
-
-    private int mVideoColorFormat;
-
-    private int videoFlvTrack;
-    private int videoMp4Track;
-    private int audioFlvTrack;
-    private int audioMp4Track;
-
-    // Y, U (Cb) and V (Cr)
-    // yuv420                     yuv yuv yuv yuv
-    // yuv420p (planar)   yyyy*2 uu vv
-    // yuv420sp(semi-planner)   yyyy*2 uv uv
-    // I420 -> YUV420P   yyyy*2 uu vv
-    // YV12 -> YUV420P   yyyy*2 vv uu
-    // NV12 -> YUV420SP  yyyy*2 uv uv
-    // NV21 -> YUV420SP  yyyy*2 vu vu
-    // NV16 -> YUV422SP  yyyy uv uv
-    // YUY2 -> YUV422SP  yuyv yuyv
-
-    public SrsEncoder(SrsEncodeHandler handler) {
-        mHandler = handler;
-        mVideoColorFormat = chooseVideoEncoder();
-    }
-
-    public void setFlvMuxer(SrsFlvMuxer flvMuxer) {
-        this.flvMuxer = flvMuxer;
-    }
-
-    public void setMp4Muxer(SrsMp4Muxer mp4Muxer) {
-        this.mp4Muxer = mp4Muxer;
-    }
-
-    public boolean start() {
-        if (flvMuxer == null || mp4Muxer == null) {
-            return false;
-        }
-
-        // the referent PTS for video and audio encoder.
-        mPresentTimeUs = System.nanoTime() / 1000;
-
-        // Note: the stride of resolution must be set as 16x for hard encoding with some chip like MTK
-        // Since Y component is quadruple size as U and V component, the stride must be set as 32x
-        if (!useSoftEncoder && (vOutWidth % 32 != 0 || vOutHeight % 32 != 0)) {
-            if (vmci.getName().contains("MTK")) {
-                //throw new AssertionError("MTK encoding revolution stride must be 32x");
-            }
-        }
-
-        setEncoderResolution(vOutWidth, vOutHeight);
-        setEncoderFps(VFPS);
-        setEncoderGop(VGOP);
-        // Unfortunately for some android phone, the output fps is less than 10 limited by the
-        // capacity of poor cheap chips even with x264. So for the sake of quick appearance of
-        // the first picture on the player, a spare lower GOP value is suggested. But note that
-        // lower GOP will produce more I frames and therefore more streaming data flow.
-        // setEncoderGop(15);
-        setEncoderBitrate(vBitrate);
-        setEncoderPreset(x264Preset);
-
-        if (useSoftEncoder) {
-            canSoftEncode = openSoftEncoder();
-            if (!canSoftEncode) {
-                return false;
-            }
-        }
-
-        // aencoder pcm to aac raw stream.
-        // requires sdk level 16+, Android 4.1, 4.1.1, the JELLY_BEAN
-        try {
-            aencoder = MediaCodec.createEncoderByType(ACODEC);
-        } catch (IOException e) {
-            Log.e(TAG, "create aencoder failed.");
-            e.printStackTrace();
-            return false;
-        }
-
-        // setup the aencoder.
-        // @see https://developer.android.com/reference/android/media/MediaCodec.html
-        int ach = aChannelConfig == AudioFormat.CHANNEL_IN_STEREO ? 2 : 1;
-        MediaFormat audioFormat = MediaFormat.createAudioFormat(ACODEC, ASAMPLERATE, ach);
-        audioFormat.setInteger(MediaFormat.KEY_BIT_RATE, ABITRATE);
-        audioFormat.setInteger(MediaFormat.KEY_MAX_INPUT_SIZE, 0);
-        aencoder.configure(audioFormat, null, null, MediaCodec.CONFIGURE_FLAG_ENCODE);
-        // add the audio tracker to muxer.
-        audioFlvTrack = flvMuxer.addTrack(audioFormat);
-        audioMp4Track = mp4Muxer.addTrack(audioFormat);
-
-        // vencoder yuv to 264 es stream.
-        // requires sdk level 16+, Android 4.1, 4.1.1, the JELLY_BEAN
-        try {
-            vencoder = MediaCodec.createByCodecName(vmci.getName());
-        } catch (IOException e) {
-            Log.e(TAG, "create vencoder failed.");
-            e.printStackTrace();
-            return false;
-        }
-
-        // setup the vencoder.
-        // Note: landscape to portrait, 90 degree rotation, so we need to switch width and height in configuration
-        MediaFormat videoFormat = MediaFormat.createVideoFormat(VCODEC, vOutWidth, vOutHeight);
-        videoFormat.setInteger(MediaFormat.KEY_COLOR_FORMAT, mVideoColorFormat);
-        videoFormat.setInteger(MediaFormat.KEY_MAX_INPUT_SIZE, 0);
-        videoFormat.setInteger(MediaFormat.KEY_BIT_RATE, vBitrate);
-        videoFormat.setInteger(MediaFormat.KEY_FRAME_RATE, VFPS);
-        videoFormat.setInteger(MediaFormat.KEY_I_FRAME_INTERVAL, VGOP / VFPS);
-        vencoder.configure(videoFormat, null, null, MediaCodec.CONFIGURE_FLAG_ENCODE);
-        // add the video tracker to muxer.
-        videoFlvTrack = flvMuxer.addTrack(videoFormat);
-        videoMp4Track = mp4Muxer.addTrack(videoFormat);
-
-        // start device and encoder.
-        vencoder.start();
-        aencoder.start();
-        return true;
-    }
-
-    public void stop() {
-        if (useSoftEncoder) {
-            closeSoftEncoder();
-            canSoftEncode = false;
-        }
-
-        if (aencoder != null) {
-            Log.i(TAG, "stop aencoder");
-            aencoder.stop();
-            aencoder.release();
-            aencoder = null;
-        }
-
-        if (vencoder != null) {
-            Log.i(TAG, "stop vencoder");
-            vencoder.stop();
-            vencoder.release();
-            vencoder = null;
-        }
-    }
-
-    public void setCameraFrontFace() {
-        mCameraFaceFront = true;
-    }
-
-    public void setCameraBackFace() {
-        mCameraFaceFront = false;
-    }
-
-    public void switchToSoftEncoder() {
-        useSoftEncoder = true;
-    }
-
-    public void switchToHardEncoder() {
-        useSoftEncoder = false;
-    }
-
-    public boolean isSoftEncoder() {
-        return useSoftEncoder;
-    }
-
-    public boolean canHardEncode() {
-        return vencoder != null;
-    }
-
-    public boolean canSoftEncode() {
-        return canSoftEncode;
-    }
-
-    public boolean isEnabled() {
-        return canHardEncode() || canSoftEncode();
-    }
-
-    public void setPreviewResolution(int width, int height) {
-        vPrevWidth = width;
-        vPrevHeight = height;
-    }
-
-    public void setPortraitResolution(int width, int height) {
-        vOutWidth = width;
-        vOutHeight = height;
-        vPortraitWidth = width;
-        vPortraitHeight = height;
-        vLandscapeWidth = height;
-        vLandscapeHeight = width;
-    }
-
-    public void setLandscapeResolution(int width, int height) {
-        vOutWidth = width;
-        vOutHeight = height;
-        vLandscapeWidth = width;
-        vLandscapeHeight = height;
-        vPortraitWidth = height;
-        vPortraitHeight = width;
-    }
-
-    public void setVideoHDMode() {
-        vBitrate = 1200 * 1024;  // 1200 kbps
-        x264Preset = "veryfast";
-    }
-
-    public void setVideoSmoothMode() {
-        vBitrate = 500 * 1024;  // 500 kbps
-        x264Preset = "superfast";
-    }
-
-    public int getPreviewWidth() {
-        return vPrevWidth;
-    }
-
-    public int getPreviewHeight() {
-        return vPrevHeight;
-    }
-
-    public int getOutputWidth() {
-        return vOutWidth;
-    }
-
-    public int getOutputHeight() {
-        return vOutHeight;
-    }
-
-    public void setScreenOrientation(int orientation) {
-        if (orientation == Configuration.ORIENTATION_PORTRAIT) {
-            vOutWidth = vPortraitWidth;
-            vOutHeight = vPortraitHeight;
-        } else if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
-            vOutWidth = vLandscapeWidth;
-            vOutHeight = vLandscapeHeight;
-        }
-        
-        // Note: the stride of resolution must be set as 16x for hard encoding with some chip like MTK
-        // Since Y component is quadruple size as U and V component, the stride must be set as 32x
-        if (!useSoftEncoder && (vOutWidth % 32 != 0 || vOutHeight % 32 != 0)) {
-            if (vmci.getName().contains("MTK")) {
-                //throw new AssertionError("MTK encoding revolution stride must be 32x");
-            }
-        }
-
-        setEncoderResolution(vOutWidth, vOutHeight);
-    }
-
-    private void onProcessedYuvFrame(byte[] yuvFrame, long pts) {
-        ByteBuffer[] inBuffers = vencoder.getInputBuffers();
-        ByteBuffer[] outBuffers = vencoder.getOutputBuffers();
-
-        int inBufferIndex = vencoder.dequeueInputBuffer(-1);
-        if (inBufferIndex >= 0) {
-            ByteBuffer bb = inBuffers[inBufferIndex];
-            bb.clear();
-            bb.put(yuvFrame, 0, yuvFrame.length);
-            vencoder.queueInputBuffer(inBufferIndex, 0, yuvFrame.length, pts, 0);
-        }
-
-        for (; ; ) {
-            int outBufferIndex = vencoder.dequeueOutputBuffer(vebi, 0);
-            if (outBufferIndex >= 0) {
-                ByteBuffer bb = outBuffers[outBufferIndex];
-                onEncodedAnnexbFrame(bb, vebi);
-                vencoder.releaseOutputBuffer(outBufferIndex, false);
-            } else {
-                break;
-            }
-        }
-    }
-
-    private void onSoftEncodedData(byte[] es, long pts, boolean isKeyFrame) {
-        ByteBuffer bb = ByteBuffer.wrap(es);
-        vebi.offset = 0;
-        vebi.size = es.length;
-        vebi.presentationTimeUs = pts;
-        vebi.flags = isKeyFrame ? MediaCodec.BUFFER_FLAG_KEY_FRAME : 0;
-        onEncodedAnnexbFrame(bb, vebi);
-    }
-
-    // when got encoded h264 es stream.
-    private void onEncodedAnnexbFrame(ByteBuffer es, MediaCodec.BufferInfo bi) {
-        mp4Muxer.writeSampleData(videoMp4Track, es.duplicate(), bi);
-        flvMuxer.writeSampleData(videoFlvTrack, es, bi);
-    }
-
-    // when got encoded aac raw stream.
-    private void onEncodedAacFrame(ByteBuffer es, MediaCodec.BufferInfo bi) {
-        mp4Muxer.writeSampleData(audioMp4Track, es.duplicate(), bi);
-        flvMuxer.writeSampleData(audioFlvTrack, es, bi);
-    }
-
-    public void onGetPcmFrame(byte[] data, int size) {
-        ByteBuffer[] inBuffers = aencoder.getInputBuffers();
-        ByteBuffer[] outBuffers = aencoder.getOutputBuffers();
-
-        int inBufferIndex = aencoder.dequeueInputBuffer(-1);
-        if (inBufferIndex >= 0) {
-            ByteBuffer bb = inBuffers[inBufferIndex];
-            bb.clear();
-            bb.put(data, 0, size);
-            long pts = System.nanoTime() / 1000 - mPresentTimeUs;
-            aencoder.queueInputBuffer(inBufferIndex, 0, size, pts, 0);
-        }
-
-        for (; ; ) {
-            int outBufferIndex = aencoder.dequeueOutputBuffer(aebi, 0);
-            if (outBufferIndex >= 0) {
-                ByteBuffer bb = outBuffers[outBufferIndex];
-                onEncodedAacFrame(bb, aebi);
-                aencoder.releaseOutputBuffer(outBufferIndex, false);
-            } else {
-                break;
-            }
-        }
-    }
-
-    public void onGetRgbaFrame(byte[] data, int width, int height) {
-        // Check video frame cache number to judge the networking situation.
-        // Just cache GOP / FPS seconds data according to latency.
-        AtomicInteger videoFrameCacheNumber = flvMuxer.getVideoFrameCacheNumber();
-        if (videoFrameCacheNumber != null && videoFrameCacheNumber.get() < VGOP) {
-            long pts = System.nanoTime() / 1000 - mPresentTimeUs;
-            if (useSoftEncoder) {
-                swRgbaFrame(data, width, height, pts);
-            } else {
-                byte[] processedData = hwRgbaFrame(data, width, height);
-                if (processedData != null) {
-                    onProcessedYuvFrame(processedData, pts);
-                } else {
-                    mHandler.notifyEncodeIllegalArgumentException(new IllegalArgumentException("libyuv failure"));
-                }
-            }
-
-            if (networkWeakTriggered) {
-                networkWeakTriggered = false;
-                mHandler.notifyNetworkResume();
-            }
-        } else {
-            mHandler.notifyNetworkWeak();
-            networkWeakTriggered = true;
-        }
-    }
-
-    private byte[] hwRgbaFrame(byte[] data, int width, int height) {
-        switch (mVideoColorFormat) {
-            case MediaCodecInfo.CodecCapabilities.COLOR_FormatYUV420Planar:
-                return RGBAToI420(data, width, height, true, 180);
-            case MediaCodecInfo.CodecCapabilities.COLOR_FormatYUV420SemiPlanar:
-                return RGBAToNV12(data, width, height, true, 180);
-            default:
-                throw new IllegalStateException("Unsupported color format!");
-        }
-    }
-
-    private void swRgbaFrame(byte[] data, int width, int height, long pts) {
-        RGBASoftEncode(data, width, height, true, 180, pts);
-    }
-
-    public AudioRecord chooseAudioRecord() {
-        AudioRecord mic = new AudioRecord(MediaRecorder.AudioSource.DEFAULT, SrsEncoder.ASAMPLERATE,
-            AudioFormat.CHANNEL_IN_STEREO, AudioFormat.ENCODING_PCM_16BIT, getPcmBufferSize() * 4);
-        if (mic.getState() != AudioRecord.STATE_INITIALIZED) {
-            mic = new AudioRecord(MediaRecorder.AudioSource.DEFAULT, SrsEncoder.ASAMPLERATE,
-                AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, getPcmBufferSize() * 4);
-            if (mic.getState() != AudioRecord.STATE_INITIALIZED) {
-                mic = null;
-            } else {
-                SrsEncoder.aChannelConfig = AudioFormat.CHANNEL_IN_MONO;
-            }
-        } else {
-            SrsEncoder.aChannelConfig = AudioFormat.CHANNEL_IN_STEREO;
-        }
-
-        return mic;
-    }
-
-    private int getPcmBufferSize() {
-        int pcmBufSize = AudioRecord.getMinBufferSize(ASAMPLERATE, AudioFormat.CHANNEL_IN_STEREO,
-            AudioFormat.ENCODING_PCM_16BIT) + 8191;
-        return pcmBufSize - (pcmBufSize % 8192);
-    }
-
-    // choose the video encoder by name.
-    private MediaCodecInfo chooseVideoEncoder(String name) {
-        int nbCodecs = MediaCodecList.getCodecCount();
-        for (int i = 0; i < nbCodecs; i++) {
-            MediaCodecInfo mci = MediaCodecList.getCodecInfoAt(i);
-            if (!mci.isEncoder()) {
-                continue;
-            }
-
-            String[] types = mci.getSupportedTypes();
-            for (int j = 0; j < types.length; j++) {
-                if (types[j].equalsIgnoreCase(VCODEC)) {
-                    Log.i(TAG, String.format("vencoder %s types: %s", mci.getName(), types[j]));
-                    if (name == null) {
-                        return mci;
-                    }
-
-                    if (mci.getName().contains(name)) {
-                        return mci;
-                    }
-                }
-            }
-        }
-
-        return null;
-    }
-
-    // choose the right supported color format. @see below:
-    private int chooseVideoEncoder() {
-        // choose the encoder "video/avc":
-        //      1. select default one when type matched.
-        //      2. google avc is unusable.
-        //      3. choose qcom avc.
-        vmci = chooseVideoEncoder(null);
-        //vmci = chooseVideoEncoder("google");
-        //vmci = chooseVideoEncoder("qcom");
-
-        int matchedColorFormat = 0;
-        MediaCodecInfo.CodecCapabilities cc = vmci.getCapabilitiesForType(VCODEC);
-        for (int i = 0; i < cc.colorFormats.length; i++) {
-            int cf = cc.colorFormats[i];
-            Log.i(TAG, String.format("vencoder %s supports color fomart 0x%x(%d)", vmci.getName(), cf, cf));
-
-            // choose YUV for h.264, prefer the bigger one.
-            // corresponding to the color space transform in onPreviewFrame
-            if (cf >= cc.COLOR_FormatYUV420Planar && cf <= cc.COLOR_FormatYUV420SemiPlanar) {
-                if (cf > matchedColorFormat) {
-                    matchedColorFormat = cf;
-                }
-            }
-        }
-
-        for (int i = 0; i < cc.profileLevels.length; i++) {
-            MediaCodecInfo.CodecProfileLevel pl = cc.profileLevels[i];
-            Log.i(TAG, String.format("vencoder %s support profile %d, level %d", vmci.getName(), pl.profile, pl.level));
-        }
-
-        Log.i(TAG, String.format("vencoder %s choose color format 0x%x(%d)", vmci.getName(), matchedColorFormat, matchedColorFormat));
-        return matchedColorFormat;
-    }
-
-    private native void setEncoderResolution(int outWidth, int outHeight);
-    private native void setEncoderFps(int fps);
-    private native void setEncoderGop(int gop);
-    private native void setEncoderBitrate(int bitrate);
-    private native void setEncoderPreset(String preset);
-    private native byte[] RGBAToI420(byte[] rgbaFrame, int width, int height, boolean flip, int rotate);
-    private native byte[] RGBAToNV12(byte[] rgbaFrame, int width, int height, boolean flip, int rotate);
-    private native int RGBASoftEncode(byte[] rgbaFrame, int width, int height, boolean flip, int rotate, long pts);
-    private native boolean openSoftEncoder();
-    private native void closeSoftEncoder();
-
-    static {
-        System.loadLibrary("yuv");
-        System.loadLibrary("enc");
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsFlvMuxer.java b/android/src/main/java/net/ossrs/yasea/SrsFlvMuxer.java
deleted file mode 100755
index 1508b20..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsFlvMuxer.java
+++ /dev/null
@@ -1,986 +0,0 @@
-package net.ossrs.yasea;
-
-import android.media.MediaCodec;
-import android.media.MediaFormat;
-import android.util.Log;
-
-import com.github.faucamp.simplertmp.DefaultRtmpPublisher;
-import com.github.faucamp.simplertmp.RtmpHandler;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Created by winlin on 5/2/15.
- * Updated by leoma on 4/1/16.
- * to POST the h.264/avc annexb frame over RTMP.
- * @see android.media.MediaMuxer https://developer.android.com/reference/android/media/MediaMuxer.html
- */
-public class SrsFlvMuxer {
-
-    private static final int VIDEO_ALLOC_SIZE = 128 * 1024;
-    private static final int AUDIO_ALLOC_SIZE = 4 * 1024;
-
-    private volatile boolean connected = false;
-    private DefaultRtmpPublisher publisher;
-    private RtmpHandler mHandler;
-
-    private Thread worker;
-    private final Object txFrameLock = new Object();
-
-    private SrsFlv flv = new SrsFlv();
-    private boolean needToFindKeyFrame = true;
-    private SrsFlvFrame mVideoSequenceHeader;
-    private SrsFlvFrame mAudioSequenceHeader;
-    private SrsAllocator mVideoAllocator = new SrsAllocator(VIDEO_ALLOC_SIZE);
-    private SrsAllocator mAudioAllocator = new SrsAllocator(AUDIO_ALLOC_SIZE);
-    private ConcurrentLinkedQueue<SrsFlvFrame> mFlvTagCache = new ConcurrentLinkedQueue<>();
-
-    private static final int VIDEO_TRACK = 100;
-    private static final int AUDIO_TRACK = 101;
-    private static final String TAG = "SrsFlvMuxer";
-
-    /**
-     * constructor.
-     * @param handler the rtmp event handler.
-     */
-    public SrsFlvMuxer(RtmpHandler handler) {
-        mHandler = handler;
-        publisher = new DefaultRtmpPublisher(handler);
-    }
-
-    /**
-     * get cached video frame number in publisher
-     */
-    public AtomicInteger getVideoFrameCacheNumber() {
-        return publisher == null ? null : publisher.getVideoFrameCacheNumber();
-    }
-
-    /**
-     * set video resolution for publisher
-     * @param width width
-     * @param height height
-     */
-    public void setVideoResolution(int width, int height) {
-        if (publisher != null) {
-            publisher.setVideoResolution(width, height);
-        }
-    }
-
-    /**
-     * Adds a track with the specified format.
-     * @param format The media format for the track.
-     * @return The track index for this newly added track.
-     */
-    public int addTrack(MediaFormat format) {
-        if (format.getString(MediaFormat.KEY_MIME).contentEquals(SrsEncoder.VCODEC)) {
-            flv.setVideoTrack(format);
-            return VIDEO_TRACK;
-        } else {
-            flv.setAudioTrack(format);
-            return AUDIO_TRACK;
-        }
-    }
-
-    private void disconnect() {
-        try {
-            publisher.close();
-        } catch (IllegalStateException e) {
-            // Ignore illegal state.
-        }
-        connected = false;
-        mVideoSequenceHeader = null;
-        mAudioSequenceHeader = null;
-        Log.i(TAG, "worker: disconnect ok.");
-    }
-
-    private boolean connect(String url) {
-        if (!connected) {
-            Log.i(TAG, String.format("worker: connecting to RTMP server by url=%s\n", url));
-            if (publisher.connect(url)) {
-                connected = publisher.publish("live");
-            }
-            mVideoSequenceHeader = null;
-            mAudioSequenceHeader = null;
-        }
-        return connected;
-    }
-
-    private void sendFlvTag(SrsFlvFrame frame) {
-        if (!connected || frame == null) {
-            return;
-        }
-
-        if (frame.isVideo()) {
-            if (frame.isKeyFrame()) {
-                Log.i(TAG, String.format("worker: send frame type=%d, dts=%d, size=%dB",
-                    frame.type, frame.dts, frame.flvTag.array().length));
-            }
-            publisher.publishVideoData(frame.flvTag.array(), frame.flvTag.size(), frame.dts);
-            mVideoAllocator.release(frame.flvTag);
-        } else if (frame.isAudio()) {
-            publisher.publishAudioData(frame.flvTag.array(), frame.flvTag.size(), frame.dts);
-            mAudioAllocator.release(frame.flvTag);
-        }
-    }
-
-    /**
-     * start to the remote server for remux.
-     */
-    public void start(final String rtmpUrl) {
-        worker = new Thread(new Runnable() {
-            @Override
-            public void run() {
-                if (!connect(rtmpUrl)) {
-                    return;
-                }
-
-                while (!Thread.interrupted()) {
-                    while (!mFlvTagCache.isEmpty()) {
-                        SrsFlvFrame frame = mFlvTagCache.poll();
-                        if (frame.isSequenceHeader()) {
-                            if (frame.isVideo()) {
-                                mVideoSequenceHeader = frame;
-                                sendFlvTag(mVideoSequenceHeader);
-                            } else if (frame.isAudio()) {
-                                mAudioSequenceHeader = frame;
-                                sendFlvTag(mAudioSequenceHeader);
-                            }
-                        } else {
-                            if (frame.isVideo() && mVideoSequenceHeader != null) {
-                                sendFlvTag(frame);
-                            } else if (frame.isAudio() && mAudioSequenceHeader != null) {
-                                sendFlvTag(frame);
-                            }
-                        }
-                    }
-                    // Waiting for next frame
-                    synchronized (txFrameLock) {
-                        try {
-                            // isEmpty() may take some time, so we set timeout to detect next frame
-                            txFrameLock.wait(500);
-                        } catch (InterruptedException ie) {
-                            worker.interrupt();
-                        }
-                    }
-                }
-            }
-        });
-        worker.start();
-    }
-
-    /**
-     * stop the muxer, disconnect RTMP connection.
-     */
-    public void stop() {
-        mFlvTagCache.clear();
-        if (worker != null) {
-            worker.interrupt();
-            try {
-                worker.join();
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-                worker.interrupt();
-            }
-            worker = null;
-        }
-        flv.reset();
-        needToFindKeyFrame = true;
-        Log.i(TAG, "SrsFlvMuxer closed");
-
-        new Thread(new Runnable() {
-            @Override
-            public void run() {
-                disconnect();
-            }
-        }).start();
-    }
-
-    /**
-     * send the annexb frame over RTMP.
-     * @param trackIndex The track index for this sample.
-     * @param byteBuf The encoded sample.
-     * @param bufferInfo The buffer information related to this sample.
-     */
-    public void writeSampleData(int trackIndex, ByteBuffer byteBuf, MediaCodec.BufferInfo bufferInfo) {
-        if (bufferInfo.offset > 0) {
-            Log.w(TAG, String.format("encoded frame %dB, offset=%d pts=%dms",
-                    bufferInfo.size, bufferInfo.offset, bufferInfo.presentationTimeUs / 1000
-            ));
-        }
-
-        if (VIDEO_TRACK == trackIndex) {
-            flv.writeVideoSample(byteBuf, bufferInfo);
-        } else {
-            flv.writeAudioSample(byteBuf, bufferInfo);
-        }
-    }
-
-    // E.4.3.1 VIDEODATA
-    // Frame Type UB [4]
-    // Type of video frame. The following values are defined:
-    //     1 = key frame (for AVC, a seekable frame)
-    //     2 = inter frame (for AVC, a non-seekable frame)
-    //     3 = disposable inter frame (H.263 only)
-    //     4 = generated key frame (reserved for server use only)
-    //     5 = video info/command frame
-    private class SrsCodecVideoAVCFrame
-    {
-        // set to the zero to reserved, for array map.
-        public final static int Reserved = 0;
-        public final static int Reserved1 = 6;
-
-        public final static int KeyFrame                     = 1;
-        public final static int InterFrame                 = 2;
-        public final static int DisposableInterFrame         = 3;
-        public final static int GeneratedKeyFrame            = 4;
-        public final static int VideoInfoFrame                = 5;
-    }
-
-    // AVCPacketType IF CodecID == 7 UI8
-    // The following values are defined:
-    //     0 = AVC sequence header
-    //     1 = AVC NALU
-    //     2 = AVC end of sequence (lower level NALU sequence ender is
-    //         not required or supported)
-    private class SrsCodecVideoAVCType
-    {
-        // set to the max value to reserved, for array map.
-        public final static int Reserved                    = 3;
-
-        public final static int SequenceHeader                 = 0;
-        public final static int NALU                         = 1;
-        public final static int SequenceHeaderEOF             = 2;
-    }
-
-    /**
-     * E.4.1 FLV Tag, page 75
-     */
-    private class SrsCodecFlvTag
-    {
-        // set to the zero to reserved, for array map.
-        public final static int Reserved = 0;
-
-        // 8 = audio
-        public final static int Audio = 8;
-        // 9 = video
-        public final static int Video = 9;
-        // 18 = script data
-        public final static int Script = 18;
-    };
-
-    // E.4.3.1 VIDEODATA
-    // CodecID UB [4]
-    // Codec Identifier. The following values are defined:
-    //     2 = Sorenson H.263
-    //     3 = Screen video
-    //     4 = On2 VP6
-    //     5 = On2 VP6 with alpha channel
-    //     6 = Screen video version 2
-    //     7 = AVC
-    private class SrsCodecVideo
-    {
-        // set to the zero to reserved, for array map.
-        public final static int Reserved                = 0;
-        public final static int Reserved1                = 1;
-        public final static int Reserved2                = 9;
-
-        // for user to disable video, for example, use pure audio hls.
-        public final static int Disabled                = 8;
-
-        public final static int SorensonH263             = 2;
-        public final static int ScreenVideo             = 3;
-        public final static int On2VP6                 = 4;
-        public final static int On2VP6WithAlphaChannel = 5;
-        public final static int ScreenVideoVersion2     = 6;
-        public final static int AVC                     = 7;
-    }
-
-    /**
-     * the aac object type, for RTMP sequence header
-     * for AudioSpecificConfig, @see aac-mp4a-format-ISO_IEC_14496-3+2001.pdf, page 33
-     * for audioObjectType, @see aac-mp4a-format-ISO_IEC_14496-3+2001.pdf, page 23
-     */
-    private class SrsAacObjectType
-    {
-        public final static int Reserved = 0;
-
-        // Table 1.1 – Audio Object Type definition
-        // @see @see aac-mp4a-format-ISO_IEC_14496-3+2001.pdf, page 23
-        public final static int AacMain = 1;
-        public final static int AacLC = 2;
-        public final static int AacSSR = 3;
-
-        // AAC HE = LC+SBR
-        public final static int AacHE = 5;
-        // AAC HEv2 = LC+SBR+PS
-        public final static int AacHEV2 = 29;
-    }
-
-    /**
-     * the aac profile, for ADTS(HLS/TS)
-     * @see https://github.com/simple-rtmp-server/srs/issues/310
-     */
-    private class SrsAacProfile
-    {
-        public final static int Reserved = 3;
-
-        // @see 7.1 Profiles, aac-iso-13818-7.pdf, page 40
-        public final static int Main = 0;
-        public final static int LC = 1;
-        public final static int SSR = 2;
-    }
-
-    /**
-     * the FLV/RTMP supported audio sample rate.
-     * Sampling rate. The following values are defined:
-     * 0 = 5.5 kHz = 5512 Hz
-     * 1 = 11 kHz = 11025 Hz
-     * 2 = 22 kHz = 22050 Hz
-     * 3 = 44 kHz = 44100 Hz
-     */
-    private class SrsCodecAudioSampleRate
-    {
-        // set to the max value to reserved, for array map.
-        public final static int Reserved                 = 4;
-
-        public final static int R5512                     = 0;
-        public final static int R11025                    = 1;
-        public final static int R22050                    = 2;
-        public final static int R44100                    = 3;
-    }
-
-    /**
-     * Table 7-1 – NAL unit type codes, syntax element categories, and NAL unit type classes
-     * H.264-AVC-ISO_IEC_14496-10-2012.pdf, page 83.
-     */
-    private class SrsAvcNaluType
-    {
-        // Unspecified
-        public final static int Reserved = 0;
-
-        // Coded slice of a non-IDR picture slice_layer_without_partitioning_rbsp( )
-        public final static int NonIDR = 1;
-        // Coded slice data partition A slice_data_partition_a_layer_rbsp( )
-        public final static int DataPartitionA = 2;
-        // Coded slice data partition B slice_data_partition_b_layer_rbsp( )
-        public final static int DataPartitionB = 3;
-        // Coded slice data partition C slice_data_partition_c_layer_rbsp( )
-        public final static int DataPartitionC = 4;
-        // Coded slice of an IDR picture slice_layer_without_partitioning_rbsp( )
-        public final static int IDR = 5;
-        // Supplemental enhancement information (SEI) sei_rbsp( )
-        public final static int SEI = 6;
-        // Sequence parameter set seq_parameter_set_rbsp( )
-        public final static int SPS = 7;
-        // Picture parameter set pic_parameter_set_rbsp( )
-        public final static int PPS = 8;
-        // Access unit delimiter access_unit_delimiter_rbsp( )
-        public final static int AccessUnitDelimiter = 9;
-        // End of sequence end_of_seq_rbsp( )
-        public final static int EOSequence = 10;
-        // End of stream end_of_stream_rbsp( )
-        public final static int EOStream = 11;
-        // Filler data filler_data_rbsp( )
-        public final static int FilterData = 12;
-        // Sequence parameter set extension seq_parameter_set_extension_rbsp( )
-        public final static int SPSExt = 13;
-        // Prefix NAL unit prefix_nal_unit_rbsp( )
-        public final static int PrefixNALU = 14;
-        // Subset sequence parameter set subset_seq_parameter_set_rbsp( )
-        public final static int SubsetSPS = 15;
-        // Coded slice of an auxiliary coded picture without partitioning slice_layer_without_partitioning_rbsp( )
-        public final static int LayerWithoutPartition = 19;
-        // Coded slice extension slice_layer_extension_rbsp( )
-        public final static int CodedSliceExt = 20;
-    }
-
-    /**
-     * the search result for annexb.
-     */
-    private class SrsAnnexbSearch {
-        public int nb_start_code = 0;
-        public boolean match = false;
-    }
-
-    /**
-     * the demuxed tag frame.
-     */
-    private class SrsFlvFrameBytes {
-        public ByteBuffer data;
-        public int size;
-    }
-
-    /**
-     * the muxed flv frame.
-     */
-    private class SrsFlvFrame {
-        // the tag bytes.
-        public SrsAllocator.Allocation flvTag;
-        // the codec type for audio/aac and video/avc for instance.
-        public int avc_aac_type;
-        // the frame type, keyframe or not.
-        public int frame_type;
-        // the tag type, audio, video or data.
-        public int type;
-        // the dts in ms, tbn is 1000.
-        public int dts;
-
-        public boolean isKeyFrame() {
-            return isVideo() && frame_type == SrsCodecVideoAVCFrame.KeyFrame;
-        }
-
-        public boolean isSequenceHeader() {
-            return avc_aac_type == 0;
-        }
-
-        public boolean isVideo() {
-            return type == SrsCodecFlvTag.Video;
-        }
-
-        public boolean isAudio() {
-            return type == SrsCodecFlvTag.Audio;
-        }
-    }
-
-    /**
-     * the raw h.264 stream, in annexb.
-     */
-    private class SrsRawH264Stream {
-        private final static String TAG = "SrsFlvMuxer";
-
-        private SrsAnnexbSearch annexb = new SrsAnnexbSearch();
-        private SrsFlvFrameBytes seq_hdr = new SrsFlvFrameBytes();
-        private SrsFlvFrameBytes sps_hdr = new SrsFlvFrameBytes();
-        private SrsFlvFrameBytes sps_bb = new SrsFlvFrameBytes();
-        private SrsFlvFrameBytes pps_hdr = new SrsFlvFrameBytes();
-        private SrsFlvFrameBytes pps_bb = new SrsFlvFrameBytes();
-
-        public boolean isSps(SrsFlvFrameBytes frame) {
-            return frame.size >= 1 && (frame.data.get(0) & 0x1f) == SrsAvcNaluType.SPS;
-        }
-
-        public boolean isPps(SrsFlvFrameBytes frame) {
-            return frame.size >= 1 && (frame.data.get(0) & 0x1f) == SrsAvcNaluType.PPS;
-        }
-
-        public SrsFlvFrameBytes muxNaluHeader(SrsFlvFrameBytes frame) {
-            SrsFlvFrameBytes nalu_hdr = new SrsFlvFrameBytes();
-            nalu_hdr.data = ByteBuffer.allocateDirect(4);
-            nalu_hdr.size = 4;
-            // 5.3.4.2.1 Syntax, H.264-AVC-ISO_IEC_14496-15.pdf, page 16
-            // lengthSizeMinusOne, or NAL_unit_length, always use 4bytes size
-            int NAL_unit_length = frame.size;
-
-            // mux the avc NALU in "ISO Base Media File Format"
-            // from H.264-AVC-ISO_IEC_14496-15.pdf, page 20
-            // NALUnitLength
-            nalu_hdr.data.putInt(NAL_unit_length);
-
-            // reset the buffer.
-            nalu_hdr.data.rewind();
-            return nalu_hdr;
-        }
-
-        public void muxSequenceHeader(ByteBuffer sps, ByteBuffer pps, int dts, int pts,
-                                        ArrayList<SrsFlvFrameBytes> frames) {
-            // 5bytes sps/pps header:
-            //      configurationVersion, AVCProfileIndication, profile_compatibility,
-            //      AVCLevelIndication, lengthSizeMinusOne
-            // 3bytes size of sps:
-            //      numOfSequenceParameterSets, sequenceParameterSetLength(2B)
-            // Nbytes of sps.
-            //      sequenceParameterSetNALUnit
-            // 3bytes size of pps:
-            //      numOfPictureParameterSets, pictureParameterSetLength
-            // Nbytes of pps:
-            //      pictureParameterSetNALUnit
-
-            // decode the SPS:
-            // @see: 7.3.2.1.1, H.264-AVC-ISO_IEC_14496-10-2012.pdf, page 62
-            if (seq_hdr.data == null) {
-                seq_hdr.data = ByteBuffer.allocate(5);
-                seq_hdr.size = 5;
-            }
-            seq_hdr.data.rewind();
-            // @see: Annex A Profiles and levels, H.264-AVC-ISO_IEC_14496-10.pdf, page 205
-            //      Baseline profile profile_idc is 66(0x42).
-            //      Main profile profile_idc is 77(0x4d).
-            //      Extended profile profile_idc is 88(0x58).
-            byte profile_idc = sps.get(1);
-            //u_int8_t constraint_set = frame[2];
-            byte level_idc = sps.get(3);
-
-            // generate the sps/pps header
-            // 5.3.4.2.1 Syntax, H.264-AVC-ISO_IEC_14496-15.pdf, page 16
-            // configurationVersion
-            seq_hdr.data.put((byte) 0x01);
-            // AVCProfileIndication
-            seq_hdr.data.put(profile_idc);
-            // profile_compatibility
-            seq_hdr.data.put((byte) 0x00);
-            // AVCLevelIndication
-            seq_hdr.data.put(level_idc);
-            // lengthSizeMinusOne, or NAL_unit_length, always use 4bytes size,
-            // so we always set it to 0x03.
-            seq_hdr.data.put((byte) 0x03);
-
-            // reset the buffer.
-            seq_hdr.data.rewind();
-            frames.add(seq_hdr);
-
-            // sps
-            if (sps_hdr.data == null) {
-                sps_hdr.data = ByteBuffer.allocate(3);
-                sps_hdr.size = 3;
-            }
-            sps_hdr.data.rewind();
-            // 5.3.4.2.1 Syntax, H.264-AVC-ISO_IEC_14496-15.pdf, page 16
-            // numOfSequenceParameterSets, always 1
-            sps_hdr.data.put((byte) 0x01);
-            // sequenceParameterSetLength
-            sps_hdr.data.putShort((short) sps.array().length);
-
-            sps_hdr.data.rewind();
-            frames.add(sps_hdr);
-
-            // sequenceParameterSetNALUnit
-            sps_bb.size = sps.array().length;
-            sps_bb.data = sps.duplicate();
-            frames.add(sps_bb);
-
-            // pps
-            if (pps_hdr.data == null) {
-                pps_hdr.data = ByteBuffer.allocate(3);
-                pps_hdr.size = 3;
-            }
-            pps_hdr.data.rewind();
-            // 5.3.4.2.1 Syntax, H.264-AVC-ISO_IEC_14496-15.pdf, page 16
-            // numOfPictureParameterSets, always 1
-            pps_hdr.data.put((byte) 0x01);
-            // pictureParameterSetLength
-            pps_hdr.data.putShort((short) pps.array().length);
-
-            pps_hdr.data.rewind();
-            frames.add(pps_hdr);
-
-            // pictureParameterSetNALUnit
-            pps_bb.size = pps.array().length;
-            pps_bb.data = pps.duplicate();
-            frames.add(pps_bb);
-        }
-
-        public SrsAllocator.Allocation muxFlvTag(ArrayList<SrsFlvFrameBytes> frames, int frame_type,
-                                                 int avc_packet_type, int dts, int pts) {
-            // for h264 in RTMP video payload, there is 5bytes header:
-            //      1bytes, FrameType | CodecID
-            //      1bytes, AVCPacketType
-            //      3bytes, CompositionTime, the cts.
-            // @see: E.4.3 Video Tags, video_file_format_spec_v10_1.pdf, page 78
-            int size = 5;
-            for (int i = 0; i < frames.size(); i++) {
-                size += frames.get(i).size;
-            }
-            SrsAllocator.Allocation allocation = mVideoAllocator.allocate(size);
-
-            // @see: E.4.3 Video Tags, video_file_format_spec_v10_1.pdf, page 78
-            // Frame Type, Type of video frame.
-            // CodecID, Codec Identifier.
-            // set the rtmp header
-            allocation.put((byte) ((frame_type << 4) | SrsCodecVideo.AVC));
-
-            // AVCPacketType
-            allocation.put((byte)avc_packet_type);
-
-            // CompositionTime
-            // pts = dts + cts, or
-            // cts = pts - dts.
-            // where cts is the header in rtmp video packet payload header.
-            int cts = pts - dts;
-            allocation.put((byte)(cts >> 16));
-            allocation.put((byte)(cts >> 8));
-            allocation.put((byte)cts);
-
-            // h.264 raw data.
-            for (int i = 0; i < frames.size(); i++) {
-                SrsFlvFrameBytes frame = frames.get(i);
-                frame.data.get(allocation.array(), allocation.size(), frame.size);
-                allocation.appendOffset(frame.size);
-            }
-
-            return allocation;
-        }
-
-        private SrsAnnexbSearch searchAnnexb(ByteBuffer bb, MediaCodec.BufferInfo bi) {
-            annexb.match = false;
-            annexb.nb_start_code = 0;
-
-            for (int i = bb.position(); i < bi.size - 3; i++) {
-                // not match.
-                if (bb.get(i) != 0x00 || bb.get(i + 1) != 0x00) {
-                    break;
-                }
-
-                // match N[00] 00 00 01, where N>=0
-                if (bb.get(i + 2) == 0x01) {
-                    annexb.match = true;
-                    annexb.nb_start_code = i + 3 - bb.position();
-                    break;
-                }
-            }
-
-            return annexb;
-        }
-
-        public SrsFlvFrameBytes demuxAnnexb(ByteBuffer bb, MediaCodec.BufferInfo bi) {
-            SrsFlvFrameBytes tbb = new SrsFlvFrameBytes();
-
-            while (bb.position() < bi.size) {
-                // each frame must prefixed by annexb format.
-                // about annexb, @see H.264-AVC-ISO_IEC_14496-10.pdf, page 211.
-                SrsAnnexbSearch tbbsc = searchAnnexb(bb, bi);
-                if (!tbbsc.match || tbbsc.nb_start_code < 3) {
-                    Log.e(TAG, "annexb not match.");
-                    mHandler.notifyRtmpIllegalArgumentException(new IllegalArgumentException(
-                        String.format("annexb not match for %dB, pos=%d", bi.size, bb.position())));
-                }
-
-                // the start codes.
-                for (int i = 0; i < tbbsc.nb_start_code; i++) {
-                    bb.get();
-                }
-
-                // find out the frame size.
-                tbb.data = bb.slice();
-                int pos = bb.position();
-                while (bb.position() < bi.size) {
-                    SrsAnnexbSearch bsc = searchAnnexb(bb, bi);
-                    if (bsc.match) {
-                        break;
-                    }
-                    bb.get();
-                }
-
-                tbb.size = bb.position() - pos;
-                break;
-            }
-
-            return tbb;
-        }
-    }
-
-    private class SrsRawAacStreamCodec {
-        public byte protection_absent;
-        // SrsAacObjectType
-        public int aac_object;
-        public byte sampling_frequency_index;
-        public byte channel_configuration;
-        public short frame_length;
-
-        public byte sound_format;
-        public byte sound_rate;
-        public byte sound_size;
-        public byte sound_type;
-        // 0 for sh; 1 for raw data.
-        public byte aac_packet_type;
-
-        public byte[] frame;
-    }
-
-    /**
-     * remux the annexb to flv tags.
-     */
-    private class SrsFlv {
-        private MediaFormat videoTrack;
-        private MediaFormat audioTrack;
-        private int achannel;
-        private int asample_rate;
-        private SrsRawH264Stream avc = new SrsRawH264Stream();
-        private ArrayList<SrsFlvFrameBytes> ipbs = new ArrayList<>();
-        private SrsAllocator.Allocation audio_tag;
-        private SrsAllocator.Allocation video_tag;
-        private ByteBuffer h264_sps;
-        private boolean h264_sps_changed;
-        private ByteBuffer h264_pps;
-        private boolean h264_pps_changed;
-        private boolean h264_sps_pps_sent;
-        private boolean aac_specific_config_got;
-
-        public SrsFlv() {
-            reset();
-        }
-
-        public void reset() {
-            h264_sps_changed = false;
-            h264_pps_changed = false;
-            h264_sps_pps_sent = false;
-            aac_specific_config_got = false;
-        }
-
-        public void setVideoTrack(MediaFormat format) {
-            videoTrack = format;
-        }
-
-        public void setAudioTrack(MediaFormat format) {
-            audioTrack = format;
-            achannel = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT);
-            asample_rate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE);
-        }
-
-        public void writeAudioSample(final ByteBuffer bb, MediaCodec.BufferInfo bi) {
-            int pts = (int)(bi.presentationTimeUs / 1000);
-            int dts = pts;
-
-            audio_tag = mAudioAllocator.allocate(bi.size + 2);
-            byte aac_packet_type = 1; // 1 = AAC raw
-            if (!aac_specific_config_got) {
-                // @see aac-mp4a-format-ISO_IEC_14496-3+2001.pdf
-                // AudioSpecificConfig (), page 33
-                // 1.6.2.1 AudioSpecificConfig
-                // audioObjectType; 5 bslbf
-                byte ch = (byte)(bb.get(0) & 0xf8);
-                // 3bits left.
-
-                // samplingFrequencyIndex; 4 bslbf
-                byte samplingFrequencyIndex = 0x04;
-                if (asample_rate == SrsCodecAudioSampleRate.R22050) {
-                    samplingFrequencyIndex = 0x07;
-                } else if (asample_rate == SrsCodecAudioSampleRate.R11025) {
-                    samplingFrequencyIndex = 0x0a;
-                }
-                ch |= (samplingFrequencyIndex >> 1) & 0x07;
-                audio_tag.put(ch, 2);
-
-                ch = (byte)((samplingFrequencyIndex << 7) & 0x80);
-                // 7bits left.
-
-                // channelConfiguration; 4 bslbf
-                byte channelConfiguration = 1;
-                if (achannel == 2) {
-                    channelConfiguration = 2;
-                }
-                ch |= (channelConfiguration << 3) & 0x78;
-                // 3bits left.
-
-                // GASpecificConfig(), page 451
-                // 4.4.1 Decoder configuration (GASpecificConfig)
-                // frameLengthFlag; 1 bslbf
-                // dependsOnCoreCoder; 1 bslbf
-                // extensionFlag; 1 bslbf
-                audio_tag.put(ch, 3);
-
-                aac_specific_config_got = true;
-                aac_packet_type = 0; // 0 = AAC sequence header
-
-                writeAdtsHeader(audio_tag.array(), 4);
-                audio_tag.appendOffset(7);
-            } else {
-                bb.get(audio_tag.array(), 2, bi.size);
-                audio_tag.appendOffset(bi.size + 2);
-            }
-
-            byte sound_format = 10; // AAC
-            byte sound_type = 0; // 0 = Mono sound
-            if (achannel == 2) {
-                sound_type = 1; // 1 = Stereo sound
-            }
-            byte sound_size = 1; // 1 = 16-bit samples
-            byte sound_rate = 3; // 44100, 22050, 11025
-            if (asample_rate == 22050) {
-                sound_rate = 2;
-            } else if (asample_rate == 11025) {
-                sound_rate = 1;
-            }
-
-            // for audio frame, there is 1 or 2 bytes header:
-            //      1bytes, SoundFormat|SoundRate|SoundSize|SoundType
-            //      1bytes, AACPacketType for SoundFormat == 10, 0 is sequence header.
-            byte audio_header = (byte) (sound_type & 0x01);
-            audio_header |= (sound_size << 1) & 0x02;
-            audio_header |= (sound_rate << 2) & 0x0c;
-            audio_header |= (sound_format << 4) & 0xf0;
-
-            audio_tag.put(audio_header, 0);
-            audio_tag.put(aac_packet_type, 1);
-
-            writeRtmpPacket(SrsCodecFlvTag.Audio, dts, 0, aac_packet_type, audio_tag);
-        }
-
-        private void writeAdtsHeader(byte[] frame, int offset) {
-            // adts sync word 0xfff (12-bit)
-            frame[offset] = (byte) 0xff;
-            frame[offset + 1] = (byte) 0xf0;
-            // versioin 0 for MPEG-4, 1 for MPEG-2 (1-bit)
-            frame[offset + 1] |= 0 << 3;
-            // layer 0 (2-bit)
-            frame[offset + 1] |= 0 << 1;
-            // protection absent: 1 (1-bit)
-            frame[offset + 1] |= 1;
-            // profile: audio_object_type - 1 (2-bit)
-            frame[offset + 2] = (SrsAacObjectType.AacLC - 1) << 6;
-            // sampling frequency index: 4 (4-bit)
-            frame[offset + 2] |= (4 & 0xf) << 2;
-            // channel configuration (3-bit)
-            frame[offset + 2] |= (2 & (byte) 0x4) >> 2;
-            frame[offset + 3] = (byte) ((2 & (byte) 0x03) << 6);
-            // original: 0 (1-bit)
-            frame[offset + 3] |= 0 << 5;
-            // home: 0 (1-bit)
-            frame[offset + 3] |= 0 << 4;
-            // copyright id bit: 0 (1-bit)
-            frame[offset + 3] |= 0 << 3;
-            // copyright id start: 0 (1-bit)
-            frame[offset + 3] |= 0 << 2;
-            // frame size (13-bit)
-            frame[offset + 3] |= ((frame.length - 2) & 0x1800) >> 11;
-            frame[offset + 4] = (byte) (((frame.length - 2) & 0x7f8) >> 3);
-            frame[offset + 5] = (byte) (((frame.length - 2) & 0x7) << 5);
-            // buffer fullness (0x7ff for variable bitrate)
-            frame[offset + 5] |= (byte) 0x1f;
-            frame[offset + 6] = (byte) 0xfc;
-            // number of data block (nb - 1)
-            frame[offset + 6] |= 0x0;
-        }
-
-        public void writeVideoSample(final ByteBuffer bb, MediaCodec.BufferInfo bi) {
-            int pts = (int) (bi.presentationTimeUs / 1000);
-            int dts = pts;
-
-            int type = SrsCodecVideoAVCFrame.InterFrame;
-
-            // send each frame.
-            while (bb.position() < bi.size) {
-                SrsFlvFrameBytes frame = avc.demuxAnnexb(bb, bi);
-
-                // 5bits, 7.3.1 NAL unit syntax,
-                // H.264-AVC-ISO_IEC_14496-10.pdf, page 44.
-                // 7: SPS, 8: PPS, 5: I Frame, 1: P Frame
-                int nal_unit_type = (int)(frame.data.get(0) & 0x1f);
-                if (nal_unit_type == SrsAvcNaluType.SPS || nal_unit_type == SrsAvcNaluType.PPS) {
-                    Log.i(TAG, String.format("annexb demux %dB, pts=%d, frame=%dB, nalu=%d",
-                        bi.size, pts, frame.size, nal_unit_type));
-                }
-
-                // for IDR frame, the frame is keyframe.
-                if (nal_unit_type == SrsAvcNaluType.IDR) {
-                    type = SrsCodecVideoAVCFrame.KeyFrame;
-                }
-
-                // ignore the nalu type aud(9)
-                if (nal_unit_type == SrsAvcNaluType.AccessUnitDelimiter) {
-                    continue;
-                }
-
-                // for sps
-                if (avc.isSps(frame)) {
-                    if (!frame.data.equals(h264_sps)) {
-                        byte[] sps = new byte[frame.size];
-                        frame.data.get(sps);
-                        h264_sps_changed = true;
-                        h264_sps = ByteBuffer.wrap(sps);
-                    }
-                    continue;
-                }
-
-                // for pps
-                if (avc.isPps(frame)) {
-                    if (!frame.data.equals(h264_pps)) {
-                        byte[] pps = new byte[frame.size];
-                        frame.data.get(pps);
-                        h264_pps_changed = true;
-                        h264_pps = ByteBuffer.wrap(pps);
-                    }
-                    continue;
-                }
-
-                // IPB frame.
-                ipbs.add(avc.muxNaluHeader(frame));
-                ipbs.add(frame);
-            }
-
-            writeH264SpsPps(dts, pts);
-            writeH264IpbFrame(ipbs, type, dts, pts);
-            ipbs.clear();
-        }
-
-        private void writeH264SpsPps(int dts, int pts) {
-            // when sps or pps changed, update the sequence header,
-            // for the pps maybe not changed while sps changed.
-            // so, we must check when each video ts message frame parsed.
-            if (h264_sps_pps_sent && !h264_sps_changed && !h264_pps_changed) {
-                return;
-            }
-
-            // when not got sps/pps, wait.
-            if (h264_pps == null || h264_sps == null) {
-                return;
-            }
-
-            // h264 raw to h264 packet.
-            ArrayList<SrsFlvFrameBytes> frames = new ArrayList<>();
-            avc.muxSequenceHeader(h264_sps, h264_pps, dts, pts, frames);
-
-            // h264 packet to flv packet.
-            int frame_type = SrsCodecVideoAVCFrame.KeyFrame;
-            int avc_packet_type = SrsCodecVideoAVCType.SequenceHeader;
-            video_tag = avc.muxFlvTag(frames, frame_type, avc_packet_type, dts, pts);
-
-            // the timestamp in rtmp message header is dts.
-            writeRtmpPacket(SrsCodecFlvTag.Video, dts, frame_type, avc_packet_type, video_tag);
-
-            // reset sps and pps.
-            h264_sps_changed = false;
-            h264_pps_changed = false;
-            h264_sps_pps_sent = true;
-            Log.i(TAG, String.format("flv: h264 sps/pps sent, sps=%dB, pps=%dB",
-                h264_sps.array().length, h264_pps.array().length));
-        }
-
-        private void writeH264IpbFrame(ArrayList<SrsFlvFrameBytes> frames, int type, int dts, int pts) {
-            // when sps or pps not sent, ignore the packet.
-            // @see https://github.com/simple-rtmp-server/srs/issues/203
-            if (!h264_sps_pps_sent) {
-                return;
-            }
-
-            video_tag = avc.muxFlvTag(frames, type, SrsCodecVideoAVCType.NALU, dts, pts);
-
-            // the timestamp in rtmp message header is dts.
-            writeRtmpPacket(SrsCodecFlvTag.Video, dts, type, SrsCodecVideoAVCType.NALU, video_tag);
-        }
-
-        private void writeRtmpPacket(int type, int dts, int frame_type, int avc_aac_type, SrsAllocator.Allocation tag) {
-            SrsFlvFrame frame = new SrsFlvFrame();
-            frame.flvTag = tag;
-            frame.type = type;
-            frame.dts = dts;
-            frame.frame_type = frame_type;
-            frame.avc_aac_type = avc_aac_type;
-
-            if (frame.isVideo()) {
-                if (needToFindKeyFrame) {
-                    if (frame.isKeyFrame()) {
-                        needToFindKeyFrame = false;
-                        flvTagCacheAdd(frame);
-                    }
-                } else {
-                    flvTagCacheAdd(frame);
-                }
-            } else if (frame.isAudio()) {
-                flvTagCacheAdd(frame);
-            }
-        }
-
-        private void flvTagCacheAdd(SrsFlvFrame frame) {
-            mFlvTagCache.add(frame);
-            if (frame.isVideo()) {
-                getVideoFrameCacheNumber().incrementAndGet();
-            }
-            synchronized (txFrameLock) {
-                txFrameLock.notifyAll();
-            }
-        }
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsMp4Muxer.java b/android/src/main/java/net/ossrs/yasea/SrsMp4Muxer.java
deleted file mode 100755
index ca4675d..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsMp4Muxer.java
+++ /dev/null
@@ -1,1086 +0,0 @@
-package net.ossrs.yasea;
-
-import android.media.MediaCodec;
-import android.media.MediaFormat;
-import android.util.Log;
-
-import com.coremedia.iso.BoxParser;
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.IsoTypeWriter;
-import com.coremedia.iso.boxes.AbstractMediaHeaderBox;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.coremedia.iso.boxes.DataEntryUrlBox;
-import com.coremedia.iso.boxes.DataInformationBox;
-import com.coremedia.iso.boxes.DataReferenceBox;
-import com.coremedia.iso.boxes.FileTypeBox;
-import com.coremedia.iso.boxes.HandlerBox;
-import com.coremedia.iso.boxes.MediaBox;
-import com.coremedia.iso.boxes.MediaHeaderBox;
-import com.coremedia.iso.boxes.MediaInformationBox;
-import com.coremedia.iso.boxes.MovieBox;
-import com.coremedia.iso.boxes.MovieHeaderBox;
-import com.coremedia.iso.boxes.SampleDescriptionBox;
-import com.coremedia.iso.boxes.SampleSizeBox;
-import com.coremedia.iso.boxes.SampleTableBox;
-import com.coremedia.iso.boxes.SampleToChunkBox;
-import com.coremedia.iso.boxes.SoundMediaHeaderBox;
-import com.coremedia.iso.boxes.StaticChunkOffsetBox;
-import com.coremedia.iso.boxes.SyncSampleBox;
-import com.coremedia.iso.boxes.TimeToSampleBox;
-import com.coremedia.iso.boxes.TrackBox;
-import com.coremedia.iso.boxes.TrackHeaderBox;
-import com.coremedia.iso.boxes.VideoMediaHeaderBox;
-import com.coremedia.iso.boxes.h264.AvcConfigurationBox;
-import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
-import com.coremedia.iso.boxes.sampleentry.VisualSampleEntry;
-import com.googlecode.mp4parser.boxes.mp4.ESDescriptorBox;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.AudioSpecificConfig;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.DecoderConfigDescriptor;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.ESDescriptor;
-import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.SLConfigDescriptor;
-import com.googlecode.mp4parser.util.Math;
-import com.googlecode.mp4parser.util.Matrix;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentLinkedQueue;
-
-/**
- * Created by LeoMa on 2016/5/21.
- */
-public class SrsMp4Muxer {
-
-    private static final String TAG = "SrsMp4Muxer";
-    private static final int VIDEO_TRACK = 100;
-    private static final int AUDIO_TRACK = 101;
-
-    private File mRecFile;
-    private SrsRecordHandler mHandler;
-
-    private MediaFormat videoFormat = null;
-    private MediaFormat audioFormat = null;
-
-    private SrsRawH264Stream avc = new SrsRawH264Stream();
-    private Mp4Movie mp4Movie = new Mp4Movie();
-
-    private boolean aacSpecConfig = false;
-    private ByteBuffer h264_sps = null;
-    private ByteBuffer h264_pps = null;
-    private ArrayList<byte[]> spsList = new ArrayList<>();
-    private ArrayList<byte[]> ppsList = new ArrayList<>();
-
-    private Thread worker;
-    private volatile boolean bRecording = false;
-    private volatile boolean bPaused = false;
-    private volatile boolean needToFindKeyFrame = true;
-    private final Object writeLock = new Object();
-    private ConcurrentLinkedQueue<SrsEsFrame> frameCache = new ConcurrentLinkedQueue<>();
-
-    private static Map<Integer, Integer> samplingFrequencyIndexMap = new HashMap<>();
-
-    static {
-        samplingFrequencyIndexMap.put(96000, 0x0);
-        samplingFrequencyIndexMap.put(88200, 0x1);
-        samplingFrequencyIndexMap.put(64000, 0x2);
-        samplingFrequencyIndexMap.put(48000, 0x3);
-        samplingFrequencyIndexMap.put(44100, 0x4);
-        samplingFrequencyIndexMap.put(32000, 0x5);
-        samplingFrequencyIndexMap.put(24000, 0x6);
-        samplingFrequencyIndexMap.put(22050, 0x7);
-        samplingFrequencyIndexMap.put(16000, 0x8);
-        samplingFrequencyIndexMap.put(12000, 0x9);
-        samplingFrequencyIndexMap.put(11025, 0xa);
-        samplingFrequencyIndexMap.put(8000, 0xb);
-    }
-
-    public SrsMp4Muxer(SrsRecordHandler handler) {
-        mHandler = handler;
-    }
-    
-    /**
-     * start recording.
-     */
-    public boolean record(File outputFile) {
-        if (videoFormat == null && audioFormat == null) {
-            return false;
-        }
-
-        mRecFile = outputFile;
-        createMovie(mRecFile);
-        mHandler.notifyRecordStarted(mRecFile.getPath());
-
-        if (!spsList.isEmpty() && !ppsList.isEmpty()) {
-            mp4Movie.addTrack(videoFormat, false);
-        }
-        mp4Movie.addTrack(audioFormat, true);
-
-        worker = new Thread(new Runnable() {
-            @Override
-            public void run() {
-                bRecording = true;
-                while (bRecording) {
-                    // Keep at least one audio and video frame in cache to ensure monotonically increasing.
-                    while (!frameCache.isEmpty()) {
-                        SrsEsFrame frame = frameCache.poll();
-                        writeSampleData(frame.bb, frame.bi, frame.is_audio());
-                    }
-                    // Waiting for next frame
-                    synchronized (writeLock) {
-                        try {
-                            // isEmpty() may take some time, so we set timeout to detect next frame
-                            writeLock.wait(500);
-                        } catch (InterruptedException ie) {
-                            worker.interrupt();
-                        }
-                    }
-                }
-            }
-        });
-        worker.start();
-
-        return true;
-    }
-
-    /**
-     * pause recording.
-     */
-    public void pause() {
-        if (bRecording) {
-            bPaused = true;
-            mHandler.notifyRecordPause();
-        }
-    }
-
-    /**
-     * resume recording.
-     */
-    public void resume() {
-        if (bRecording) {
-            bPaused = false;
-            needToFindKeyFrame = true;
-            mHandler.notifyRecordResume();
-        }
-    }
-
-    /**
-     * finish recording.
-     */
-    public void stop() {
-        bRecording = false;
-        bPaused = false;
-        needToFindKeyFrame = true;
-        aacSpecConfig = false;
-        frameCache.clear();
-
-        if (worker != null) {
-            try {
-                worker.join();
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-                worker.interrupt();
-            }
-            worker = null;
-
-            finishMovie();
-            mHandler.notifyRecordFinished(mRecFile.getPath());
-        }
-        Log.i(TAG, "SrsMp4Muxer closed");
-    }
-
-    /**
-     * Adds a track with the specified format.
-     *
-     * @param format The media format for the track.
-     * @return The track index for this newly added track.
-     */
-    public int addTrack(MediaFormat format) {
-        if (format.getString(MediaFormat.KEY_MIME).contentEquals(SrsEncoder.VCODEC)) {
-            videoFormat = format;
-            return VIDEO_TRACK;
-        } else {
-            audioFormat = format;
-            return AUDIO_TRACK;
-        }
-    }
-
-    /**
-     * send the annexb frame to SRS over RTMP.
-     *
-     * @param trackIndex The track index for this sample.
-     * @param byteBuf    The encoded sample.
-     * @param bufferInfo The buffer information related to this sample.
-     */
-    public void writeSampleData(int trackIndex, ByteBuffer byteBuf, MediaCodec.BufferInfo bufferInfo) {
-        if (VIDEO_TRACK == trackIndex) {
-            writeVideoSample(byteBuf, bufferInfo);
-        } else {
-            writeAudioSample(byteBuf, bufferInfo);
-        }
-    }
-
-    /**
-     * Table 7-1 – NAL unit type codes, syntax element categories, and NAL unit type classes
-     * H.264-AVC-ISO_IEC_14496-10-2012.pdf, page 83.
-     */
-    private class SrsAvcNaluType
-    {
-        // Unspecified
-        public final static int Reserved = 0;
-
-        // Coded slice of a non-IDR picture slice_layer_without_partitioning_rbsp( )
-        public final static int NonIDR = 1;
-        // Coded slice data partition A slice_data_partition_a_layer_rbsp( )
-        public final static int DataPartitionA = 2;
-        // Coded slice data partition B slice_data_partition_b_layer_rbsp( )
-        public final static int DataPartitionB = 3;
-        // Coded slice data partition C slice_data_partition_c_layer_rbsp( )
-        public final static int DataPartitionC = 4;
-        // Coded slice of an IDR picture slice_layer_without_partitioning_rbsp( )
-        public final static int IDR = 5;
-        // Supplemental enhancement information (SEI) sei_rbsp( )
-        public final static int SEI = 6;
-        // Sequence parameter set seq_parameter_set_rbsp( )
-        public final static int SPS = 7;
-        // Picture parameter set pic_parameter_set_rbsp( )
-        public final static int PPS = 8;
-        // Access unit delimiter access_unit_delimiter_rbsp( )
-        public final static int AccessUnitDelimiter = 9;
-        // End of sequence end_of_seq_rbsp( )
-        public final static int EOSequence = 10;
-        // End of stream end_of_stream_rbsp( )
-        public final static int EOStream = 11;
-        // Filler data filler_data_rbsp( )
-        public final static int FilterData = 12;
-        // Sequence parameter set extension seq_parameter_set_extension_rbsp( )
-        public final static int SPSExt = 13;
-        // Prefix NAL unit prefix_nal_unit_rbsp( )
-        public final static int PrefixNALU = 14;
-        // Subset sequence parameter set subset_seq_parameter_set_rbsp( )
-        public final static int SubsetSPS = 15;
-        // Coded slice of an auxiliary coded picture without partitioning slice_layer_without_partitioning_rbsp( )
-        public final static int LayerWithoutPartition = 19;
-        // Coded slice extension slice_layer_extension_rbsp( )
-        public final static int CodedSliceExt = 20;
-    }
-
-    private void writeVideoSample(final ByteBuffer bb, MediaCodec.BufferInfo bi) {
-        int nal_unit_type = bb.get(4) & 0x1f;
-        if (nal_unit_type == SrsAvcNaluType.IDR || nal_unit_type == SrsAvcNaluType.NonIDR) {
-            writeFrameByte(VIDEO_TRACK, bb, bi, nal_unit_type == SrsAvcNaluType.IDR);
-        } else {
-            while (bb.position() < bi.size) {
-                SrsEsFrameBytes frame = avc.annexb_demux(bb, bi);
-
-                if (avc.is_sps(frame)) {
-                    if (!frame.data.equals(h264_sps)) {
-                        byte[] sps = new byte[frame.size];
-                        frame.data.get(sps);
-                        h264_sps = ByteBuffer.wrap(sps);
-                        spsList.clear();
-                        spsList.add(sps);
-                    }
-                    continue;
-                }
-
-                if (avc.is_pps(frame)) {
-                    if (!frame.data.equals(h264_pps)) {
-                        byte[] pps = new byte[frame.size];
-                        frame.data.get(pps);
-                        h264_pps = ByteBuffer.wrap(pps);
-                        ppsList.clear();
-                        ppsList.add(pps);
-                    }
-                    continue;
-                }
-            }
-        }
-    }
-
-    private void writeAudioSample(final ByteBuffer bb, MediaCodec.BufferInfo bi) {
-        if (!aacSpecConfig) {
-            aacSpecConfig = true;
-        } else {
-            writeFrameByte(AUDIO_TRACK, bb, bi, false);
-        }
-    }
-
-    private void writeFrameByte(int track, ByteBuffer bb, MediaCodec.BufferInfo bi, boolean isKeyFrame) {
-        SrsEsFrame frame = new SrsEsFrame();
-        frame.bb = bb;
-        frame.bi = bi;
-        frame.isKeyFrame = isKeyFrame;
-        frame.track = track;
-
-        if (bRecording && !bPaused) {
-            if (needToFindKeyFrame) {
-                if (frame.isKeyFrame) {
-                    needToFindKeyFrame = false;
-                    frameCache.add(frame);
-                    synchronized (writeLock) {
-                        writeLock.notifyAll();
-                    }
-                }
-            } else {
-                frameCache.add(frame);
-                synchronized (writeLock) {
-                    writeLock.notifyAll();
-                }
-            }
-        }
-    }
-
-    /**
-     * the search result for annexb.
-     */
-    private class SrsAnnexbSearch {
-        public int nb_start_code = 0;
-        public boolean match = false;
-    }
-
-    /**
-     * the demuxed tag frame.
-     */
-    private class SrsEsFrameBytes {
-        public ByteBuffer data;
-        public int size;
-    }
-
-    /**
-     * the AV frame.
-     */
-    private class SrsEsFrame {
-        public ByteBuffer bb;
-        public MediaCodec.BufferInfo bi;
-        public int track;
-        public boolean isKeyFrame;
-
-        public boolean is_video() {
-            return track == VIDEO_TRACK;
-        }
-
-        public boolean is_audio() {
-            return track == AUDIO_TRACK;
-        }
-    }
-
-    /**
-     * the raw h.264 stream, in annexb.
-     */
-    private class SrsRawH264Stream {
-        public boolean is_sps(SrsEsFrameBytes frame) {
-            if (frame.size < 1) {
-                return false;
-            }
-
-            return (frame.data.get(0) & 0x1f) == SrsAvcNaluType.SPS;
-        }
-
-        public boolean is_pps(SrsEsFrameBytes frame) {
-            if (frame.size < 1) {
-                return false;
-            }
-            return (frame.data.get(0) & 0x1f) == SrsAvcNaluType.PPS;
-        }
-
-        public SrsAnnexbSearch srs_avc_startswith_annexb(ByteBuffer bb, MediaCodec.BufferInfo bi) {
-            SrsAnnexbSearch as = new SrsAnnexbSearch();
-            as.match = false;
-
-            int pos = bb.position();
-            while (pos < bi.size - 3) {
-                // not match.
-                if (bb.get(pos) != 0x00 || bb.get(pos + 1) != 0x00) {
-                    break;
-                }
-
-                // match N[00] 00 00 01, where N>=0
-                if (bb.get(pos + 2) == 0x01) {
-                    as.match = true;
-                    as.nb_start_code = pos + 3 - bb.position();
-                    break;
-                }
-
-                pos++;
-            }
-
-            return as;
-        }
-
-        public SrsEsFrameBytes annexb_demux(ByteBuffer bb, MediaCodec.BufferInfo bi) {
-            SrsEsFrameBytes tbb = new SrsEsFrameBytes();
-
-            while (bb.position() < bi.size) {
-                // each frame must prefixed by annexb format.
-                // about annexb, @see H.264-AVC-ISO_IEC_14496-10.pdf, page 211.
-                SrsAnnexbSearch tbbsc = srs_avc_startswith_annexb(bb, bi);
-                if (!tbbsc.match || tbbsc.nb_start_code < 3) {
-                    Log.e(TAG, "annexb not match.");
-                    mHandler.notifyRecordIllegalArgumentException(new IllegalArgumentException(
-                        String.format("annexb not match for %dB, pos=%d", bi.size, bb.position())));
-                }
-
-                // the start codes.
-                ByteBuffer tbbs = bb.slice();
-                for (int i = 0; i < tbbsc.nb_start_code; i++) {
-                    bb.get();
-                }
-
-                // find out the frame size.
-                tbb.data = bb.slice();
-                int pos = bb.position();
-                while (bb.position() < bi.size) {
-                    SrsAnnexbSearch bsc = srs_avc_startswith_annexb(bb, bi);
-                    if (bsc.match) {
-                        break;
-                    }
-                    bb.get();
-                }
-
-                tbb.size = bb.position() - pos;
-                break;
-            }
-
-            return tbb;
-        }
-    }
-
-    private class Sample {
-        private long offset = 0;
-        private long size = 0;
-
-        public Sample(long offset, long size) {
-            this.offset = offset;
-            this.size = size;
-        }
-
-        public long getOffset() {
-            return offset;
-        }
-
-        public long getSize() {
-            return size;
-        }
-    }
-
-    private class Track {
-        private int trackId = 0;
-        private ArrayList<Sample> samples = new ArrayList<>();
-        private long duration = 0;
-        private String handler;
-        private AbstractMediaHeaderBox headerBox = null;
-        private SampleDescriptionBox sampleDescriptionBox = null;
-        private LinkedList<Integer> syncSamples = null;
-        private int timeScale;
-        private Date creationTime = new Date();
-        private int height;
-        private int width;
-        private float volume = 0;
-        private ArrayList<Long> sampleDurations = new ArrayList<>();
-        private boolean isAudio = false;
-        private long lastPresentationTimeUs = 0;
-        private boolean first = true;
-
-        public Track(int id, MediaFormat format, boolean audio) {
-            trackId = id;
-            isAudio = audio;
-            if (!isAudio) {
-                sampleDurations.add((long) 3015);
-                duration = 3015;
-                width = format.getInteger(MediaFormat.KEY_WIDTH);
-                height = format.getInteger(MediaFormat.KEY_HEIGHT);
-                timeScale = 90000;
-                syncSamples = new LinkedList<>();
-                handler = "vide";
-                headerBox = new VideoMediaHeaderBox();
-                sampleDescriptionBox = new SampleDescriptionBox();
-                if (format.getString(MediaFormat.KEY_MIME).contentEquals(SrsEncoder.VCODEC)) {
-                    VisualSampleEntry visualSampleEntry = new VisualSampleEntry("avc1");
-                    visualSampleEntry.setDataReferenceIndex(1);
-                    visualSampleEntry.setDepth(24);
-                    visualSampleEntry.setFrameCount(1);
-                    visualSampleEntry.setHorizresolution(72);
-                    visualSampleEntry.setVertresolution(72);
-                    visualSampleEntry.setWidth(width);
-                    visualSampleEntry.setHeight(height);
-                    visualSampleEntry.setCompressorname("AVC Coding");
-
-                    AvcConfigurationBox avcConfigurationBox = new AvcConfigurationBox();
-                    avcConfigurationBox.setConfigurationVersion(1);
-                    avcConfigurationBox.setAvcProfileIndication((int) h264_sps.get(1));
-                    avcConfigurationBox.setProfileCompatibility(0);
-                    avcConfigurationBox.setAvcLevelIndication((int) h264_sps.get(3));
-                    avcConfigurationBox.setLengthSizeMinusOne(3);
-                    avcConfigurationBox.setSequenceParameterSets(spsList);
-                    avcConfigurationBox.setPictureParameterSets(ppsList);
-                    avcConfigurationBox.setBitDepthLumaMinus8(-1);
-                    avcConfigurationBox.setBitDepthChromaMinus8(-1);
-                    avcConfigurationBox.setChromaFormat(-1);
-                    avcConfigurationBox.setHasExts(false);
-
-                    visualSampleEntry.addBox(avcConfigurationBox);
-                    sampleDescriptionBox.addBox(visualSampleEntry);
-                }
-            } else {
-                sampleDurations.add((long) 1024);
-                duration = 1024;
-                volume = 1;
-                timeScale = format.getInteger(MediaFormat.KEY_SAMPLE_RATE);
-                handler = "soun";
-                headerBox = new SoundMediaHeaderBox();
-                sampleDescriptionBox = new SampleDescriptionBox();
-                AudioSampleEntry audioSampleEntry = new AudioSampleEntry("mp4a");
-                audioSampleEntry.setChannelCount(format.getInteger(MediaFormat.KEY_CHANNEL_COUNT));
-                audioSampleEntry.setSampleRate(format.getInteger(MediaFormat.KEY_SAMPLE_RATE));
-                audioSampleEntry.setDataReferenceIndex(1);
-                audioSampleEntry.setSampleSize(16);
-
-                ESDescriptorBox esds = new ESDescriptorBox();
-                ESDescriptor descriptor = new ESDescriptor();
-                descriptor.setEsId(0);
-
-                SLConfigDescriptor slConfigDescriptor = new SLConfigDescriptor();
-                slConfigDescriptor.setPredefined(2);
-                descriptor.setSlConfigDescriptor(slConfigDescriptor);
-
-                DecoderConfigDescriptor decoderConfigDescriptor = new DecoderConfigDescriptor();
-                decoderConfigDescriptor.setObjectTypeIndication(0x40);
-                decoderConfigDescriptor.setStreamType(5);
-                decoderConfigDescriptor.setBufferSizeDB(1536);
-                decoderConfigDescriptor.setMaxBitRate(96000);
-                decoderConfigDescriptor.setAvgBitRate(96000);
-
-                AudioSpecificConfig audioSpecificConfig = new AudioSpecificConfig();
-                audioSpecificConfig.setAudioObjectType(2);
-                audioSpecificConfig.setSamplingFrequencyIndex(samplingFrequencyIndexMap.get((int) audioSampleEntry.getSampleRate()));
-                audioSpecificConfig.setChannelConfiguration(audioSampleEntry.getChannelCount());
-                decoderConfigDescriptor.setAudioSpecificInfo(audioSpecificConfig);
-
-                descriptor.setDecoderConfigDescriptor(decoderConfigDescriptor);
-
-                ByteBuffer data = descriptor.serialize();
-                esds.setEsDescriptor(descriptor);
-                esds.setData(data);
-                audioSampleEntry.addBox(esds);
-                sampleDescriptionBox.addBox(audioSampleEntry);
-            }
-        }
-
-        public void addSample(long offset, MediaCodec.BufferInfo bi) {
-            long delta = bi.presentationTimeUs - lastPresentationTimeUs;
-            if (delta < 0) {
-                return;
-            }
-            boolean isSyncFrame = !isAudio && (bi.flags & MediaCodec.BUFFER_FLAG_SYNC_FRAME) != 0;
-            samples.add(new Sample(offset, bi.size));
-            if (syncSamples != null && isSyncFrame) {
-                syncSamples.add(samples.size());
-            }
-
-            delta = (delta * timeScale + 500000L) / 1000000L;
-            lastPresentationTimeUs = bi.presentationTimeUs;
-            if (!first) {
-                sampleDurations.add(sampleDurations.size() - 1, delta);
-                duration += delta;
-            }
-            first = false;
-        }
-
-        public void clearSample() {
-            first = true;
-            samples.clear();
-            syncSamples.clear();
-            sampleDurations.clear();
-        }
-
-        public ArrayList<Sample> getSamples() {
-            return samples;
-        }
-
-        public long getDuration() {
-            return duration;
-        }
-
-        public String getHandler() {
-            return handler;
-        }
-
-        public AbstractMediaHeaderBox getMediaHeaderBox() {
-            return headerBox;
-        }
-
-        public SampleDescriptionBox getSampleDescriptionBox() {
-            return sampleDescriptionBox;
-        }
-
-        public long[] getSyncSamples() {
-            if (syncSamples == null || syncSamples.isEmpty()) {
-                return null;
-            }
-            long[] returns = new long[syncSamples.size()];
-            for (int i = 0; i < syncSamples.size(); i++) {
-                returns[i] = syncSamples.get(i);
-            }
-            return returns;
-        }
-
-        public int getTimeScale() {
-            return timeScale;
-        }
-
-        public Date getCreationTime() {
-            return creationTime;
-        }
-
-        public int getWidth() {
-            return width;
-        }
-
-        public int getHeight() {
-            return height;
-        }
-
-        public float getVolume() {
-            return volume;
-        }
-
-        public ArrayList<Long> getSampleDurations() {
-            return sampleDurations;
-        }
-
-        public boolean isAudio() {
-            return isAudio;
-        }
-
-        public int getTrackId() {
-            return trackId;
-        }
-    }
-
-    private class Mp4Movie {
-        private Matrix matrix = Matrix.ROTATE_0;
-        private HashMap<Integer, Track> tracks = new HashMap<>();
-
-        public Matrix getMatrix() {
-            return matrix;
-        }
-
-        public HashMap<Integer, Track> getTracks() {
-            return tracks;
-        }
-
-        public void addSample(int trackIndex, long offset, MediaCodec.BufferInfo bi) {
-            Track track = tracks.get(trackIndex);
-            track.addSample(offset, bi);
-        }
-
-        public void addTrack(MediaFormat format, boolean isAudio) {
-            if (format != null) {
-                if (isAudio) {
-                    tracks.put(AUDIO_TRACK, new Track(tracks.size(), format, true));
-                } else {
-                    tracks.put(VIDEO_TRACK, new Track(tracks.size(), format, false));
-                }
-            }
-        }
-
-        public void removeTrack(int trackIndex) {
-            tracks.remove(trackIndex);
-        }
-    }
-
-    private class InterleaveChunkMdat implements Box {
-        private boolean first = true;
-        private ContainerBox parent;
-        private ByteBuffer header = ByteBuffer.allocateDirect(16);
-        private long contentSize = 1024 * 1024 * 1024;
-
-        public ContainerBox getParent() {
-            return parent;
-        }
-
-        public void setParent(ContainerBox parent) {
-            this.parent = parent;
-        }
-
-        public void setContentSize(long contentSize) {
-            this.contentSize = contentSize;
-        }
-
-        public long getContentSize() {
-            return contentSize;
-        }
-
-        public String getType() {
-            return "mdat";
-        }
-
-        public long getSize() {
-            return header.limit() + contentSize;
-        }
-
-        public int getHeaderSize() {
-            return header.limit();
-        }
-
-        private boolean isSmallBox(long contentSize) {
-            return (contentSize + header.limit()) < 4294967296L;
-        }
-
-        public void getBox(WritableByteChannel writableByteChannel) {
-            header.rewind();
-            long size = getSize();
-            if (isSmallBox(size)) {
-                IsoTypeWriter.writeUInt32(header, size);
-            } else {
-                IsoTypeWriter.writeUInt32(header, 1);
-            }
-            header.put(IsoFile.fourCCtoBytes("mdat"));
-            if (isSmallBox(size)) {
-                header.put(new byte[8]);
-            } else {
-                IsoTypeWriter.writeUInt64(header, size);
-            }
-            header.rewind();
-
-            try {
-                writableByteChannel.write(header);
-            } catch (IOException e) {
-                mHandler.notifyRecordIOException(e);
-            }
-        }
-
-        @Override
-        public void parse(ReadableByteChannel readableByteChannel, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException {
-        }
-    }
-
-    private InterleaveChunkMdat mdat = null;
-    private FileOutputStream fos = null;
-    private FileChannel fc = null;
-    private volatile long recFileSize = 0;
-    private volatile long mdatOffset = 0;
-    private volatile long flushBytes = 0;
-    private HashMap<Track, long[]> track2SampleSizes = new HashMap<>();
-
-    private void createMovie(File outputFile) {
-        try {
-            fos = new FileOutputStream(outputFile);
-            fc = fos.getChannel();
-            mdat = new InterleaveChunkMdat();
-            mdatOffset = 0;
-
-            FileTypeBox fileTypeBox = createFileTypeBox();
-            fileTypeBox.getBox(fc);
-            recFileSize += fileTypeBox.getSize();
-        } catch (IOException e) {
-            e.printStackTrace();
-            mHandler.notifyRecordIOException(e);
-        }
-    }
-
-    private void writeSampleData(ByteBuffer byteBuf, MediaCodec.BufferInfo bi, boolean isAudio) {
-        int trackIndex = isAudio ? AUDIO_TRACK : VIDEO_TRACK;
-        if (!mp4Movie.getTracks().containsKey(trackIndex)) {
-            return;
-        }
-
-        try {
-            if (mdat.first) {
-                mdat.setContentSize(0);
-                mdat.getBox(fc);
-                mdatOffset = recFileSize;
-                recFileSize += mdat.getHeaderSize();
-                mdat.first = false;
-            }
-
-            mp4Movie.addSample(trackIndex, recFileSize, bi);
-            byteBuf.position(bi.offset + (isAudio ? 0 : 4));
-            byteBuf.limit(bi.offset + bi.size);
-            if (!isAudio) {
-                ByteBuffer size = ByteBuffer.allocateDirect(4);
-                size.position(0);
-                size.putInt(bi.size - 4);
-                size.position(0);
-                recFileSize += fc.write(size);
-            }
-            int writeBytes = fc.write(byteBuf);
-
-            recFileSize += writeBytes;
-            flushBytes += writeBytes;
-            if (flushBytes > 64 * 1024) {
-                fos.flush();
-                flushBytes = 0;
-            }
-        } catch (IOException e) {
-            e.printStackTrace();
-            mHandler.notifyRecordIOException(e);
-        }
-    }
-
-    private void finishMovie() {
-        try {
-            if (flushBytes > 0) {
-                fos.flush();
-                flushBytes = 0;
-            }
-            if (mdat.getSize() != 0) {
-                // flush cached mdat box
-                long oldPosition = fc.position();
-                fc.position(mdatOffset);
-                mdat.setContentSize(recFileSize - mdat.getHeaderSize() - mdatOffset);
-                mdat.getBox(fc);
-                fc.position(oldPosition);
-                mdat.setContentSize(0);
-                fos.flush();
-            }
-
-            for (Track track : mp4Movie.getTracks().values()) {
-                List<Sample> samples = track.getSamples();
-                long[] sizes = new long[samples.size()];
-                for (int i = 0; i < sizes.length; i++) {
-                    sizes[i] = samples.get(i).getSize();
-                }
-                track2SampleSizes.put(track, sizes);
-            }
-
-            Box moov = createMovieBox(mp4Movie);
-            moov.getBox(fc);
-            fos.flush();
-
-            fc.close();
-            fos.close();
-            mp4Movie.getTracks().clear();
-            track2SampleSizes.clear();
-            recFileSize = 0;
-            flushBytes = 0;
-        } catch (IOException e) {
-            mHandler.notifyRecordIOException(e);
-        }
-    }
-
-    private FileTypeBox createFileTypeBox() {
-        LinkedList<String> minorBrands = new LinkedList<>();
-        minorBrands.add("isom");
-        minorBrands.add("3gp4");
-        return new FileTypeBox("isom", 0, minorBrands);
-    }
-
-    private long getTimescale(Mp4Movie mp4Movie) {
-        long timescale = 0;
-        if (!mp4Movie.getTracks().isEmpty()) {
-            timescale = mp4Movie.getTracks().values().iterator().next().getTimeScale();
-        }
-        for (Track track : mp4Movie.getTracks().values()) {
-            timescale = Math.gcd(track.getTimeScale(), timescale);
-        }
-        return timescale;
-    }
-
-    private MovieBox createMovieBox(Mp4Movie movie) {
-        MovieBox movieBox = new MovieBox();
-        MovieHeaderBox mvhd = new MovieHeaderBox();
-
-        mvhd.setCreationTime(new Date());
-        mvhd.setModificationTime(new Date());
-        mvhd.setMatrix(Matrix.ROTATE_0);
-        long movieTimeScale = getTimescale(movie);
-        long duration = 0;
-
-        for (Track track : movie.getTracks().values()) {
-            long tracksDuration = track.getDuration() * movieTimeScale / track.getTimeScale();
-            if (tracksDuration > duration) {
-                duration = tracksDuration;
-            }
-        }
-
-        mvhd.setDuration(duration);
-        mvhd.setTimescale(movieTimeScale);
-        mvhd.setNextTrackId(movie.getTracks().size() + 1);
-
-        movieBox.addBox(mvhd);
-        for (Track track : movie.getTracks().values()) {
-            movieBox.addBox(createTrackBox(track, movie));
-        }
-        return movieBox;
-    }
-
-    private TrackBox createTrackBox(Track track, Mp4Movie movie) {
-        TrackBox trackBox = new TrackBox();
-        TrackHeaderBox tkhd = new TrackHeaderBox();
-
-        tkhd.setEnabled(true);
-        tkhd.setInMovie(true);
-        tkhd.setInPreview(true);
-        if (track.isAudio()) {
-            tkhd.setMatrix(Matrix.ROTATE_0);
-        } else {
-            tkhd.setMatrix(movie.getMatrix());
-        }
-        tkhd.setAlternateGroup(0);
-        tkhd.setCreationTime(track.getCreationTime());
-        tkhd.setModificationTime(track.getCreationTime());
-        tkhd.setDuration(track.getDuration() * getTimescale(movie) / track.getTimeScale());
-        tkhd.setHeight(track.getHeight());
-        tkhd.setWidth(track.getWidth());
-        tkhd.setLayer(0);
-        tkhd.setModificationTime(new Date());
-        tkhd.setTrackId(track.getTrackId() + 1);
-        tkhd.setVolume(track.getVolume());
-
-        trackBox.addBox(tkhd);
-
-        MediaBox mdia = new MediaBox();
-        trackBox.addBox(mdia);
-        MediaHeaderBox mdhd = new MediaHeaderBox();
-        mdhd.setCreationTime(track.getCreationTime());
-        mdhd.setModificationTime(track.getCreationTime());
-        mdhd.setDuration(track.getDuration());
-        mdhd.setTimescale(track.getTimeScale());
-        mdhd.setLanguage("eng");
-        mdia.addBox(mdhd);
-        HandlerBox hdlr = new HandlerBox();
-        hdlr.setName(track.isAudio() ? "SoundHandle" : "VideoHandle");
-        hdlr.setHandlerType(track.getHandler());
-
-        mdia.addBox(hdlr);
-
-        MediaInformationBox minf = new MediaInformationBox();
-        minf.addBox(track.getMediaHeaderBox());
-
-        DataInformationBox dinf = new DataInformationBox();
-        DataReferenceBox dref = new DataReferenceBox();
-        dinf.addBox(dref);
-        DataEntryUrlBox url = new DataEntryUrlBox();
-        url.setFlags(1);
-        dref.addBox(url);
-        minf.addBox(dinf);
-
-        Box stbl = createStbl(track);
-        minf.addBox(stbl);
-        mdia.addBox(minf);
-
-        return trackBox;
-    }
-
-    private Box createStbl(Track track) {
-        SampleTableBox stbl = new SampleTableBox();
-        createStsd(track, stbl);
-        createStts(track, stbl);
-        createStss(track, stbl);
-        createStsc(track, stbl);
-        createStsz(track, stbl);
-        createStco(track, stbl);
-        return stbl;
-    }
-
-    private void createStsd(Track track, SampleTableBox stbl) {
-        stbl.addBox(track.getSampleDescriptionBox());
-    }
-
-    private void createStts(Track track, SampleTableBox stbl) {
-        TimeToSampleBox.Entry lastEntry = null;
-        List<TimeToSampleBox.Entry> entries = new ArrayList<>();
-
-        for (long delta : track.getSampleDurations()) {
-            if (lastEntry != null && lastEntry.getDelta() == delta) {
-                lastEntry.setCount(lastEntry.getCount() + 1);
-            } else {
-                lastEntry = new TimeToSampleBox.Entry(1, delta);
-                entries.add(lastEntry);
-            }
-        }
-        TimeToSampleBox stts = new TimeToSampleBox();
-        stts.setEntries(entries);
-        stbl.addBox(stts);
-    }
-
-    private void createStss(Track track, SampleTableBox stbl) {
-        long[] syncSamples = track.getSyncSamples();
-        if (syncSamples != null && syncSamples.length > 0) {
-            SyncSampleBox stss = new SyncSampleBox();
-            stss.setSampleNumber(syncSamples);
-            stbl.addBox(stss);
-        }
-    }
-
-    private void createStsc(Track track, SampleTableBox stbl) {
-        SampleToChunkBox stsc = new SampleToChunkBox();
-        stsc.setEntries(new LinkedList<SampleToChunkBox.Entry>());
-
-        long lastOffset;
-        int lastChunkNumber = 1;
-        int lastSampleCount = 0;
-
-        int previousWritedChunkCount = -1;
-
-        int samplesCount = track.getSamples().size();
-        for (int a = 0; a < samplesCount; a++) {
-            Sample sample = track.getSamples().get(a);
-            long offset = sample.getOffset();
-            long size = sample.getSize();
-
-            lastOffset = offset + size;
-            lastSampleCount++;
-
-            boolean write = false;
-            if (a != samplesCount - 1) {
-                Sample nextSample = track.getSamples().get(a + 1);
-                if (lastOffset != nextSample.getOffset()) {
-                    write = true;
-                }
-            } else {
-                write = true;
-            }
-            if (write) {
-                if (previousWritedChunkCount != lastSampleCount) {
-                    stsc.getEntries().add(new SampleToChunkBox.Entry(lastChunkNumber, lastSampleCount, 1));
-                    previousWritedChunkCount = lastSampleCount;
-                }
-                lastSampleCount = 0;
-                lastChunkNumber++;
-            }
-        }
-        stbl.addBox(stsc);
-    }
-
-    private void createStsz(Track track, SampleTableBox stbl) {
-        SampleSizeBox stsz = new SampleSizeBox();
-        stsz.setSampleSizes(track2SampleSizes.get(track));
-        stbl.addBox(stsz);
-    }
-
-    private void createStco(Track track, SampleTableBox stbl) {
-        ArrayList<Long> chunksOffsets = new ArrayList<>();
-        long lastOffset = -1;
-        for (Sample sample : track.getSamples()) {
-            long offset = sample.getOffset();
-            if (lastOffset != -1 && lastOffset != offset) {
-                lastOffset = -1;
-            }
-            if (lastOffset == -1) {
-                chunksOffsets.add(offset);
-            }
-            lastOffset = offset + sample.getSize();
-        }
-        long[] chunkOffsetsLong = new long[chunksOffsets.size()];
-        for (int a = 0; a < chunksOffsets.size(); a++) {
-            chunkOffsetsLong[a] = chunksOffsets.get(a);
-        }
-
-        StaticChunkOffsetBox stco = new StaticChunkOffsetBox();
-        stco.setChunkOffsets(chunkOffsetsLong);
-        stbl.addBox(stco);
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsPublisher.java b/android/src/main/java/net/ossrs/yasea/SrsPublisher.java
deleted file mode 100755
index 5ea9302..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsPublisher.java
+++ /dev/null
@@ -1,298 +0,0 @@
-package net.ossrs.yasea;
-
-import android.media.AudioRecord;
-import android.media.audiofx.AcousticEchoCanceler;
-import android.media.audiofx.AutomaticGainControl;
-
-import com.github.faucamp.simplertmp.RtmpHandler;
-import com.seu.magicfilter.utils.MagicFilterType;
-
-import java.io.File;
-
-/**
- * Created by Leo Ma on 2016/7/25.
- */
-public class SrsPublisher {
-
-    private static AudioRecord mic;
-    private static AcousticEchoCanceler aec;
-    private static AutomaticGainControl agc;
-    private byte[] mPcmBuffer = new byte[4096];
-    private byte[] mMuteBuffer = new byte[11];
-    private Thread aworker;
-
-    private SrsCameraView mCameraView;
-
-    private boolean sendVideoOnly = false;
-    private boolean sendAudioOnly = false;
-    private int videoFrameCount;
-    private long lastTimeMillis;
-    private double mSamplingFps;
-
-    private SrsFlvMuxer mFlvMuxer;
-    private SrsMp4Muxer mMp4Muxer;
-    private SrsEncoder mEncoder;
-
-    public SrsPublisher(SrsCameraView view) {
-        mCameraView = view;
-        mCameraView.setPreviewCallback(new SrsCameraView.PreviewCallback() {
-            @Override
-            public void onGetRgbaFrame(byte[] data, int width, int height) {
-                calcSamplingFps();
-                if (!sendAudioOnly) {
-                    mEncoder.onGetRgbaFrame(data, width, height);
-                }
-            }
-        });
-    }
-
-    private void calcSamplingFps() {
-        // Calculate sampling FPS
-        if (videoFrameCount == 0) {
-            lastTimeMillis = System.nanoTime() / 1000000;
-            videoFrameCount++;
-        } else {
-            if (++videoFrameCount >= SrsEncoder.VGOP) {
-                long diffTimeMillis = System.nanoTime() / 1000000 - lastTimeMillis;
-                mSamplingFps = (double) videoFrameCount * 1000 / diffTimeMillis;
-                videoFrameCount = 0;
-            }
-        }
-    }
-
-    public void startCamera() {
-        mCameraView.startCamera();
-    }
-
-    public void stopCamera() {
-        mCameraView.stopCamera();
-    }
-
-    public void startAudio() {
-        mic = mEncoder.chooseAudioRecord();
-        if (mic == null) {
-            return;
-        }
-
-        if (AcousticEchoCanceler.isAvailable()) {
-            aec = AcousticEchoCanceler.create(mic.getAudioSessionId());
-            if (aec != null) {
-                aec.setEnabled(true);
-            }
-        }
-
-        if (AutomaticGainControl.isAvailable()) {
-            agc = AutomaticGainControl.create(mic.getAudioSessionId());
-            if (agc != null) {
-                agc.setEnabled(true);
-            }
-        }
-
-        aworker = new Thread(new Runnable() {
-            @Override
-            public void run() {
-                android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_AUDIO);
-                mic.startRecording();
-                while (!Thread.interrupted()) {
-                    if (sendVideoOnly) {
-                        mEncoder.onGetPcmFrame(mMuteBuffer, mMuteBuffer.length);
-                    } else {
-                        int size = mic.read(mPcmBuffer, 0, mPcmBuffer.length);
-                        if (size > 0) {
-                            mEncoder.onGetPcmFrame(mPcmBuffer, size);
-                        }
-                    }
-                }
-            }
-        });
-        aworker.start();
-    }
-
-    public void stopAudio() {
-        if (aworker != null) {
-            aworker.interrupt();
-            try {
-                aworker.join();
-            } catch (InterruptedException e) {
-                aworker.interrupt();
-            }
-            aworker = null;
-        }
-
-        if (mic != null) {
-            mic.setRecordPositionUpdateListener(null);
-            mic.stop();
-            mic.release();
-            mic = null;
-        }
-
-        if (aec != null) {
-            aec.setEnabled(false);
-            aec.release();
-            aec = null;
-        }
-
-        if (agc != null) {
-            agc.setEnabled(false);
-            agc.release();
-            agc = null;
-        }
-    }
-
-    public void startEncode() {
-        if (!mEncoder.start()) {
-            return;
-        }
-
-        mCameraView.enableEncoding();
-
-        startAudio();
-    }
-
-    public void stopEncode() {
-        stopAudio();
-        stopCamera();
-        mEncoder.stop();
-    }
-
-    public void startPublish(String rtmpUrl) {
-        if (mFlvMuxer != null) {
-            mFlvMuxer.start(rtmpUrl);
-            mFlvMuxer.setVideoResolution(mEncoder.getOutputWidth(), mEncoder.getOutputHeight());
-            startEncode();
-        }
-    }
-
-    public void stopPublish() {
-        if (mFlvMuxer != null) {
-            stopEncode();
-            mFlvMuxer.stop();
-        }
-    }
-
-    public boolean startRecord(String recPath) {
-        return mMp4Muxer != null && mMp4Muxer.record(new File(recPath));
-    }
-
-    public void stopRecord() {
-        if (mMp4Muxer != null) {
-            mMp4Muxer.stop();
-        }
-    }
-
-    public void pauseRecord() {
-        if (mMp4Muxer != null) {
-            mMp4Muxer.pause();
-        }
-    }
-
-    public void resumeRecord() {
-        if (mMp4Muxer != null) {
-            mMp4Muxer.resume();
-        }
-    }
-
-    public void switchToSoftEncoder() {
-        mEncoder.switchToSoftEncoder();
-    }
-
-    public void switchToHardEncoder() {
-        mEncoder.switchToHardEncoder();
-    }
-
-    public boolean isSoftEncoder() {
-        return mEncoder.isSoftEncoder();
-    }
-
-    public int getPreviewWidth() {
-        return mEncoder.getPreviewWidth();
-    }
-
-    public int getPreviewHeight() {
-        return mEncoder.getPreviewHeight();
-    }
-
-    public double getmSamplingFps() {
-        return mSamplingFps;
-    }
-
-    public int getCamraId() {
-        return mCameraView.getCameraId();
-    }
-
-    public void setPreviewResolution(int width, int height) {
-        int resolution[] = mCameraView.setPreviewResolution(width, height);
-        mEncoder.setPreviewResolution(resolution[0], resolution[1]);
-    }
-
-    public void setOutputResolution(int width, int height) {
-        if (width <= height) {
-            mEncoder.setPortraitResolution(width, height);
-        } else {
-            mEncoder.setLandscapeResolution(width, height);
-        }
-    }
-
-    public void setScreenOrientation(int orientation) {
-        mCameraView.setPreviewOrientation(orientation);
-        mEncoder.setScreenOrientation(orientation);
-    }
-
-    public void setVideoHDMode() {
-        mEncoder.setVideoHDMode();
-    }
-
-    public void setVideoSmoothMode() {
-        mEncoder.setVideoSmoothMode();
-    }
-
-    public void setSendVideoOnly(boolean flag) {
-        sendVideoOnly = flag;
-    }
-
-    public void setSendAudioOnly(boolean flag) {
-        sendAudioOnly = flag;
-    }
-
-    public boolean switchCameraFilter(MagicFilterType type) {
-        return mCameraView.setFilter(type);
-    }
-
-    public void switchCameraFace(int id) {
-        mCameraView.stopCamera();
-        mCameraView.setCameraId(id);
-        if (id == 0) {
-            mEncoder.setCameraBackFace();
-        } else {
-            mEncoder.setCameraFrontFace();
-        }
-        if (mEncoder != null && mEncoder.isEnabled()) {
-            mCameraView.enableEncoding();
-        }
-        mCameraView.startCamera();
-    }
-
-    public void setRtmpHandler(RtmpHandler handler) {
-        mFlvMuxer = new SrsFlvMuxer(handler);
-        if (mEncoder != null) {
-            mEncoder.setFlvMuxer(mFlvMuxer);
-        }
-    }
-
-    public void setRecordHandler(SrsRecordHandler handler) {
-        mMp4Muxer = new SrsMp4Muxer(handler);
-        if (mEncoder != null) {
-            mEncoder.setMp4Muxer(mMp4Muxer);
-        }
-    }
-
-    public void setEncodeHandler(SrsEncodeHandler handler) {
-        mEncoder = new SrsEncoder(handler);
-        if (mFlvMuxer != null) {
-            mEncoder.setFlvMuxer(mFlvMuxer);
-        }
-        if (mMp4Muxer != null) {
-            mEncoder.setMp4Muxer(mMp4Muxer);
-        }
-    }
-}
diff --git a/android/src/main/java/net/ossrs/yasea/SrsRecordHandler.java b/android/src/main/java/net/ossrs/yasea/SrsRecordHandler.java
deleted file mode 100755
index 4be009d..0000000
--- a/android/src/main/java/net/ossrs/yasea/SrsRecordHandler.java
+++ /dev/null
@@ -1,98 +0,0 @@
-package net.ossrs.yasea;
-
-import android.os.Handler;
-import android.os.Message;
-
-import java.io.IOException;
-import java.lang.ref.WeakReference;
-
-/**
- * Created by leo.ma on 2016/11/4.
- */
-
-public class SrsRecordHandler extends Handler {
-
-    private static final int MSG_RECORD_PAUSE = 0;
-    private static final int MSG_RECORD_RESUME = 1;
-    private static final int MSG_RECORD_STARTED = 2;
-    private static final int MSG_RECORD_FINISHED = 3;
-
-    private static final int MSG_RECORD_ILLEGEL_ARGUMENT_EXCEPTION = 4;
-    private static final int MSG_RECORD_IO_EXCEPTION = 5;
-
-    private WeakReference<SrsRecordListener> mWeakListener;
-
-    public SrsRecordHandler(SrsRecordListener listener) {
-        mWeakListener = new WeakReference<>(listener);
-    }
-
-    public void notifyRecordPause() {
-        sendEmptyMessage(MSG_RECORD_PAUSE);
-    }
-
-    public void notifyRecordResume() {
-        sendEmptyMessage(MSG_RECORD_RESUME);
-    }
-
-    public void notifyRecordStarted(String msg) {
-        obtainMessage(MSG_RECORD_STARTED, msg).sendToTarget();
-    }
-
-    public void notifyRecordFinished(String msg) {
-        obtainMessage(MSG_RECORD_FINISHED, msg).sendToTarget();
-    }
-
-    public void notifyRecordIllegalArgumentException(IllegalArgumentException e) {
-        obtainMessage(MSG_RECORD_ILLEGEL_ARGUMENT_EXCEPTION, e).sendToTarget();
-    }
-
-    public void notifyRecordIOException(IOException e) {
-        obtainMessage(MSG_RECORD_IO_EXCEPTION, e).sendToTarget();
-    }
-
-    @Override  // runs on UI thread
-    public void handleMessage(Message msg) {
-        SrsRecordListener listener = mWeakListener.get();
-        if (listener == null) {
-            return;
-        }
-
-        switch (msg.what) {
-            case MSG_RECORD_PAUSE:
-                listener.onRecordPause();
-                break;
-            case MSG_RECORD_RESUME:
-                listener.onRecordResume();
-                break;
-            case MSG_RECORD_STARTED:
-                listener.onRecordStarted((String) msg.obj);
-                break;
-            case MSG_RECORD_FINISHED:
-                listener.onRecordFinished((String) msg.obj);
-                break;
-            case MSG_RECORD_ILLEGEL_ARGUMENT_EXCEPTION:
-                listener.onRecordIllegalArgumentException((IllegalArgumentException) msg.obj);
-                break;
-            case MSG_RECORD_IO_EXCEPTION:
-                listener.onRecordIOException((IOException) msg.obj);
-                break;
-            default:
-                throw new RuntimeException("unknown msg " + msg.what);
-        }
-    }
-    
-    public interface SrsRecordListener {
-
-        void onRecordPause();
-
-        void onRecordResume();
-
-        void onRecordStarted(String msg);
-
-        void onRecordFinished(String msg);
-
-        void onRecordIllegalArgumentException(IllegalArgumentException e);
-
-        void onRecordIOException(IOException e);
-    }
-}
diff --git a/android/src/main/jniLibs/armeabi-v7a/libenc.so b/android/src/main/jniLibs/armeabi-v7a/libenc.so
deleted file mode 100755
index dc5d3b2..0000000
Binary files a/android/src/main/jniLibs/armeabi-v7a/libenc.so and /dev/null differ
diff --git a/android/src/main/jniLibs/armeabi-v7a/libyuv.so b/android/src/main/jniLibs/armeabi-v7a/libyuv.so
deleted file mode 100755
index e217657..0000000
Binary files a/android/src/main/jniLibs/armeabi-v7a/libyuv.so and /dev/null differ
diff --git a/android/src/main/jniLibs/x86/libenc.so b/android/src/main/jniLibs/x86/libenc.so
deleted file mode 100755
index b3edb19..0000000
Binary files a/android/src/main/jniLibs/x86/libenc.so and /dev/null differ
diff --git a/android/src/main/jniLibs/x86/libyuv.so b/android/src/main/jniLibs/x86/libyuv.so
deleted file mode 100755
index 36f1467..0000000
Binary files a/android/src/main/jniLibs/x86/libyuv.so and /dev/null differ
diff --git a/android/src/main/libenc/jni/Android.mk b/android/src/main/libenc/jni/Android.mk
deleted file mode 100755
index 942dbc7..0000000
--- a/android/src/main/libenc/jni/Android.mk
+++ /dev/null
@@ -1,43 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-
-############# prebuilt ###############
-
-include $(CLEAR_VARS)
-LOCAL_MODULE := libyuv
-
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-    LOCAL_SRC_FILES := libs/armeabi-v7a/libyuv.so
-endif
-
-ifeq ($(TARGET_ARCH_ABI),x86)
-    LOCAL_SRC_FILES := libs/x86/libyuv.so
-endif
-
-include $(PREBUILT_SHARED_LIBRARY)
-
-include $(CLEAR_VARS)
-LOCAL_MODULE := libx264
-
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-    LOCAL_SRC_FILES := libs/armeabi-v7a/libx264.a
-endif
-
-ifeq ($(TARGET_ARCH_ABI),x86)
-    LOCAL_SRC_FILES := libs/x86/libx264.a
-endif
-
-include $(PREBUILT_STATIC_LIBRARY)
-
-############# build libenc ###########
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libenc
-LOCAL_SRC_FILES := libenc.cc
-LOCAL_CFLAGS    :=
-LOCAL_LDLIBS    := -llog
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/libyuv/jni/include $(LOCAL_PATH)/libx264
-LOCAL_STATIC_LIBRARIES := libx264
-LOCAL_SHARED_LIBRARIES := libyuv
-LOCAL_DISABLE_FORMAT_STRING_CHECKS := true
-LOCAL_DISABLE_FATAL_LINKER_WARNINGS := true
-include $(BUILD_SHARED_LIBRARY)
diff --git a/android/src/main/libenc/jni/Application.mk b/android/src/main/libenc/jni/Application.mk
deleted file mode 100755
index baad0ac..0000000
--- a/android/src/main/libenc/jni/Application.mk
+++ /dev/null
@@ -1,2 +0,0 @@
-APP_ABI := armeabi-v7a x86
-APP_PLATFORM := android-19
diff --git a/android/src/main/libenc/jni/libenc.cc b/android/src/main/libenc/jni/libenc.cc
deleted file mode 100755
index ba1a388..0000000
--- a/android/src/main/libenc/jni/libenc.cc
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <jni.h>
-#include <libyuv.h>
-#include <x264.h>
-
-#include <android/log.h>
-#define LIBENC_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, "libenc", __VA_ARGS__))
-#define LIBENC_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO , "libenc", __VA_ARGS__))
-#define LIBENC_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN , "libenc", __VA_ARGS__))
-#define LIBENC_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, "libenc", __VA_ARGS__))
-
-#define LIBENC_ARRAY_ELEMS(a)  (sizeof(a) / sizeof(a[0]))
-
-using namespace libyuv;
-
-struct YuvFrame {
-    int width;
-    int height;
-    uint8_t *data;
-    uint8_t *y;
-    uint8_t *u;
-    uint8_t *v;
-};
-
-typedef struct x264_context {
-    // encode parameter
-    x264_param_t params;
-    x264_t *encoder;
-    x264_picture_t picture;
-    bool global_nal_header;
-    // input
-    int width;
-    int height;
-    int bitrate;
-    int fps;
-    int gop;
-    char preset[16];
-    // output
-    int64_t pts;
-    int dts;
-    bool is_key_frame;
-} x264_context;
-
-static JavaVM* jvm;
-static JNIEnv *jenv;
-
-static struct x264_context x264_ctx;
-static uint8_t h264_es[1024 * 1024];
-
-static const int SRC_COLOR_FMT = FOURCC_RGBA;
-static const int DST_COLOR_FMT = FOURCC_NV12;
-
-static struct YuvFrame i420_rotated_frame;
-static struct YuvFrame i420_scaled_frame;
-static struct YuvFrame nv12_frame;
-
-static bool convert_to_i420(jbyte *src_frame, jint src_width, jint src_height,
-                            jboolean need_flip, jint rotate_degree, int format) {
-    int y_size = src_width * src_height;
-
-    if (rotate_degree % 180 == 0) {
-        if (i420_rotated_frame.width != src_width || i420_rotated_frame.height != src_height) {
-            free(i420_rotated_frame.data);
-            i420_rotated_frame.width = src_width;
-            i420_rotated_frame.height = src_height;
-            i420_rotated_frame.data = (uint8_t *) malloc(y_size * 3 / 2);
-            i420_rotated_frame.y = i420_rotated_frame.data;
-            i420_rotated_frame.u = i420_rotated_frame.y + y_size;
-            i420_rotated_frame.v = i420_rotated_frame.u + y_size / 4;
-        }
-    } else {
-        if (i420_rotated_frame.width != src_height || i420_rotated_frame.height != src_width) {
-            free(i420_rotated_frame.data);
-            i420_rotated_frame.width = src_height;
-            i420_rotated_frame.height = src_width;
-            i420_rotated_frame.data = (uint8_t *) malloc(y_size * 3 / 2);
-            i420_rotated_frame.y = i420_rotated_frame.data;
-            i420_rotated_frame.u = i420_rotated_frame.y + y_size;
-            i420_rotated_frame.v = i420_rotated_frame.u + y_size / 4;
-        }
-    }
-
-    jint ret = ConvertToI420((uint8_t *) src_frame, y_size,
-                             i420_rotated_frame.y, i420_rotated_frame.width,
-                             i420_rotated_frame.u, i420_rotated_frame.width / 2,
-                             i420_rotated_frame.v, i420_rotated_frame.width / 2,
-                             0, 0,
-                             src_width, src_height,
-                             src_width, src_height,
-                             (RotationMode) rotate_degree, format);
-    if (ret < 0) {
-        LIBENC_LOGE("ConvertToI420 failure");
-        return false;
-    }
-
-    ret = I420Scale(i420_rotated_frame.y, i420_rotated_frame.width,
-                    i420_rotated_frame.u, i420_rotated_frame.width / 2,
-                    i420_rotated_frame.v, i420_rotated_frame.width / 2,
-                    need_flip ? -i420_rotated_frame.width : i420_rotated_frame.width, i420_rotated_frame.height,
-                    i420_scaled_frame.y, i420_scaled_frame.width,
-                    i420_scaled_frame.u, i420_scaled_frame.width / 2,
-                    i420_scaled_frame.v, i420_scaled_frame.width / 2,
-                    i420_scaled_frame.width, i420_scaled_frame.height,
-                    kFilterNone);
-    if (ret < 0) {
-         LIBENC_LOGE("I420Scale failure");
-         return false;
-    }
-
-    return true;
-}
-
-static void libenc_setEncoderBitrate(JNIEnv* env, jobject thiz, jint bitrate) {
-    x264_ctx.bitrate = bitrate / 1000;  // kbps
-}
-
-static void libenc_setEncoderFps(JNIEnv* env, jobject thiz, jint fps) {
-    x264_ctx.fps = fps;
-}
-
-static void libenc_setEncoderGop(JNIEnv* env, jobject thiz, jint gop_size) {
-    x264_ctx.gop = gop_size;
-}
-
-static void libenc_setEncoderPreset(JNIEnv* env, jobject thiz, jstring preset) {
-    const char *enc_preset = env->GetStringUTFChars(preset, NULL);
-    strcpy(x264_ctx.preset, enc_preset);
-    env->ReleaseStringUTFChars(preset, enc_preset);
-}
-
-static void libenc_setEncoderResolution(JNIEnv* env, jobject thiz, jint out_width, jint out_height) {
-    int y_size = out_width * out_height;
-
-    if (i420_scaled_frame.width != out_width || i420_scaled_frame.height != out_height) {
-        free(i420_scaled_frame.data);
-        i420_scaled_frame.width = out_width;
-        i420_scaled_frame.height = out_height;
-        i420_scaled_frame.data = (uint8_t *) malloc(y_size * 3 / 2);
-        i420_scaled_frame.y = i420_scaled_frame.data;
-        i420_scaled_frame.u = i420_scaled_frame.y + y_size;
-        i420_scaled_frame.v = i420_scaled_frame.u + y_size / 4;
-    }
-
-    if (nv12_frame.width != out_width || nv12_frame.height != out_height) {
-        free(nv12_frame.data);
-        nv12_frame.width = out_width;
-        nv12_frame.height = out_height;
-        nv12_frame.data = (uint8_t *) malloc(y_size * 3 / 2);
-        nv12_frame.y = nv12_frame.data;
-        nv12_frame.u = nv12_frame.y + y_size;
-        nv12_frame.v = nv12_frame.u + y_size / 4;
-    }
-
-    x264_ctx.width = out_width;
-    x264_ctx.height = out_height;
-}
-
-// For COLOR_FormatYUV420Planar
-static jbyteArray libenc_RGBAToI420(JNIEnv* env, jobject thiz, jbyteArray frame, jint src_width,
-                                    jint src_height, jboolean need_flip, jint rotate_degree) {
-    jbyte* rgba_frame = env->GetByteArrayElements(frame, NULL);
-
-    if (!convert_to_i420(rgba_frame, src_width, src_height, need_flip, rotate_degree, SRC_COLOR_FMT)) {
-        return NULL;
-    }
-
-    int y_size = i420_scaled_frame.width * i420_scaled_frame.height;
-    jbyteArray i420Frame = env->NewByteArray(y_size * 3 / 2);
-    env->SetByteArrayRegion(i420Frame, 0, y_size * 3 / 2, (jbyte *) i420_scaled_frame.data);
-
-    env->ReleaseByteArrayElements(frame, rgba_frame, JNI_ABORT);
-    return i420Frame;
-}
-
-// For COLOR_FormatYUV420SemiPlanar
-static jbyteArray libenc_RGBAToNV12(JNIEnv* env, jobject thiz, jbyteArray frame, jint src_width,
-                                    jint src_height, jboolean need_flip, jint rotate_degree) {
-    jbyte* rgba_frame = env->GetByteArrayElements(frame, NULL);
-
-    if (!convert_to_i420(rgba_frame, src_width, src_height, need_flip, rotate_degree, SRC_COLOR_FMT)) {
-        return NULL;
-    }
-
-    int ret = ConvertFromI420(i420_scaled_frame.y, i420_scaled_frame.width,
-                              i420_scaled_frame.u, i420_scaled_frame.width / 2,
-                              i420_scaled_frame.v, i420_scaled_frame.width / 2,
-                              nv12_frame.data, nv12_frame.width,
-                              nv12_frame.width, nv12_frame.height,
-                              DST_COLOR_FMT);
-    if (ret < 0) {
-        LIBENC_LOGE("ConvertFromI420 failure");
-        return NULL;
-    }
-
-    int y_size = nv12_frame.width * nv12_frame.height;
-    jbyteArray nv12Frame = env->NewByteArray(y_size * 3 / 2);
-    env->SetByteArrayRegion(nv12Frame, 0, y_size * 3 / 2, (jbyte *) nv12_frame.data);
-
-    env->ReleaseByteArrayElements(frame, rgba_frame, JNI_ABORT);
-    return nv12Frame;
-}
-
-static int encode_nals(const x264_nal_t *nals, int nnal) {
-    int i;
-    uint8_t *p = h264_es;
-
-    for (i = 0; i < nnal; i++) {
-        memcpy(p, nals[i].p_payload, nals[i].i_payload);
-        p += nals[i].i_payload;
-    }
-
-    return p - h264_es;
-}
-
-static int encode_global_nal_header() {
-    int nnal;
-    x264_nal_t *nals;
-
-    x264_ctx.global_nal_header = false;
-    x264_encoder_headers(x264_ctx.encoder, &nals, &nnal);
-    return encode_nals(nals, nnal);
-}
-
-static int x264_encode(struct YuvFrame *i420_frame, int64_t pts) {
-    int out_len, nnal;
-    x264_nal_t *nal;
-    x264_picture_t pic_out;
-    int y_size = i420_frame->width * i420_frame->height;
-
-    x264_ctx.picture.img.i_csp = X264_CSP_I420;
-    x264_ctx.picture.img.i_plane = 3;
-    x264_ctx.picture.img.plane[0] = i420_frame->y;
-    x264_ctx.picture.img.i_stride[0] = i420_frame->width;
-    x264_ctx.picture.img.plane[1] = i420_frame->u;
-    x264_ctx.picture.img.i_stride[1] = i420_frame->width / 2;
-    x264_ctx.picture.img.plane[2] = i420_frame->v;
-    x264_ctx.picture.img.i_stride[2] = i420_frame->width / 2;
-    x264_ctx.picture.i_pts = pts;
-    x264_ctx.picture.i_type = X264_TYPE_AUTO;
-
-    if (x264_encoder_encode(x264_ctx.encoder, &nal, &nnal, &x264_ctx.picture, &pic_out) < 0) {
-        LIBENC_LOGE("Fail to encode in x264");
-        return -1;
-    }
-
-    x264_ctx.pts = pic_out.i_pts;
-    x264_ctx.dts = pic_out.i_dts;
-    x264_ctx.is_key_frame = pic_out.i_type == X264_TYPE_IDR;
-
-    return encode_nals(nal, nnal);
-}
-
-static jint libenc_RGBASoftEncode(JNIEnv* env, jobject thiz, jbyteArray frame, jint src_width,
-                                  jint src_height, jboolean need_flip, jint rotate_degree, jlong pts) {
-    jbyte* rgba_frame = env->GetByteArrayElements(frame, NULL);
-
-    if (!convert_to_i420(rgba_frame, src_width, src_height, need_flip, rotate_degree, SRC_COLOR_FMT)) {
-        return JNI_ERR;
-    }
-
-    int es_len = x264_ctx.global_nal_header ? encode_global_nal_header() : x264_encode(&i420_scaled_frame, pts);
-    if (es_len <= 0) {
-        LIBENC_LOGE("Fail to encode nalu");
-        return JNI_ERR;
-    }
-
-    jbyteArray outputFrame = env->NewByteArray(es_len);
-    env->SetByteArrayRegion(outputFrame, 0, es_len, (jbyte *) h264_es);
-
-    jclass clz = env->GetObjectClass(thiz);
-    jmethodID mid = env->GetMethodID(clz, "onSoftEncodedData", "([BJZ)V");
-    env->CallVoidMethod(thiz, mid, outputFrame, x264_ctx.pts, x264_ctx.is_key_frame);
-
-    env->ReleaseByteArrayElements(frame, rgba_frame, JNI_ABORT);
-    return JNI_OK;
-}
-
-static void libenc_closeSoftEncoder(JNIEnv* env, jobject thiz) {
-    int nnal;
-    x264_nal_t *nal;
-    x264_picture_t pic_out;
-
-    if (x264_ctx.encoder != NULL) {
-        while(x264_encoder_delayed_frames(x264_ctx.encoder)) {
-            x264_encoder_encode(x264_ctx.encoder, &nal, &nnal, NULL, &pic_out);
-        }
-        x264_encoder_close(x264_ctx.encoder);
-        x264_ctx.encoder = NULL;
-    }
-}
-
-static jboolean libenc_openSoftEncoder(JNIEnv* env, jobject thiz) {
-    // presetting
-    x264_param_default_preset(&x264_ctx.params, x264_ctx.preset, "zerolatency");
-
-    x264_ctx.params.b_repeat_headers = 0;
-    x264_ctx.global_nal_header = true;
-
-    // resolution
-    x264_ctx.params.i_width = x264_ctx.width;
-    x264_ctx.params.i_height = x264_ctx.height;
-
-    // bitrate
-    x264_ctx.params.rc.i_bitrate = x264_ctx.bitrate;  // kbps
-    x264_ctx.params.rc.i_rc_method = X264_RC_ABR;
-
-    // fps
-    x264_ctx.params.i_fps_num = x264_ctx.fps;
-    x264_ctx.params.i_fps_den = 1;
-
-    // gop
-    x264_ctx.params.i_keyint_max = x264_ctx.gop;
-
-    if (x264_param_apply_profile(&x264_ctx.params, "baseline" ) < 0) {
-        LIBENC_LOGE("Fail to apply profile");
-        return JNI_FALSE;
-    }
-
-    x264_ctx.encoder = x264_encoder_open(&x264_ctx.params);
-    if (x264_ctx.encoder == NULL) {
-        LIBENC_LOGE("Fail to open x264 encoder!");
-        return JNI_FALSE;
-    }
-
-    return JNI_TRUE;
-}
-
-static JNINativeMethod libenc_methods[] = {
-    { "setEncoderResolution", "(II)V", (void *)libenc_setEncoderResolution },
-    { "setEncoderFps", "(I)V", (void *)libenc_setEncoderFps },
-    { "setEncoderGop", "(I)V", (void *)libenc_setEncoderGop },
-    { "setEncoderBitrate", "(I)V", (void *)libenc_setEncoderBitrate },
-    { "setEncoderPreset", "(Ljava/lang/String;)V", (void *)libenc_setEncoderPreset },
-    { "RGBAToI420", "([BIIZI)[B", (void *)libenc_RGBAToI420 },
-    { "RGBAToNV12", "([BIIZI)[B", (void *)libenc_RGBAToNV12 },
-    { "openSoftEncoder", "()Z", (void *)libenc_openSoftEncoder },
-    { "closeSoftEncoder", "()V", (void *)libenc_closeSoftEncoder },
-    { "RGBASoftEncode", "([BIIZIJ)I", (void *)libenc_RGBASoftEncode },
-};
-
-jint JNI_OnLoad(JavaVM* vm, void* reserved) {
-    jvm = vm;
-
-    if (jvm->GetEnv((void **) &jenv, JNI_VERSION_1_6) != JNI_OK) {
-        LIBENC_LOGE("Env not got");
-    	return JNI_ERR;
-    }
-
-    jclass clz = jenv->FindClass("net/ossrs/yasea/SrsEncoder");
-    if (clz == NULL) {
-        LIBENC_LOGE("Class \"net/ossrs/yasea/SrsEncoder\" not found");
-        return JNI_ERR;
-    }
-
-    if (jenv->RegisterNatives(clz, libenc_methods, LIBENC_ARRAY_ELEMS(libenc_methods))) {
-        LIBENC_LOGE("methods not registered");
-        return JNI_ERR;
-    }
-
-    return JNI_VERSION_1_6;
-}
diff --git a/android/src/main/libenc/jni/libs/armeabi-v7a/libx264.a b/android/src/main/libenc/jni/libs/armeabi-v7a/libx264.a
deleted file mode 100755
index 566f60d..0000000
Binary files a/android/src/main/libenc/jni/libs/armeabi-v7a/libx264.a and /dev/null differ
diff --git a/android/src/main/libenc/jni/libs/armeabi-v7a/libyuv.so b/android/src/main/libenc/jni/libs/armeabi-v7a/libyuv.so
deleted file mode 100755
index e217657..0000000
Binary files a/android/src/main/libenc/jni/libs/armeabi-v7a/libyuv.so and /dev/null differ
diff --git a/android/src/main/libenc/jni/libs/x86/libx264.a b/android/src/main/libenc/jni/libs/x86/libx264.a
deleted file mode 100755
index 7b229c3..0000000
Binary files a/android/src/main/libenc/jni/libs/x86/libx264.a and /dev/null differ
diff --git a/android/src/main/libenc/jni/libs/x86/libyuv.so b/android/src/main/libenc/jni/libs/x86/libyuv.so
deleted file mode 100755
index 36f1467..0000000
Binary files a/android/src/main/libenc/jni/libs/x86/libyuv.so and /dev/null differ
diff --git a/android/src/main/libenc/jni/libx264/.gitignore b/android/src/main/libenc/jni/libx264/.gitignore
deleted file mode 100755
index 174458b..0000000
--- a/android/src/main/libenc/jni/libx264/.gitignore
+++ /dev/null
@@ -1,49 +0,0 @@
-*~
-*.a
-*.diff
-*.orig
-*.rej
-*.dll*
-*.exe
-*.def
-*.lib
-*.pdb
-*.mo
-*.o
-*.patch
-*.pc
-*.pot
-*.so*
-*.dylib
-.*.swp
-.depend
-.DS_Store
-config.h
-config.mak
-config.log
-x264_config.h
-x264
-checkasm
-
-*.264
-*.h264
-*.2pass
-*.ffindex
-*.avs
-*.mkv
-*.flv
-*.mp4
-*.y4m
-*.yuv
-*.log
-*.mbtree
-*.temp
-*.pyc
-*.pgd
-*.pgc
-
-.digress_x264
-dataDec.txt
-log.dec
-common/oclobj.h
-x264_lookahead.clbin
diff --git a/android/src/main/libenc/jni/libx264/AUTHORS b/android/src/main/libenc/jni/libx264/AUTHORS
deleted file mode 100755
index d14deb8..0000000
--- a/android/src/main/libenc/jni/libx264/AUTHORS
+++ /dev/null
@@ -1,99 +0,0 @@
-# Contributors to x264
-#
-# The format of this file was inspired by the Linux kernel CREDITS file.
-# Authors are listed alphabetically.
-#
-# The fields are: name (N), email (E), web-address (W), CVS account login (C),
-# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
-
-N: Alex Izvorski
-E: aizvorski AT gmail DOT com
-D: x86 asm (sse2)
-
-N: Alex Wright
-E: alexw0885 AT gmail DOT com
-D: Motion estimation (subpel and mixed refs)
-D: B-RDO
-
-N: bobololo
-D: Avisynth input
-D: MP4 muxing
-
-N: Christian Heine
-E: sennindemokrit AT gmx DOT net
-D: x86 asm
-
-N: David Wolstencroft
-D: Altivec optimizations
-
-N: Eric Petit
-E: eric.petit AT lapsus DOT org
-C: titer
-D: Altivec asm
-D: BeOS and MacOS X ports.
-S: France
-
-N: Fiona Glaser
-E: fiona AT x264 DOT com
-D: Maintainer
-D: All areas of encoder analysis and algorithms
-D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
-D: x86 asm
-S: USA
-
-N: Gabriel Bouvigne
-E: bouvigne AT mp3-tech DOT org
-D: 2pass VBV
-
-N: Guillaume Poirier
-E: gpoirier CHEZ mplayerhq POINT hu
-D: Altivec optimizations
-S: Brittany, France
-
-N: Henrik Gramner
-E: henrik AT gramner DOT com
-D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
-S: Sweden
-
-N: Laurent Aimar
-E: fenrir AT videolan DOT org
-C: fenrir
-D: Intial import, former maintainer
-D: x86 asm (mmx/mmx2)
-S: France
-
-N: Loren Merritt
-E: pengvado AT akuvian DOT org
-C: pengvado
-D: Maintainer
-D: All areas of encoder analysis and algorithms
-D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
-D: Multithreading
-D: x86 asm
-S: USA
-
-N: Mans Rullgard
-E: mru AT mansr DOT com
-C: mru
-D: Rate control
-S: Southampton, UK
-
-N: Michael Niedermayer
-E: michaelni AT gmx DOT at
-D: Rate control
-
-N: Mike Matsnev
-E: mike AT po DOT cs DOT msu DOT su
-D: Matroska muxing
-
-N: Min Chen
-E: chenm001 AT 163 DOT com
-C: chenm001
-D: Win32/VC 6.0 port
-D: gcc asm to nasm conversion
-S: China
-
-N: Radek Czyz
-E: radoslaw AT syskin DOT cjb DOT net
-D: Cached motion compensation
-
diff --git a/android/src/main/libenc/jni/libx264/COPYING b/android/src/main/libenc/jni/libx264/COPYING
deleted file mode 100755
index d60c31a..0000000
--- a/android/src/main/libenc/jni/libx264/COPYING
+++ /dev/null
@@ -1,340 +0,0 @@
-		    GNU GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
-
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
-     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-			    Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-License is intended to guarantee your freedom to share and change free
-software--to make sure the software is free for all its users.  This
-General Public License applies to most of the Free Software
-Foundation's software and to any other program whose authors commit to
-using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
-
-  To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if you
-distribute copies of the software, or if you modify it.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must give the recipients all the rights that
-you have.  You must make sure that they, too, receive or can get the
-source code.  And you must show them these terms so they know their
-rights.
-
-  We protect your rights with two steps: (1) copyright the software, and
-(2) offer you this license which gives you legal permission to copy,
-distribute and/or modify the software.
-
-  Also, for each author's protection and ours, we want to make certain
-that everyone understands that there is no warranty for this free
-software.  If the software is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original, so
-that any problems introduced by others will not reflect on the original
-authors' reputations.
-
-  Finally, any free program is threatened constantly by software
-patents.  We wish to avoid the danger that redistributors of a free
-program will individually obtain patent licenses, in effect making the
-program proprietary.  To prevent this, we have made it clear that any
-patent must be licensed for everyone's free use or not licensed at all.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-		    GNU GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License applies to any program or other work which contains
-a notice placed by the copyright holder saying it may be distributed
-under the terms of this General Public License.  The "Program", below,
-refers to any such program or work, and a "work based on the Program"
-means either the Program or any derivative work under copyright law:
-that is to say, a work containing the Program or a portion of it,
-either verbatim or with modifications and/or translated into another
-language.  (Hereinafter, translation is included without limitation in
-the term "modification".)  Each licensee is addressed as "you".
-
-Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running the Program is not restricted, and the output from the Program
-is covered only if its contents constitute a work based on the
-Program (independent of having been made by running the Program).
-Whether that is true depends on what the Program does.
-
-  1. You may copy and distribute verbatim copies of the Program's
-source code as you receive it, in any medium, provided that you
-conspicuously and appropriately publish on each copy an appropriate
-copyright notice and disclaimer of warranty; keep intact all the
-notices that refer to this License and to the absence of any warranty;
-and give any other recipients of the Program a copy of this License
-along with the Program.
-
-You may charge a fee for the physical act of transferring a copy, and
-you may at your option offer warranty protection in exchange for a fee.
-
-  2. You may modify your copy or copies of the Program or any portion
-of it, thus forming a work based on the Program, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) You must cause the modified files to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    b) You must cause any work that you distribute or publish, that in
-    whole or in part contains or is derived from the Program or any
-    part thereof, to be licensed as a whole at no charge to all third
-    parties under the terms of this License.
-
-    c) If the modified program normally reads commands interactively
-    when run, you must cause it, when started running for such
-    interactive use in the most ordinary way, to print or display an
-    announcement including an appropriate copyright notice and a
-    notice that there is no warranty (or else, saying that you provide
-    a warranty) and that users may redistribute the program under
-    these conditions, and telling the user how to view a copy of this
-    License.  (Exception: if the Program itself is interactive but
-    does not normally print such an announcement, your work based on
-    the Program is not required to print an announcement.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Program,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Program, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Program.
-
-In addition, mere aggregation of another work not based on the Program
-with the Program (or with a work based on the Program) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may copy and distribute the Program (or a work based on it,
-under Section 2) in object code or executable form under the terms of
-Sections 1 and 2 above provided that you also do one of the following:
-
-    a) Accompany it with the complete corresponding machine-readable
-    source code, which must be distributed under the terms of Sections
-    1 and 2 above on a medium customarily used for software interchange; or,
-
-    b) Accompany it with a written offer, valid for at least three
-    years, to give any third party, for a charge no more than your
-    cost of physically performing source distribution, a complete
-    machine-readable copy of the corresponding source code, to be
-    distributed under the terms of Sections 1 and 2 above on a medium
-    customarily used for software interchange; or,
-
-    c) Accompany it with the information you received as to the offer
-    to distribute corresponding source code.  (This alternative is
-    allowed only for noncommercial distribution and only if you
-    received the program in object code or executable form with such
-    an offer, in accord with Subsection b above.)
-
-The source code for a work means the preferred form of the work for
-making modifications to it.  For an executable work, complete source
-code means all the source code for all modules it contains, plus any
-associated interface definition files, plus the scripts used to
-control compilation and installation of the executable.  However, as a
-special exception, the source code distributed need not include
-anything that is normally distributed (in either source or binary
-form) with the major components (compiler, kernel, and so on) of the
-operating system on which the executable runs, unless that component
-itself accompanies the executable.
-
-If distribution of executable or object code is made by offering
-access to copy from a designated place, then offering equivalent
-access to copy the source code from the same place counts as
-distribution of the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  4. You may not copy, modify, sublicense, or distribute the Program
-except as expressly provided under this License.  Any attempt
-otherwise to copy, modify, sublicense or distribute the Program is
-void, and will automatically terminate your rights under this License.
-However, parties who have received copies, or rights, from you under
-this License will not have their licenses terminated so long as such
-parties remain in full compliance.
-
-  5. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Program or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Program (or any work based on the
-Program), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Program or works based on it.
-
-  6. Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the
-original licensor to copy, distribute or modify the Program subject to
-these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
-this License.
-
-  7. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Program at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Program by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Program.
-
-If any portion of this section is held invalid or unenforceable under
-any particular circumstance, the balance of the section is intended to
-apply and the section as a whole is intended to apply in other
-circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system, which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  8. If the distribution and/or use of the Program is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Program under this License
-may add an explicit geographical distribution limitation excluding
-those countries, so that distribution is permitted only in or among
-countries not thus excluded.  In such case, this License incorporates
-the limitation as if written in the body of this License.
-
-  9. The Free Software Foundation may publish revised and/or new versions
-of the General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Program
-specifies a version number of this License which applies to it and "any
-later version", you have the option of following the terms and conditions
-either of that version or of any later version published by the Free
-Software Foundation.  If the Program does not specify a version number of
-this License, you may choose any version ever published by the Free Software
-Foundation.
-
-  10. If you wish to incorporate parts of the Program into other free
-programs whose distribution conditions are different, write to the author
-to ask for permission.  For software which is copyrighted by the Free
-Software Foundation, write to the Free Software Foundation; we sometimes
-make exceptions for this.  Our decision will be guided by the two goals
-of preserving the free status of all derivatives of our free software and
-of promoting the sharing and reuse of software generally.
-
-			    NO WARRANTY
-
-  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
-FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
-OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
-PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
-OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
-TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
-PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
-REPAIR OR CORRECTION.
-
-  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
-REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
-OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
-TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
-YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
-PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-
-	    How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-
-Also add information on how to contact you by electronic and paper mail.
-
-If the program is interactive, make it output a short notice like this
-when it starts in an interactive mode:
-
-    Gnomovision version 69, Copyright (C) year  name of author
-    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, the commands you use may
-be called something other than `show w' and `show c'; they could even be
-mouse-clicks or menu items--whatever suits your program.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the program, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
-  `Gnomovision' (which makes passes at compilers) written by James Hacker.
-
-  <signature of Ty Coon>, 1 April 1989
-  Ty Coon, President of Vice
-
-This General Public License does not permit incorporating your program into
-proprietary programs.  If your program is a subroutine library, you may
-consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
-Public License instead of this License.
diff --git a/android/src/main/libenc/jni/libx264/Makefile b/android/src/main/libenc/jni/libx264/Makefile
deleted file mode 100755
index d0b1633..0000000
--- a/android/src/main/libenc/jni/libx264/Makefile
+++ /dev/null
@@ -1,321 +0,0 @@
-# Makefile
-
-include config.mak
-
-vpath %.c $(SRCPATH)
-vpath %.h $(SRCPATH)
-vpath %.S $(SRCPATH)
-vpath %.asm $(SRCPATH)
-vpath %.rc $(SRCPATH)
-
-GENERATED =
-
-all: default
-default:
-
-SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
-       common/frame.c common/dct.c common/cpu.c common/cabac.c \
-       common/common.c common/osdep.c common/rectangle.c \
-       common/set.c common/quant.c common/deblock.c common/vlc.c \
-       common/mvpred.c common/bitstream.c \
-       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
-       encoder/set.c encoder/macroblock.c encoder/cabac.c \
-       encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
-
-SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
-         output/raw.c output/matroska.c output/matroska_ebml.c \
-         output/flv.c output/flv_bytestream.c filters/filters.c \
-         filters/video/video.c filters/video/source.c filters/video/internal.c \
-         filters/video/resize.c filters/video/cache.c filters/video/fix_vfr_pts.c \
-         filters/video/select_every.c filters/video/crop.c filters/video/depth.c
-
-SRCSO =
-OBJS =
-OBJSO =
-OBJCLI =
-
-OBJCHK = tools/checkasm.o
-
-OBJEXAMPLE = example.o
-
-CONFIG := $(shell cat config.h)
-
-# GPL-only files
-ifneq ($(findstring HAVE_GPL 1, $(CONFIG)),)
-SRCCLI +=
-endif
-
-# Optional module sources
-ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),)
-SRCCLI += input/avs.c
-endif
-
-ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),)
-SRCCLI += input/thread.c
-SRCS   += common/threadpool.c
-endif
-
-ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),)
-SRCS += common/win32thread.c
-endif
-
-ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),)
-SRCCLI += input/lavf.c
-endif
-
-ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),)
-SRCCLI += input/ffms.c
-endif
-
-ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),)
-SRCCLI += output/mp4.c
-endif
-
-ifneq ($(findstring HAVE_LSMASH 1, $(CONFIG)),)
-SRCCLI += output/mp4_lsmash.c
-endif
-
-# MMX/SSE optims
-ifneq ($(AS),)
-X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
-          mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm \
-          cpu-a.asm dct-32.asm bitstream-a.asm
-ifneq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
-X86SRC0 += sad16-a.asm
-else
-X86SRC0 += sad-a.asm
-endif
-X86SRC = $(X86SRC0:%=common/x86/%)
-
-ifeq ($(SYS_ARCH),X86)
-ARCH_X86 = yes
-ASMSRC   = $(X86SRC) common/x86/pixel-32.asm
-endif
-
-ifeq ($(SYS_ARCH),X86_64)
-ARCH_X86 = yes
-ASMSRC   = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm
-endif
-
-ifdef ARCH_X86
-SRCS   += common/x86/mc-c.c common/x86/predict-c.c
-OBJASM  = $(ASMSRC:%.asm=%.o)
-$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
-OBJCHK += tools/checkasm-a.o
-endif
-endif
-
-# AltiVec optims
-ifeq ($(SYS_ARCH),PPC)
-ifneq ($(AS),)
-SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
-        common/ppc/quant.c common/ppc/deblock.c \
-        common/ppc/predict.c
-endif
-endif
-
-# NEON optims
-ifeq ($(SYS_ARCH),ARM)
-ifneq ($(AS),)
-ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
-          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
-          common/arm/predict-a.S common/arm/bitstream-a.S
-SRCS   += common/arm/mc-c.c common/arm/predict-c.c
-OBJASM  = $(ASMSRC:%.S=%.o)
-OBJCHK += tools/checkasm-arm.o
-endif
-endif
-
-# AArch64 NEON optims
-ifeq ($(SYS_ARCH),AARCH64)
-ifneq ($(AS),)
-ASMSRC += common/aarch64/bitstream-a.S \
-          common/aarch64/cabac-a.S     \
-          common/aarch64/dct-a.S     \
-          common/aarch64/deblock-a.S \
-          common/aarch64/mc-a.S      \
-          common/aarch64/pixel-a.S   \
-          common/aarch64/predict-a.S \
-          common/aarch64/quant-a.S
-SRCS   += common/aarch64/asm-offsets.c \
-          common/aarch64/mc-c.c        \
-          common/aarch64/predict-c.c
-OBJASM  = $(ASMSRC:%.S=%.o)
-OBJCHK += tools/checkasm-aarch64.o
-endif
-endif
-
-# MSA optims
-ifeq ($(SYS_ARCH),MIPS)
-ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
-SRCS += common/mips/mc-c.c common/mips/dct-c.c \
-        common/mips/deblock-c.c common/mips/pixel-c.c \
-        common/mips/predict-c.c common/mips/quant-c.c
-endif
-endif
-
-ifneq ($(HAVE_GETOPT_LONG),1)
-SRCCLI += extras/getopt.c
-endif
-
-ifeq ($(SYS),WINDOWS)
-OBJCLI += $(if $(RC), x264res.o)
-ifneq ($(SONAME),)
-SRCSO  += x264dll.c
-OBJSO  += $(if $(RC), x264res.dll.o)
-endif
-endif
-
-ifeq ($(HAVE_OPENCL),yes)
-common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
-	cat $^ | $(SRCPATH)/tools/cltostr.sh $@
-GENERATED += common/oclobj.h
-SRCS += common/opencl.c encoder/slicetype-cl.c
-endif
-
-OBJS   += $(SRCS:%.c=%.o)
-OBJCLI += $(SRCCLI:%.c=%.o)
-OBJSO  += $(SRCSO:%.c=%.o)
-
-.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags
-
-cli: x264$(EXE)
-lib-static: $(LIBX264)
-lib-shared: $(SONAME)
-
-$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
-	rm -f $(LIBX264)
-	$(AR)$@ $(OBJS) $(OBJASM)
-	$(if $(RANLIB), $(RANLIB) $@)
-
-$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
-	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
-
-ifneq ($(EXE),)
-.PHONY: x264 checkasm example
-x264: x264$(EXE)
-checkasm: checkasm$(EXE)
-example: example$(EXE)
-endif
-
-x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
-	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-
-checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
-	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
-
-example$(EXE): $(GENERATED) .depend $(OBJEXAMPLE) $(LIBX264)
-	$(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
-
-$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
-
-%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
-	$(AS) $(ASFLAGS) -o $@ $<
-	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
-
-%.o: %.S
-	$(AS) $(ASFLAGS) -o $@ $<
-	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
-
-%.dll.o: %.rc x264.h
-	$(RC) $(RCFLAGS)$@ -DDLL $<
-
-%.o: %.rc x264.h
-	$(RC) $(RCFLAGS)$@ $<
-
-.depend: config.mak
-	@rm -f .depend
-	@echo 'dependency file generation...'
-ifeq ($(COMPILER),CL)
-	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;)
-else
-	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
-endif
-
-config.mak:
-	./configure
-
-depend: .depend
-ifneq ($(wildcard .depend),)
-include .depend
-endif
-
-SRC2 = $(SRCS) $(SRCCLI)
-# These should cover most of the important codepaths
-OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
-OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0  --slice-max-mbs 50
-OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
-OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
-OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
-OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
-OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
-OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac
-
-ifeq (,$(VIDS))
-fprofiled:
-	@echo 'usage: make fprofiled VIDS="infile1 infile2 ..."'
-	@echo 'where infiles are anything that x264 understands,'
-	@echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.'
-else
-fprofiled:
-	$(MAKE) clean
-	$(MAKE) x264$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)"
-	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
-ifeq ($(COMPILER),CL)
-# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
-	rm -f x264$(EXE)
-else
-	rm -f $(SRC2:%.c=%.o)
-endif
-	$(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
-	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
-endif
-
-clean:
-	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
-	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
-	rm -f example example.exe $(OBJEXAMPLE)
-	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
-
-distclean: clean
-	rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest*
-
-install-cli: cli
-	$(INSTALL) -d $(DESTDIR)$(bindir)
-	$(INSTALL) x264$(EXE) $(DESTDIR)$(bindir)
-
-install-lib-dev:
-	$(INSTALL) -d $(DESTDIR)$(includedir)
-	$(INSTALL) -d $(DESTDIR)$(libdir)
-	$(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig
-	$(INSTALL) -m 644 $(SRCPATH)/x264.h $(DESTDIR)$(includedir)
-	$(INSTALL) -m 644 x264_config.h $(DESTDIR)$(includedir)
-	$(INSTALL) -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
-
-install-lib-static: lib-static install-lib-dev
-	$(INSTALL) -m 644 $(LIBX264) $(DESTDIR)$(libdir)
-	$(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264))
-
-install-lib-shared: lib-shared install-lib-dev
-ifneq ($(IMPLIBNAME),)
-	$(INSTALL) -d $(DESTDIR)$(bindir)
-	$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir)
-	$(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir)
-else ifneq ($(SONAME),)
-	ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
-	$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir)
-endif
-
-uninstall:
-	rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(includedir)/x264_config.h $(DESTDIR)$(libdir)/libx264.a
-	rm -f $(DESTDIR)$(bindir)/x264$(EXE) $(DESTDIR)$(libdir)/pkgconfig/x264.pc
-ifneq ($(IMPLIBNAME),)
-	rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME)
-else ifneq ($(SONAME),)
-	rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
-endif
-
-etags: TAGS
-
-TAGS:
-	etags $(SRCS)
diff --git a/android/src/main/libenc/jni/libx264/android_build_armeabi_v7a.sh b/android/src/main/libenc/jni/libx264/android_build_armeabi_v7a.sh
deleted file mode 100755
index d0ca615..0000000
--- a/android/src/main/libenc/jni/libx264/android_build_armeabi_v7a.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/sh
-
-ANDROID_NDK=/home/leoma/MyOSP/android-ndk-r13b
-SYSROOT=$ANDROID_NDK/platforms/android-19/arch-arm
-CROSS_PREFIX=$ANDROID_NDK/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-
-EXTRA_CFLAGS="-march=armv7-a -mfloat-abi=softfp -mfpu=neon -D__ANDROID__ -D__ARM_ARCH_7__ -D__ARM_ARCH_7A__"
-EXTRA_LDFLAGS="-nostdlib"
-PREFIX=`pwd`/libs/armeabi-v7a
-
-./configure --prefix=$PREFIX \
-        --host=arm-linux \
-        --sysroot=$SYSROOT \
-        --cross-prefix=$CROSS_PREFIX \
-        --extra-cflags="$EXTRA_CFLAGS" \
-        --extra-ldflags="$EXTRA_LDFLAGS" \
-        --enable-pic \
-        --enable-static \
-        --enable-strip \
-        --disable-cli \
-        --disable-win32thread \
-        --disable-avs \
-        --disable-swscale \
-        --disable-lavf \
-        --disable-ffms \
-        --disable-gpac \
-        --disable-lsmash
-
-make clean
-make STRIP= -j8 install || exit 1
-
-cp -f $PREFIX/lib/libx264.a $PREFIX
-rm -rf $PREFIX/include $PREFIX/lib $PREFIX/pkgconfig
diff --git a/android/src/main/libenc/jni/libx264/android_build_x86.sh b/android/src/main/libenc/jni/libx264/android_build_x86.sh
deleted file mode 100755
index 05dbd7c..0000000
--- a/android/src/main/libenc/jni/libx264/android_build_x86.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/sh
-
-ANDROID_NDK=/home/leoma/MyOSP/android-ndk-r13b
-SYSROOT=$ANDROID_NDK/platforms/android-19/arch-x86
-CROSS_PREFIX=$ANDROID_NDK/toolchains/x86-4.9/prebuilt/linux-x86_64/bin/i686-linux-android-
-EXTRA_CFLAGS="-D__ANDROID__ -D__i686__"
-EXTRA_LDFLAGS="-nostdlib"
-PREFIX=`pwd`/libs/x86
-
-./configure --prefix=$PREFIX \
-        --host=i686-linux \
-        --sysroot=$SYSROOT \
-        --cross-prefix=$CROSS_PREFIX \
-        --extra-cflags="$EXTRA_CFLAGS" \
-        --extra-ldflags="$EXTRA_LDFLAGS" \
-        --enable-pic \
-        --enable-static \
-        --enable-strip \
-        --disable-cli \
-        --disable-win32thread \
-        --disable-avs \
-        --disable-swscale \
-        --disable-lavf \
-        --disable-ffms \
-        --disable-gpac \
-        --disable-lsmash
-
-make clean
-make STRIP= -j8 install || exit 1
-
-cp -f $PREFIX/lib/libx264.a $PREFIX
-rm -rf $PREFIX/include $PREFIX/lib $PREFIX/pkgconfig
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/asm-offsets.c b/android/src/main/libenc/jni/libx264/common/aarch64/asm-offsets.c
deleted file mode 100755
index 5685f1d..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/asm-offsets.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*****************************************************************************
- * asm-offsets.c: check asm offsets for aarch64
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "asm-offsets.h"
-
-#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
-{ \
-    int m_##m[2 * (offsetof(s, m) == o) - 1]; \
-}
-
-X264_CHECK_OFFSET(x264_cabac_t, i_low,               CABAC_I_LOW);
-X264_CHECK_OFFSET(x264_cabac_t, i_range,             CABAC_I_RANGE);
-X264_CHECK_OFFSET(x264_cabac_t, i_queue,             CABAC_I_QUEUE);
-X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
-X264_CHECK_OFFSET(x264_cabac_t, p_start,             CABAC_P_START);
-X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
-X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
-X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
-X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/asm-offsets.h b/android/src/main/libenc/jni/libx264/common/aarch64/asm-offsets.h
deleted file mode 100755
index 8968be4..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/asm-offsets.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*****************************************************************************
- * asm-offsets.h: asm offsets for aarch64
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_AARCH64_ASM_OFFSETS_H
-#define X264_AARCH64_ASM_OFFSETS_H
-
-#define CABAC_I_LOW                 0x00
-#define CABAC_I_RANGE               0x04
-#define CABAC_I_QUEUE               0x08
-#define CABAC_I_BYTES_OUTSTANDING   0x0c
-#define CABAC_P_START               0x10
-#define CABAC_P                     0x18
-#define CABAC_P_END                 0x20
-#define CABAC_F8_BITS_ENCODED       0x30
-#define CABAC_STATE                 0x34
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/asm.S b/android/src/main/libenc/jni/libx264/common/aarch64/asm.S
deleted file mode 100755
index 106a0af..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/asm.S
+++ /dev/null
@@ -1,221 +0,0 @@
-/*****************************************************************************
- * asm.S: AArch64 utility macros
- *****************************************************************************
- * Copyright (C) 2008-2016 x264 project
- *
- * Authors: Mans Rullgard <mans@mansr.com>
- *          David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "config.h"
-
-#ifdef PREFIX
-#   define EXTERN_ASM _
-#else
-#   define EXTERN_ASM
-#endif
-
-#ifdef __ELF__
-#   define ELF
-#else
-#   define ELF  #
-#endif
-
-#ifdef __MACH__
-#   define MACH
-#else
-#   define MACH #
-#endif
-
-#if HAVE_AS_FUNC
-#   define FUNC
-#else
-#   define FUNC #
-#endif
-
-.macro  function name, export=0, align=2
-    .macro endfunc
-ELF     .size   \name, . - \name
-FUNC    .endfunc
-        .purgem endfunc
-    .endm
-        .text
-        .align          \align
-    .if \export
-        .global EXTERN_ASM\name
-ELF     .type   EXTERN_ASM\name, %function
-FUNC    .func   EXTERN_ASM\name
-EXTERN_ASM\name:
-    .else
-ELF     .type   \name, %function
-FUNC    .func   \name
-\name:
-    .endif
-.endm
-
-.macro  const   name, align=2
-    .macro endconst
-ELF     .size   \name, . - \name
-        .purgem endconst
-    .endm
-ELF     .section        .rodata
-MACH    .const_data
-        .align          \align
-\name:
-.endm
-
-.macro  movrel rd, val
-#if defined(PIC) && defined(__APPLE__)
-        adrp            \rd, \val@PAGE
-        add             \rd, \rd, \val@PAGEOFF
-#elif defined(PIC)
-        adrp            \rd, \val
-        add             \rd, \rd, :lo12:\val
-#else
-        ldr             \rd, =\val
-#endif
-.endm
-
-#define GLUE(a, b) a ## b
-#define JOIN(a, b) GLUE(a, b)
-#define X(s) JOIN(EXTERN_ASM, s)
-
-#define FDEC_STRIDE 32
-#define FENC_STRIDE 16
-
-
-.macro SUMSUB_AB   sum, sub, a, b
-    add        \sum,  \a,  \b
-    sub        \sub,  \a,  \b
-.endm
-
-.macro unzip t1, t2, s1, s2
-    uzp1        \t1,  \s1,  \s2
-    uzp2        \t2,  \s1,  \s2
-.endm
-
-.macro transpose t1, t2, s1, s2
-    trn1        \t1,  \s1,  \s2
-    trn2        \t2,  \s1,  \s2
-.endm
-
-.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
-    transpose   \t0\().2s,  \t2\().2s,  \v0\().2s,  \v2\().2s
-    transpose   \t1\().2s,  \t3\().2s,  \v1\().2s,  \v3\().2s
-    transpose   \v0\().4h,  \v1\().4h,  \t0\().4h,  \t1\().4h
-    transpose   \v2\().4h,  \v3\().4h,  \t2\().4h,  \t3\().4h
-.endm
-
-.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
-    transpose   \t0\().4s,  \t2\().4s,  \v0\().4s,  \v2\().4s
-    transpose   \t1\().4s,  \t3\().4s,  \v1\().4s,  \v3\().4s
-    transpose   \v0\().8h,  \v1\().8h,  \t0\().8h,  \t1\().8h
-    transpose   \v2\().8h,  \v3\().8h,  \t2\().8h,  \t3\().8h
-.endm
-
-
-.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-    trn1        \r8\().8H,  \r0\().8H,  \r1\().8H
-    trn2        \r9\().8H,  \r0\().8H,  \r1\().8H
-    trn1        \r1\().8H,  \r2\().8H,  \r3\().8H
-    trn2        \r3\().8H,  \r2\().8H,  \r3\().8H
-    trn1        \r0\().8H,  \r4\().8H,  \r5\().8H
-    trn2        \r5\().8H,  \r4\().8H,  \r5\().8H
-    trn1        \r2\().8H,  \r6\().8H,  \r7\().8H
-    trn2        \r7\().8H,  \r6\().8H,  \r7\().8H
-
-    trn1        \r4\().4S,  \r0\().4S,  \r2\().4S
-    trn2        \r2\().4S,  \r0\().4S,  \r2\().4S
-    trn1        \r6\().4S,  \r5\().4S,  \r7\().4S
-    trn2        \r7\().4S,  \r5\().4S,  \r7\().4S
-    trn1        \r5\().4S,  \r9\().4S,  \r3\().4S
-    trn2        \r9\().4S,  \r9\().4S,  \r3\().4S
-    trn1        \r3\().4S,  \r8\().4S,  \r1\().4S
-    trn2        \r8\().4S,  \r8\().4S,  \r1\().4S
-
-    trn1        \r0\().2D,  \r3\().2D,  \r4\().2D
-    trn2        \r4\().2D,  \r3\().2D,  \r4\().2D
-
-    trn1        \r1\().2D,  \r5\().2D,  \r6\().2D
-    trn2        \r5\().2D,  \r5\().2D,  \r6\().2D
-
-    trn2        \r6\().2D,  \r8\().2D,  \r2\().2D
-    trn1        \r2\().2D,  \r8\().2D,  \r2\().2D
-
-    trn1        \r3\().2D,  \r9\().2D,  \r7\().2D
-    trn2        \r7\().2D,  \r9\().2D,  \r7\().2D
-.endm
-
-.macro  transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-    trn1        \t0\().16b, \r0\().16b, \r1\().16b
-    trn2        \t1\().16b, \r0\().16b, \r1\().16b
-    trn1        \r1\().16b, \r2\().16b, \r3\().16b
-    trn2        \r3\().16b, \r2\().16b, \r3\().16b
-    trn1        \r0\().16b, \r4\().16b, \r5\().16b
-    trn2        \r5\().16b, \r4\().16b, \r5\().16b
-    trn1        \r2\().16b, \r6\().16b, \r7\().16b
-    trn2        \r7\().16b, \r6\().16b, \r7\().16b
-
-    trn1        \r4\().8h,  \r0\().8h,  \r2\().8h
-    trn2        \r2\().8h,  \r0\().8h,  \r2\().8h
-    trn1        \r6\().8h,  \r5\().8h,  \r7\().8h
-    trn2        \r7\().8h,  \r5\().8h,  \r7\().8h
-    trn1        \r5\().8h,  \t1\().8h,  \r3\().8h
-    trn2        \t1\().8h,  \t1\().8h,  \r3\().8h
-    trn1        \r3\().8h,  \t0\().8h,  \r1\().8h
-    trn2        \t0\().8h,  \t0\().8h,  \r1\().8h
-
-    trn1        \r0\().4s,  \r3\().4s,  \r4\().4s
-    trn2        \r4\().4s,  \r3\().4s,  \r4\().4s
-
-    trn1        \r1\().4s,  \r5\().4s,  \r6\().4s
-    trn2        \r5\().4s,  \r5\().4s,  \r6\().4s
-
-    trn2        \r6\().4s,  \t0\().4s,  \r2\().4s
-    trn1        \r2\().4s,  \t0\().4s,  \r2\().4s
-
-    trn1        \r3\().4s,  \t1\().4s,  \r7\().4s
-    trn2        \r7\().4s,  \t1\().4s,  \r7\().4s
-.endm
-
-.macro  transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
-    trn1        \t4\().16b, \r0\().16b,  \r1\().16b
-    trn2        \t5\().16b, \r0\().16b,  \r1\().16b
-    trn1        \t6\().16b, \r2\().16b,  \r3\().16b
-    trn2        \t7\().16b, \r2\().16b,  \r3\().16b
-
-    trn1        \r0\().8h,  \t4\().8h,  \t6\().8h
-    trn2        \r2\().8h,  \t4\().8h,  \t6\().8h
-    trn1        \r1\().8h,  \t5\().8h,  \t7\().8h
-    trn2        \r3\().8h,  \t5\().8h,  \t7\().8h
-.endm
-
-.macro  transpose_4x8.b  r0, r1, r2, r3, t4, t5, t6, t7
-    trn1        \t4\().8b,  \r0\().8b,  \r1\().8b
-    trn2        \t5\().8b,  \r0\().8b,  \r1\().8b
-    trn1        \t6\().8b,  \r2\().8b,  \r3\().8b
-    trn2        \t7\().8b,  \r2\().8b,  \r3\().8b
-
-    trn1        \r0\().4h,  \t4\().4h,  \t6\().4h
-    trn2        \r2\().4h,  \t4\().4h,  \t6\().4h
-    trn1        \r1\().4h,  \t5\().4h,  \t7\().4h
-    trn2        \r3\().4h,  \t5\().4h,  \t7\().4h
-.endm
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/bitstream-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/bitstream-a.S
deleted file mode 100755
index 3ec7c27..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/bitstream-a.S
+++ /dev/null
@@ -1,82 +0,0 @@
-/*****************************************************************************
- * bitstream-a.S: aarch64 bitstream functions
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-function x264_nal_escape_neon, export=1
-    movi        v0.16b,  #0xff
-    movi        v4.16b,  #4
-    mov         w3,  #3
-    subs        x6,  x1,  x2
-    cbz         x6,  99f
-0:
-    cmn         x6,  #15
-    b.lt        16f
-    mov         x1,  x2
-    b           100f
-16:
-    ld1         {v1.16b}, [x1], #16
-    ext         v2.16b, v0.16b, v1.16b, #14
-    ext         v3.16b, v0.16b, v1.16b, #15
-    cmhi        v7.16b, v4.16b, v1.16b
-    cmeq        v5.16b, v2.16b, #0
-    cmeq        v6.16b, v3.16b, #0
-    and         v5.16b, v5.16b, v7.16b
-    and         v5.16b, v5.16b, v6.16b
-    shrn        v7.8b,  v5.8h,  #4
-    mov         x7,  v7.d[0]
-    cbz         x7,  16f
-    mov         x6,  #-16
-100:
-    umov        w5,  v0.b[14]
-    umov        w4,  v0.b[15]
-    orr         w5,  w4,  w5, lsl #8
-101:
-    ldrb        w4,  [x1, x6]
-    orr         w9,  w4,  w5, lsl #16
-    cmp         w9,  #3
-    b.hi        102f
-    strb        w3,  [x0], #1
-    orr         w5,  w3,  w5, lsl #8
-102:
-    adds        x6,  x6,  #1
-    strb        w4,  [x0], #1
-    orr         w5,  w4,  w5, lsl #8
-    b.lt        101b
-    subs        x6,  x1,  x2
-    lsr         w9,  w5,  #8
-    mov         v0.b[14],  w9
-    mov         v0.b[15],  w5
-    b.lt        0b
-
-    ret
-16:
-    subs        x6,  x1,  x2
-    st1         {v1.16b}, [x0], #16
-    mov         v0.16b, v1.16b
-    b.lt        0b
-99:
-    ret
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/cabac-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/cabac-a.S
deleted file mode 100755
index ace336f..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/cabac-a.S
+++ /dev/null
@@ -1,122 +0,0 @@
-/*****************************************************************************
- * cabac-a.S: aarch64 cabac
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-#include "asm-offsets.h"
-
-// w11 holds x264_cabac_t.i_low
-// w12 holds x264_cabac_t.i_range
-
-function x264_cabac_encode_decision_asm, export=1
-    movrel      x8,  X(x264_cabac_range_lps)
-    movrel      x9,  X(x264_cabac_transition)
-    add         w10, w1, #CABAC_STATE
-    ldrb        w3,  [x0,  x10]         // i_state
-    ldr         w12, [x0,  #CABAC_I_RANGE]
-    and         x4,  x3,  #~1
-    asr         w5,  w12, #6
-    add         x8,  x8,  x4, lsl #1
-    sub         w5,  w5,  #4
-    eor         w6,  w2,  w3            // b ^ i_state
-    ldrb        w4,  [x8,  x5]          // i_range_lps
-    ldr         w11, [x0, #CABAC_I_LOW]
-    sub         w12, w12, w4
-    tbz         w6,  #0,  1f            // (b ^ i_state) & 1
-    add         w11, w11, w12
-    mov         w12,  w4
-1:
-    orr         w4,  w2,  w3, lsl #1
-    ldrb        w9,  [x9,  x4]
-    strb        w9,  [x0,  x10]    // i_state
-
-cabac_encode_renorm:
-    clz         w5,  w12
-    ldr         w2,  [x0, #CABAC_I_QUEUE]
-    sub         w5,  w5,  #23
-    lsl         w12, w12, w5
-    lsl         w11, w11, w5
-2:
-    adds        w2,  w2,  w5
-    str         w12, [x0, #CABAC_I_RANGE]
-    b.lt        0f
-cabac_putbyte:
-    mov         w13, #0x400
-    add         w12, w2,  #10
-    lsl         w13, w13, w2
-    asr         w4,  w11, w12           // out
-    sub         w2,  w2,  #8
-    sub         w13, w13, #1
-    subs        w5,  w4,  #0xff
-    and         w11, w11, w13
-    ldr         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
-    str         w2,  [x0, #CABAC_I_QUEUE]
-    b.ne        1f
-
-    add         w6,  w6,  #1
-    str         w11, [x0, #CABAC_I_LOW]
-    str         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
-    ret
-
-1:
-    ldr         x7,  [x0, #CABAC_P]
-    asr         w5,  w4,  #8            // carry
-    ldrb        w8,  [x7, #-1]
-    add         w8,  w8,  w5
-    sub         w5,  w5,  #1
-    strb        w8,  [x7, #-1]
-    cbz         w6,  3f
-2:
-    subs        w6,  w6,  #1
-    strb        w5,  [x7],  #1
-    b.gt        2b
-3:
-    strb        w4,  [x7],  #1
-    str         wzr, [x0, #CABAC_I_BYTES_OUTSTANDING]
-    str         x7,  [x0, #CABAC_P]
-0:
-    str         w11, [x0, #CABAC_I_LOW]
-    str         w2,  [x0, #CABAC_I_QUEUE]
-    ret
-endfunc
-
-function x264_cabac_encode_bypass_asm, export=1
-    ldr         w12, [x0, #CABAC_I_RANGE]
-    ldr         w11, [x0, #CABAC_I_LOW]
-    ldr         w2,  [x0, #CABAC_I_QUEUE]
-    and         w1,  w1,  w12
-    add         w11, w1,  w11, lsl #1
-    adds        w2,  w2,  #1
-    b.ge        cabac_putbyte
-    str         w11, [x0, #CABAC_I_LOW]
-    str         w2,  [x0, #CABAC_I_QUEUE]
-    ret
-endfunc
-
-function x264_cabac_encode_terminal_asm, export=1
-    ldr         w12, [x0, #CABAC_I_RANGE]
-    ldr         w11, [x0, #CABAC_I_LOW]
-    sub         w12, w12, #2
-    b           cabac_encode_renorm
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/dct-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/dct-a.S
deleted file mode 100755
index 525a569..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/dct-a.S
+++ /dev/null
@@ -1,1008 +0,0 @@
-/****************************************************************************
- * dct-a.S: aarch64 transform and zigzag
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-const scan4x4_frame, align=4
-.byte    0,1,   8,9,   2,3,   4,5
-.byte   10,11, 16,17, 24,25, 18,19
-.byte   12,13,  6,7,  14,15, 20,21
-.byte   26,27, 28,29, 22,23, 30,31
-endconst
-
-const scan4x4_field, align=4
-.byte    0,1,   2,3,   8,9,   4,5
-.byte    6,7,  10,11, 12,13, 14,15
-endconst
-
-const sub4x4_frame, align=4
-.byte    0,  1,  4,  8
-.byte    5,  2,  3,  6
-.byte    9, 12, 13, 10
-.byte    7, 11, 14, 15
-endconst
-
-const sub4x4_field, align=4
-.byte    0,  4,  1,  8
-.byte   12,  5,  9, 13
-.byte    2,  6, 10, 14
-.byte    3,  7, 11, 15
-endconst
-
-// sum = a + (b>>shift)   sub = (a>>shift) - b
-.macro SUMSUB_SHR shift sum sub a b t0 t1
-    sshr        \t0,  \b, #\shift
-    sshr        \t1,  \a, #\shift
-    add         \sum, \a, \t0
-    sub         \sub, \t1, \b
-.endm
-
-// sum = (a>>shift) + b   sub = a - (b>>shift)
-.macro SUMSUB_SHR2 shift sum sub a b t0 t1
-    sshr        \t0,  \a, #\shift
-    sshr        \t1,  \b, #\shift
-    add         \sum, \t0, \b
-    sub         \sub, \a, \t1
-.endm
-
-// a += 1.5*ma   b -= 1.5*mb
-.macro SUMSUB_15 a b ma mb t0 t1
-    sshr        \t0, \ma, #1
-    sshr        \t1, \mb, #1
-    add         \t0, \t0, \ma
-    add         \t1, \t1, \mb
-    add         \a,  \a,  \t0
-    sub         \b,  \b,  \t1
-.endm
-
-
-function x264_dct4x4dc_neon, export=1
-    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
-    movi        v31.4h, #1
-    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
-    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
-    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
-    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
-    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
-    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
-    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
-    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
-    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
-    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
-    add         v16.4h, v4.4h,  v31.4h
-    add         v17.4h, v6.4h,  v31.4h
-    srhadd      v0.4h,  v4.4h,  v5.4h
-    shsub       v1.4h,  v16.4h, v5.4h
-    shsub       v2.4h,  v17.4h, v7.4h
-    srhadd      v3.4h,  v6.4h,  v7.4h
-    st1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
-    ret
-endfunc
-
-function x264_idct4x4dc_neon, export=1
-    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
-    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
-    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
-    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
-    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
-    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
-    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
-    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
-    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
-    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
-    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
-    SUMSUB_AB   v0.4h,  v1.4h,  v4.4h,  v5.4h
-    SUMSUB_AB   v3.4h,  v2.4h,  v6.4h,  v7.4h
-    st1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
-    ret
-endfunc
-
-.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
-    SUMSUB_AB   \v1, \v6, \v5, \v6
-    SUMSUB_AB   \v3, \v7, \v4, \v7
-    add         \v0, \v3, \v1
-    add         \v4, \v7, \v7
-    add         \v5, \v6, \v6
-    sub         \v2, \v3, \v1
-    add         \v1, \v4, \v6
-    sub         \v3, \v7, \v5
-.endm
-
-function x264_sub4x4_dct_neon, export=1
-    mov         x3, #FENC_STRIDE
-    mov         x4, #FDEC_STRIDE
-    ld1        {v0.s}[0], [x1], x3
-    ld1        {v1.s}[0], [x2], x4
-    ld1        {v2.s}[0], [x1], x3
-    usubl       v16.8h, v0.8b,  v1.8b
-    ld1        {v3.s}[0], [x2], x4
-    ld1        {v4.s}[0], [x1], x3
-    usubl       v17.8h, v2.8b,  v3.8b
-    ld1        {v5.s}[0], [x2], x4
-    ld1        {v6.s}[0], [x1], x3
-    usubl       v18.8h, v4.8b,  v5.8b
-    ld1        {v7.s}[0], [x2], x4
-    usubl       v19.8h, v6.8b,  v7.8b
-
-    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
-    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
-    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
-    st1        {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
-    ret
-endfunc
-
-function x264_sub8x4_dct_neon
-    ld1        {v0.8b}, [x1], x3
-    ld1        {v1.8b}, [x2], x4
-    usubl       v16.8h, v0.8b,  v1.8b
-    ld1        {v2.8b}, [x1], x3
-    ld1        {v3.8b}, [x2], x4
-    usubl       v17.8h, v2.8b,  v3.8b
-    ld1        {v4.8b}, [x1], x3
-    ld1        {v5.8b}, [x2], x4
-    usubl       v18.8h, v4.8b,  v5.8b
-    ld1        {v6.8b}, [x1], x3
-    ld1        {v7.8b}, [x2], x4
-    usubl       v19.8h, v6.8b,  v7.8b
-
-    DCT_1D      v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
-    transpose4x8.h  v0, v1, v2, v3, v4, v5, v6, v7
-
-    SUMSUB_AB   v16.8h, v19.8h, v0.8h,  v3.8h
-    SUMSUB_AB   v17.8h, v18.8h, v1.8h,  v2.8h
-    add         v22.8h, v19.8h, v19.8h
-    add         v21.8h, v18.8h, v18.8h
-    add         v0.8h,  v16.8h, v17.8h
-    sub         v1.8h,  v16.8h, v17.8h
-
-    add         v2.8h,  v22.8h, v18.8h
-    sub         v3.8h,  v19.8h, v21.8h
-
-    zip1        v4.2d,  v0.2d,  v2.2d
-    zip2        v6.2d,  v0.2d,  v2.2d
-    zip1        v5.2d,  v1.2d,  v3.2d
-    zip2        v7.2d,  v1.2d,  v3.2d
-
-    st1        {v4.8h}, [x0], #16
-    st1        {v5.8h}, [x0], #16
-    st1        {v6.8h}, [x0], #16
-    st1        {v7.8h}, [x0], #16
-    ret
-endfunc
-
-function x264_sub8x8_dct_neon, export=1
-    mov         x5,  x30
-    mov         x3, #FENC_STRIDE
-    mov         x4, #FDEC_STRIDE
-    bl          x264_sub8x4_dct_neon
-    mov         x30, x5
-    b           x264_sub8x4_dct_neon
-endfunc
-
-function x264_sub16x16_dct_neon, export=1
-    mov         x5,  x30
-    mov         x3, #FENC_STRIDE
-    mov         x4, #FDEC_STRIDE
-    bl          x264_sub8x4_dct_neon
-    bl          x264_sub8x4_dct_neon
-    sub         x1, x1, #8*FENC_STRIDE-8
-    sub         x2, x2, #8*FDEC_STRIDE-8
-    bl          x264_sub8x4_dct_neon
-    bl          x264_sub8x4_dct_neon
-    sub         x1, x1, #8
-    sub         x2, x2, #8
-    bl          x264_sub8x4_dct_neon
-    bl          x264_sub8x4_dct_neon
-    sub         x1, x1, #8*FENC_STRIDE-8
-    sub         x2, x2, #8*FDEC_STRIDE-8
-    bl          x264_sub8x4_dct_neon
-    mov         x30, x5
-    b           x264_sub8x4_dct_neon
-endfunc
-
-
-.macro DCT8_1D type
-    SUMSUB_AB   v18.8h, v17.8h, v3.8h,  v4.8h   // s34/d34
-    SUMSUB_AB   v19.8h, v16.8h, v2.8h,  v5.8h   // s25/d25
-    SUMSUB_AB   v22.8h, v21.8h, v1.8h,  v6.8h   // s16/d16
-    SUMSUB_AB   v23.8h, v20.8h, v0.8h,  v7.8h   // s07/d07
-
-    SUMSUB_AB   v24.8h, v26.8h,  v23.8h, v18.8h  // a0/a2
-    SUMSUB_AB   v25.8h, v27.8h,  v22.8h, v19.8h  // a1/a3
-
-    SUMSUB_AB   v30.8h, v29.8h,  v20.8h, v17.8h  // a6/a5
-    sshr        v23.8h, v21.8h, #1
-    sshr        v18.8h, v16.8h, #1
-    add         v23.8h, v23.8h, v21.8h
-    add         v18.8h, v18.8h, v16.8h
-    sub         v30.8h, v30.8h, v23.8h
-    sub         v29.8h, v29.8h, v18.8h
-
-    SUMSUB_AB   v28.8h, v31.8h,  v21.8h, v16.8h   // a4/a7
-    sshr        v22.8h, v20.8h, #1
-    sshr        v19.8h, v17.8h, #1
-    add         v22.8h, v22.8h, v20.8h
-    add         v19.8h, v19.8h, v17.8h
-    add         v22.8h, v28.8h, v22.8h
-    add         v31.8h, v31.8h, v19.8h
-
-    SUMSUB_AB      v0.8h,  v4.8h,  v24.8h, v25.8h
-    SUMSUB_SHR  2, v1.8h,  v7.8h,  v22.8h, v31.8h, v16.8h, v17.8h
-    SUMSUB_SHR  1, v2.8h,  v6.8h,  v26.8h, v27.8h, v18.8h, v19.8h
-    SUMSUB_SHR2 2, v3.8h,  v5.8h,  v30.8h, v29.8h, v20.8h, v21.8h
-.endm
-
-function x264_sub8x8_dct8_neon, export=1
-    mov         x3, #FENC_STRIDE
-    mov         x4, #FDEC_STRIDE
-    ld1        {v16.8b}, [x1], x3
-    ld1        {v17.8b}, [x2], x4
-    ld1        {v18.8b}, [x1], x3
-    ld1        {v19.8b}, [x2], x4
-    usubl       v0.8h,  v16.8b, v17.8b
-    ld1        {v20.8b}, [x1], x3
-    ld1        {v21.8b}, [x2], x4
-    usubl       v1.8h,  v18.8b, v19.8b
-    ld1        {v22.8b}, [x1], x3
-    ld1        {v23.8b}, [x2], x4
-    usubl       v2.8h,  v20.8b, v21.8b
-    ld1        {v24.8b}, [x1], x3
-    ld1        {v25.8b}, [x2], x4
-    usubl       v3.8h,  v22.8b, v23.8b
-    ld1        {v26.8b}, [x1], x3
-    ld1        {v27.8b}, [x2], x4
-    usubl       v4.8h,  v24.8b, v25.8b
-    ld1        {v28.8b}, [x1], x3
-    ld1        {v29.8b}, [x2], x4
-    usubl       v5.8h,  v26.8b, v27.8b
-    ld1        {v30.8b}, [x1], x3
-    ld1        {v31.8b}, [x2], x4
-    usubl       v6.8h,  v28.8b, v29.8b
-    usubl       v7.8h,  v30.8b, v31.8b
-
-    DCT8_1D row
-    transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
-    DCT8_1D col
-
-    st1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
-    st1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
-    ret
-endfunc
-
-function x264_sub16x16_dct8_neon, export=1
-    mov         x7,  x30
-    bl          X(x264_sub8x8_dct8_neon)
-    sub         x1,  x1,  #FENC_STRIDE*8 - 8
-    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
-    bl          X(x264_sub8x8_dct8_neon)
-    sub         x1,  x1,  #8
-    sub         x2,  x2,  #8
-    bl          X(x264_sub8x8_dct8_neon)
-    mov         x30, x7
-    sub         x1,  x1,  #FENC_STRIDE*8 - 8
-    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
-    b           X(x264_sub8x8_dct8_neon)
-endfunc
-
-
-// First part of IDCT (minus final SUMSUB_BA)
-.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
-    SUMSUB_AB   \d4, \d5, \d0, \d2
-    sshr        \d7, \d1, #1
-    sshr        \d6, \d3, #1
-    sub         \d7, \d7, \d3
-    add         \d6, \d6, \d1
-.endm
-
-function x264_add4x4_idct_neon, export=1
-    mov         x2, #FDEC_STRIDE
-    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
-
-    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
-    ld1        {v28.s}[0], [x0], x2
-    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
-    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
-
-    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
-
-    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
-    ld1        {v29.s}[0], [x0], x2
-    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
-    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
-
-    srshr       v0.4h,  v0.4h,  #6
-    srshr       v1.4h,  v1.4h,  #6
-    ld1        {v31.s}[0], [x0], x2
-    srshr       v2.4h,  v2.4h,  #6
-    srshr       v3.4h,  v3.4h,  #6
-    ld1        {v30.s}[0], [x0], x2
-
-    sub         x0,  x0,  x2,  lsl #2
-    uaddw       v0.8h,  v0.8h,  v28.8b
-    uaddw       v1.8h,  v1.8h,  v29.8b
-    uaddw       v2.8h,  v2.8h,  v30.8b
-    uaddw       v3.8h,  v3.8h,  v31.8b
-    sqxtun      v0.8b,  v0.8h
-    sqxtun      v1.8b,  v1.8h
-    sqxtun      v2.8b,  v2.8h
-    sqxtun      v3.8b,  v3.8h
-
-    st1        {v0.s}[0], [x0], x2
-    st1        {v1.s}[0], [x0], x2
-    st1        {v3.s}[0], [x0], x2
-    st1        {v2.s}[0], [x0], x2
-    ret
-endfunc
-
-function x264_add8x4_idct_neon, export=1
-    ld1        {v0.8h,v1.8h}, [x1], #32
-    ld1        {v2.8h,v3.8h}, [x1], #32
-    transpose   v20.2d, v21.2d, v0.2d, v2.2d
-    transpose   v22.2d, v23.2d, v1.2d, v3.2d
-    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
-    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
-
-    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
-
-    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
-    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
-
-    srshr       v0.8h,  v0.8h,  #6
-    ld1        {v28.8b}, [x0], x2
-    srshr       v1.8h,  v1.8h,  #6
-    ld1        {v29.8b}, [x0], x2
-    srshr       v2.8h,  v2.8h,  #6
-    ld1        {v30.8b}, [x0], x2
-    srshr       v3.8h,  v3.8h,  #6
-    ld1        {v31.8b}, [x0], x2
-
-    sub         x0,  x0,  x2,  lsl #2
-    uaddw       v0.8h,  v0.8h,  v28.8b
-    uaddw       v1.8h,  v1.8h,  v29.8b
-    uaddw       v2.8h,  v2.8h,  v30.8b
-    uaddw       v3.8h,  v3.8h,  v31.8b
-
-    sqxtun      v0.8b,  v0.8h
-    sqxtun      v1.8b,  v1.8h
-    st1        {v0.8b}, [x0], x2
-    sqxtun      v2.8b,  v2.8h
-    st1        {v1.8b}, [x0], x2
-    sqxtun      v3.8b,  v3.8h
-    st1        {v2.8b}, [x0], x2
-    st1        {v3.8b}, [x0], x2
-    ret
-endfunc
-
-function x264_add8x8_idct_neon, export=1
-    mov             x2, #FDEC_STRIDE
-    mov             x5,  x30
-    bl              X(x264_add8x4_idct_neon)
-    mov             x30, x5
-    b               X(x264_add8x4_idct_neon)
-endfunc
-
-function x264_add16x16_idct_neon, export=1
-    mov             x2, #FDEC_STRIDE
-    mov             x5,  x30
-    bl              X(x264_add8x4_idct_neon)
-    bl              X(x264_add8x4_idct_neon)
-    sub             x0, x0, #8*FDEC_STRIDE-8
-    bl              X(x264_add8x4_idct_neon)
-    bl              X(x264_add8x4_idct_neon)
-    sub             x0, x0, #8
-    bl              X(x264_add8x4_idct_neon)
-    bl              X(x264_add8x4_idct_neon)
-    sub             x0, x0, #8*FDEC_STRIDE-8
-    bl              X(x264_add8x4_idct_neon)
-    mov             x30, x5
-    b               X(x264_add8x4_idct_neon)
-endfunc
-
-.macro IDCT8_1D type
-    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v20.8h          // a0/a2
-.ifc \type, row
-    ld1        {v22.8h,v23.8h}, [x1], #32
-.endif
-    SUMSUB_SHR  1, v2.8h,  v3.8h,  v18.8h, v22.8h, v16.8h, v20.8h   // a6/a4
-    SUMSUB_AB   v16.8h, v18.8h, v21.8h, v19.8h
-    SUMSUB_15   v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h      // a7/a1
-    SUMSUB_AB   v22.8h, v23.8h, v23.8h, v17.8h
-    SUMSUB_15   v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h      // a5/a3
-
-    SUMSUB_SHR  2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h   // b3/b5
-    SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h   // b1/b7
-
-    SUMSUB_AB   v18.8h, v2.8h,  v0.8h,  v2.8h           // b0/b6
-    SUMSUB_AB   v19.8h, v3.8h,  v1.8h,  v3.8h           // b2/b4
-
-    SUMSUB_AB   v16.8h, v23.8h, v18.8h, v23.8h
-    SUMSUB_AB   v17.8h, v22.8h, v19.8h, v22.8h
-    SUMSUB_AB   v18.8h, v21.8h, v3.8h,  v21.8h
-    SUMSUB_AB   v19.8h, v20.8h, v2.8h,  v20.8h
-.endm
-
-function x264_add8x8_idct8_neon, export=1
-    mov         x2,  #FDEC_STRIDE
-    ld1        {v16.8h,v17.8h}, [x1], #32
-    ld1        {v18.8h,v19.8h}, [x1], #32
-    ld1        {v20.8h,v21.8h}, [x1], #32
-
-    IDCT8_1D    row
-
-    transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
-
-    IDCT8_1D    col
-
-    ld1        {v0.8b}, [x0], x2
-    srshr       v16.8h, v16.8h, #6
-    ld1        {v1.8b}, [x0], x2
-    srshr       v17.8h, v17.8h, #6
-    ld1        {v2.8b}, [x0], x2
-    srshr       v18.8h, v18.8h, #6
-    ld1        {v3.8b}, [x0], x2
-    srshr       v19.8h, v19.8h, #6
-    ld1        {v4.8b}, [x0], x2
-    srshr       v20.8h, v20.8h, #6
-    ld1        {v5.8b}, [x0], x2
-    srshr       v21.8h, v21.8h, #6
-    ld1        {v6.8b}, [x0], x2
-    srshr       v22.8h, v22.8h, #6
-    ld1        {v7.8b}, [x0], x2
-    srshr       v23.8h, v23.8h, #6
-    sub         x0,  x0,  x2,  lsl #3
-
-    uaddw       v16.8h, v16.8h, v0.8b
-    uaddw       v17.8h, v17.8h, v1.8b
-    uaddw       v18.8h, v18.8h, v2.8b
-    sqxtun      v0.8b,  v16.8h
-    sqxtun      v1.8b,  v17.8h
-    sqxtun      v2.8b,  v18.8h
-    uaddw       v19.8h, v19.8h, v3.8b
-    st1        {v0.8b}, [x0], x2
-    uaddw       v20.8h, v20.8h, v4.8b
-    st1        {v1.8b}, [x0], x2
-    uaddw       v21.8h, v21.8h, v5.8b
-    st1        {v2.8b}, [x0], x2
-    sqxtun      v3.8b,  v19.8h
-    sqxtun      v4.8b,  v20.8h
-    uaddw       v22.8h, v22.8h, v6.8b
-    uaddw       v23.8h, v23.8h, v7.8b
-    st1        {v3.8b}, [x0], x2
-    sqxtun      v5.8b,  v21.8h
-    st1        {v4.8b}, [x0], x2
-    sqxtun      v6.8b,  v22.8h
-    sqxtun      v7.8b,  v23.8h
-    st1        {v5.8b}, [x0], x2
-    st1        {v6.8b}, [x0], x2
-    st1        {v7.8b}, [x0], x2
-    ret
-endfunc
-
-function x264_add16x16_idct8_neon, export=1
-    mov             x7,  x30
-    bl              X(x264_add8x8_idct8_neon)
-    sub             x0,  x0,  #8*FDEC_STRIDE-8
-    bl              X(x264_add8x8_idct8_neon)
-    sub             x0,  x0,  #8
-    bl              X(x264_add8x8_idct8_neon)
-    sub             x0,  x0,  #8*FDEC_STRIDE-8
-    mov             x30, x7
-    b               X(x264_add8x8_idct8_neon)
-endfunc
-
-function x264_add8x8_idct_dc_neon, export=1
-    mov         x2,  #FDEC_STRIDE
-    ld1        {v16.4h}, [x1]
-    ld1        {v0.8b}, [x0], x2
-    srshr       v16.4h, v16.4h, #6
-    ld1        {v1.8b}, [x0], x2
-    dup         v20.8h, v16.h[0]
-    dup         v21.8h, v16.h[1]
-    ld1        {v2.8b}, [x0], x2
-    dup         v22.8h, v16.h[2]
-    dup         v23.8h, v16.h[3]
-    ld1        {v3.8b}, [x0], x2
-    trn1        v20.2d, v20.2d,  v21.2d
-    ld1        {v4.8b}, [x0], x2
-    trn1        v21.2d, v22.2d,  v23.2d
-    ld1        {v5.8b}, [x0], x2
-    neg         v22.8h, v20.8h
-    ld1        {v6.8b}, [x0], x2
-    neg         v23.8h, v21.8h
-    ld1        {v7.8b}, [x0], x2
-
-    sub         x0,  x0,  #8*FDEC_STRIDE
-
-    sqxtun      v20.8b,  v20.8h
-    sqxtun      v21.8b,  v21.8h
-    sqxtun      v22.8b,  v22.8h
-    sqxtun      v23.8b,  v23.8h
-
-    uqadd       v0.8b,  v0.8b,  v20.8b
-    uqadd       v1.8b,  v1.8b,  v20.8b
-    uqadd       v2.8b,  v2.8b,  v20.8b
-    uqadd       v3.8b,  v3.8b,  v20.8b
-    uqadd       v4.8b,  v4.8b,  v21.8b
-    uqadd       v5.8b,  v5.8b,  v21.8b
-    uqadd       v6.8b,  v6.8b,  v21.8b
-    uqadd       v7.8b,  v7.8b,  v21.8b
-    uqsub       v0.8b,  v0.8b,  v22.8b
-    uqsub       v1.8b,  v1.8b,  v22.8b
-    uqsub       v2.8b,  v2.8b,  v22.8b
-    uqsub       v3.8b,  v3.8b,  v22.8b
-    uqsub       v4.8b,  v4.8b,  v23.8b
-    uqsub       v5.8b,  v5.8b,  v23.8b
-    uqsub       v6.8b,  v6.8b,  v23.8b
-    uqsub       v7.8b,  v7.8b,  v23.8b
-
-    st1        {v0.8b}, [x0], x2
-    st1        {v1.8b}, [x0], x2
-    st1        {v2.8b}, [x0], x2
-    st1        {v3.8b}, [x0], x2
-    st1        {v4.8b}, [x0], x2
-    st1        {v5.8b}, [x0], x2
-    st1        {v6.8b}, [x0], x2
-    st1        {v7.8b}, [x0], x2
-    ret
-endfunc
-
-.macro ADD16x4_IDCT_DC dc
-    ld1         {v4.16b}, [x0], x3
-    dup         v24.8h,  \dc[0]
-    dup         v25.8h,  \dc[1]
-    ld1         {v5.16b}, [x0], x3
-    dup         v26.8h,  \dc[2]
-    dup         v27.8h,  \dc[3]
-    ld1         {v6.16b}, [x0], x3
-    trn1        v24.2d,  v24.2d,  v25.2d
-    ld1         {v7.16b}, [x0], x3
-    trn1        v25.2d,  v26.2d,  v27.2d
-    neg         v26.8h,  v24.8h
-    neg         v27.8h,  v25.8h
-
-    sqxtun      v20.8b,  v24.8h
-    sqxtun      v21.8b,  v26.8h
-    sqxtun2     v20.16b, v25.8h
-    sqxtun2     v21.16b, v27.8h
-
-    uqadd        v4.16b, v4.16b, v20.16b
-    uqadd        v5.16b, v5.16b, v20.16b
-    uqadd        v6.16b, v6.16b, v20.16b
-    uqadd        v7.16b, v7.16b, v20.16b
-
-    uqsub        v4.16b, v4.16b, v21.16b
-    uqsub        v5.16b, v5.16b, v21.16b
-    uqsub        v6.16b, v6.16b, v21.16b
-    st1         {v4.16b}, [x2], x3
-    uqsub        v7.16b, v7.16b, v21.16b
-    st1         {v5.16b}, [x2], x3
-    st1         {v6.16b}, [x2], x3
-    st1         {v7.16b}, [x2], x3
-.endm
-
-function x264_add16x16_idct_dc_neon, export=1
-    mov         x2,  x0
-    mov         x3,  #FDEC_STRIDE
-
-    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
-    srshr       v0.4h,  v0.4h,  #6
-    srshr       v1.4h,  v1.4h,  #6
-
-    ADD16x4_IDCT_DC v0.h
-    srshr       v2.4h,  v2.4h,  #6
-    ADD16x4_IDCT_DC v1.h
-    srshr       v3.4h,  v3.4h,  #6
-    ADD16x4_IDCT_DC v2.h
-    ADD16x4_IDCT_DC v3.h
-    ret
-endfunc
-
-.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
-    ld1        {\t0\().8b}, [x1], x3
-    ld1        {\t1\().8b}, [x2], x4
-    ld1        {\t2\().8b}, [x1], x3
-    ld1        {\t3\().8b}, [x2], x4
-    usubl       \t0\().8h,  \t0\().8b,  \t1\().8b
-    ld1        {\t4\().8b}, [x1], x3
-    ld1        {\t5\().8b}, [x2], x4
-    usubl       \t1\().8h,  \t2\().8b,  \t3\().8b
-    ld1        {\t6\().8b}, [x1], x3
-    ld1        {\t7\().8b}, [x2], x4
-    add         \dst\().8h, \t0\().8h,  \t1\().8h
-    usubl       \t2\().8h,  \t4\().8b,  \t5\().8b
-    usubl       \t3\().8h,  \t6\().8b,  \t7\().8b
-    add         \dst\().8h, \dst\().8h, \t2\().8h
-    add         \dst\().8h, \dst\().8h, \t3\().8h
-.endm
-
-function x264_sub8x8_dct_dc_neon, export=1
-    mov             x3,  #FENC_STRIDE
-    mov             x4,  #FDEC_STRIDE
-
-    sub4x4x2_dct_dc  v0, v16, v17, v18, v19, v20, v21, v22, v23
-    sub4x4x2_dct_dc  v1, v24, v25, v26, v27, v28, v29, v30, v31
-
-    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
-    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
-    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
-    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
-    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
-
-    addp        v0.8h,  v2.8h,  v3.8h
-    addp        v0.8h,  v0.8h,  v0.8h
-
-    st1        {v0.4h}, [x0]
-    ret
-endfunc
-
-function x264_sub8x16_dct_dc_neon, export=1
-    mov             x3,  #FENC_STRIDE
-    mov             x4,  #FDEC_STRIDE
-    sub4x4x2_dct_dc  v0, v16, v17, v18, v19, v20, v21, v22, v23
-    sub4x4x2_dct_dc  v1, v24, v25, v26, v27, v28, v29, v30, v31
-    sub4x4x2_dct_dc  v2, v16, v17, v18, v19, v20, v21, v22, v23
-    sub4x4x2_dct_dc  v3, v24, v25, v26, v27, v28, v29, v30, v31
-
-    addp             v4.8h,  v0.8h,  v2.8h
-    addp             v5.8h,  v1.8h,  v3.8h
-
-    transpose   v2.4s,  v3.4s,  v4.4s,  v5.4s
-    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
-
-    transpose   v2.4s,  v3.4s,  v0.4s,  v1.4s
-    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
-
-    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
-    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
-
-    trn1        v2.2d,  v0.2d,  v1.2d
-    trn2        v3.2d,  v1.2d,  v0.2d
-
-    addp        v0.8h,  v2.8h,  v3.8h
-
-    st1        {v0.8h}, [x0]
-    ret
-endfunc
-
-function x264_zigzag_interleave_8x8_cavlc_neon, export=1
-    mov        x3,  #7
-    movi       v31.4s, #1
-    ld4        {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64
-    ld4        {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64
-    umax       v16.8h, v0.8h,  v4.8h
-    umax       v17.8h, v1.8h,  v5.8h
-    umax       v18.8h, v2.8h,  v6.8h
-    umax       v19.8h, v3.8h,  v7.8h
-    st1        {v0.8h}, [x0],  #16
-    st1        {v4.8h}, [x0],  #16
-    umaxp      v16.8h, v16.8h, v17.8h
-    umaxp      v18.8h, v18.8h, v19.8h
-    st1        {v1.8h}, [x0],  #16
-    st1        {v5.8h}, [x0],  #16
-    umaxp      v16.8h, v16.8h, v18.8h
-    st1        {v2.8h}, [x0],  #16
-    st1        {v6.8h}, [x0],  #16
-    cmhi       v16.4s, v16.4s, v31.4s
-    st1        {v3.8h}, [x0],  #16
-    and        v16.16b, v16.16b, v31.16b
-    st1        {v7.8h}, [x0],  #16
-    st1        {v16.b}[0],    [x2],  #1
-    st1        {v16.b}[4],    [x2],  x3
-    st1        {v16.b}[8],    [x2],  #1
-    st1        {v16.b}[12],   [x2]
-    ret
-endfunc
-
-function x264_zigzag_scan_4x4_frame_neon, export=1
-    movrel      x2, scan4x4_frame
-    ld1        {v0.16b,v1.16b}, [x1]
-    ld1        {v16.16b,v17.16b}, [x2]
-    tbl         v2.16b, {v0.16b,v1.16b}, v16.16b
-    tbl         v3.16b, {v0.16b,v1.16b}, v17.16b
-    st1        {v2.16b,v3.16b},   [x0]
-    ret
-endfunc
-
-.macro zigzag_sub_4x4 f ac
-function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
-    mov         x9,  #FENC_STRIDE
-    mov         x4,  #FDEC_STRIDE
-    movrel      x5,  sub4x4_\f
-    mov         x6,  x2
-    ld1        {v0.s}[0], [x1], x9
-    ld1        {v0.s}[1], [x1], x9
-    ld1        {v0.s}[2], [x1], x9
-    ld1        {v0.s}[3], [x1], x9
-    ld1        {v16.16b}, [x5]
-    ld1        {v1.s}[0], [x2], x4
-    ld1        {v1.s}[1], [x2], x4
-    ld1        {v1.s}[2], [x2], x4
-    ld1        {v1.s}[3], [x2], x4
-    tbl         v2.16b, {v0.16b}, v16.16b
-    tbl         v3.16b, {v1.16b}, v16.16b
-    st1        {v0.s}[0], [x6], x4
-    usubl       v4.8h,  v2.8b,  v3.8b
-.ifc \ac, ac
-    dup         h7, v4.h[0]
-    ins         v4.h[0], wzr
-    fmov        w5,  s7
-    strh        w5,  [x3]
-.endif
-    usubl2      v5.8h,  v2.16b, v3.16b
-    st1        {v0.s}[1], [x6], x4
-    umax        v6.8h,  v4.8h,  v5.8h
-    umaxv       h6,  v6.8h
-    st1        {v0.s}[2], [x6], x4
-    fmov        w7,  s6
-    st1        {v0.s}[3], [x6], x4
-    cmp         w7, #0
-    st1        {v4.8h,v5.8h},   [x0]
-    cset        w0, ne
-    ret
-endfunc
-.endm
-
-zigzag_sub_4x4 field
-zigzag_sub_4x4 field, ac
-zigzag_sub_4x4 frame
-zigzag_sub_4x4 frame, ac
-
-function x264_zigzag_scan_4x4_field_neon, export=1
-    movrel      x2, scan4x4_field
-    ld1        {v0.8h,v1.8h},   [x1]
-    ld1        {v16.16b},       [x2]
-    tbl         v0.16b, {v0.16b}, v16.16b
-    st1        {v0.8h,v1.8h},   [x0]
-    ret
-endfunc
-
-function x264_zigzag_scan_8x8_frame_neon, export=1
-    movrel      x2,  scan8x8_frame
-    ld1        {v0.8h,v1.8h},   [x1], #32
-    ld1        {v2.8h,v3.8h},   [x1], #32
-    ld1        {v4.8h,v5.8h},   [x1], #32
-    ld1        {v6.8h,v7.8h},   [x1]
-    ld1        {v16.16b,v17.16b}, [x2], #32
-    ld1        {v18.16b,v19.16b}, [x2], #32
-    ld1        {v20.16b,v21.16b}, [x2], #32
-    ld1        {v22.16b,v23.16b}, [x2], #32
-    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
-    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
-    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
-    tbl         v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
-    tbl         v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
-    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
-    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
-    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
-    mov         v25.h[6], v4.h[0]
-    mov         v25.h[7], v5.h[0]
-    mov         v26.h[0], v4.h[1]
-    mov         v27.h[4], v7.h[0]
-    mov         v28.h[7], v4.h[4]
-    mov         v29.h[7], v3.h[6]
-    mov         v30.h[0], v2.h[7]
-    mov         v30.h[1], v3.h[7]
-    st1        {v24.8h,v25.8h}, [x0], #32
-    st1        {v26.8h,v27.8h}, [x0], #32
-    st1        {v28.8h,v29.8h}, [x0], #32
-    st1        {v30.8h,v31.8h}, [x0]
-    ret
-endfunc
-
-#define Z(z)   2*(z), 2*(z)+1
-#define T(x,y) Z(x*8+y)
-const scan8x8_frame, align=5
-    .byte T(0,0), T(1,0), T(0,1), T(0,2)
-    .byte T(1,1), T(2,0), T(3,0), T(2,1)
-    .byte T(1,2), T(0,3), T(0,4), T(1,3)
-    .byte T(2,2), T(3,1), T(4,0), T(5,0)
-    .byte T(4,1), T(3,2), T(2,3), T(1,4)
-    .byte T(0,5), T(0,6), T(1,5), T(2,4)
-#undef T
-#define T(x,y) Z((x-3)*8+y)
-    .byte T(3,3), T(4,2), T(5,1), T(6,0)
-    .byte T(7,0), T(6,1), T(5,2), T(4,3)
-#undef T
-#define T(x,y) Z((x-0)*8+y)
-    .byte T(3,4), T(2,5), T(1,6), T(0,7)
-    .byte T(1,7), T(2,6), T(3,5), T(4,4)
-#undef T
-#define T(x,y) Z((x-4)*8+y)
-    .byte T(5,3), T(6,2), T(7,1), T(7,2)
-    .byte T(6,3), T(5,4), T(4,5), T(3,6)
-    .byte T(2,7), T(3,7), T(4,6), T(5,5)
-    .byte T(6,4), T(7,3), T(7,4), T(6,5)
-    .byte T(5,6), T(4,7), T(5,7), T(6,6)
-    .byte T(7,5), T(7,6), T(6,7), T(7,7)
-endconst
-
-function x264_zigzag_scan_8x8_field_neon, export=1
-    movrel      x2,  scan8x8_field
-    ld1        {v0.8h,v1.8h},   [x1], #32
-    ld1        {v2.8h,v3.8h},   [x1], #32
-    ld1        {v4.8h,v5.8h},   [x1], #32
-    ld1        {v6.8h,v7.8h},   [x1]
-    ld1        {v16.16b,v17.16b}, [x2], #32
-    ld1        {v18.16b,v19.16b}, [x2], #32
-    ld1        {v20.16b,v21.16b}, [x2], #32
-    ld1        {v22.16b}, [x2]
-    ext         v31.16b, v7.16b, v7.16b, #4
-    tbl         v24.16b, {v0.16b,v1.16b},               v16.16b
-    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
-    tbl         v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
-    tbl         v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
-    tbl         v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
-    tbl         v29.16b, {v4.16b,v5.16b,v6.16b},        v21.16b
-    tbl         v30.16b, {v5.16b,v6.16b,v7.16b},        v22.16b
-    ext         v31.16b, v6.16b, v31.16b, #12
-    st1        {v24.8h,v25.8h}, [x0], #32
-    st1        {v26.8h,v27.8h}, [x0], #32
-    st1        {v28.8h,v29.8h}, [x0], #32
-    st1        {v30.8h,v31.8h}, [x0]
-    ret
-endfunc
-
-.macro zigzag_sub8x8 f
-function x264_zigzag_sub_8x8_\f\()_neon, export=1
-    movrel      x4,  sub8x8_\f
-    mov         x5,  #FENC_STRIDE
-    mov         x6,  #FDEC_STRIDE
-    mov         x7,  x2
-    ld1        {v0.d}[0], [x1], x5
-    ld1        {v0.d}[1], [x1], x5
-    ld1        {v1.d}[0], [x1], x5
-    ld1        {v1.d}[1], [x1], x5
-    ld1        {v2.d}[0], [x1], x5
-    ld1        {v2.d}[1], [x1], x5
-    ld1        {v3.d}[0], [x1], x5
-    ld1        {v3.d}[1], [x1]
-    ld1        {v4.d}[0], [x2], x6
-    ld1        {v4.d}[1], [x2], x6
-    ld1        {v5.d}[0], [x2], x6
-    ld1        {v5.d}[1], [x2], x6
-    ld1        {v6.d}[0], [x2], x6
-    ld1        {v6.d}[1], [x2], x6
-    ld1        {v7.d}[0], [x2], x6
-    ld1        {v7.d}[1], [x2]
-    ld1        {v16.16b,v17.16b}, [x4], #32
-    ld1        {v18.16b,v19.16b}, [x4], #32
-    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
-    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
-    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
-    tbl         v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
-    tbl         v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
-    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
-    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
-    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
-    usubl       v4.8h,  v24.8b,  v28.8b
-    usubl2      v5.8h,  v24.16b, v28.16b
-    usubl       v6.8h,  v25.8b,  v29.8b
-    usubl2      v7.8h,  v25.16b, v29.16b
-    usubl       v16.8h, v26.8b,  v30.8b
-    usubl2      v17.8h, v26.16b, v30.16b
-    usubl       v18.8h, v27.8b,  v31.8b
-    usubl2      v19.8h, v27.16b, v31.16b
-    umax        v20.8h, v4.8h,   v5.8h
-    umax        v21.8h, v6.8h,   v7.8h
-    umax        v22.8h, v16.8h,  v17.8h
-    umax        v23.8h, v18.8h,  v19.8h
-    umax        v20.8h, v20.8h,  v21.8h
-    umax        v21.8h, v22.8h,  v23.8h
-    umax        v20.8h, v20.8h,  v21.8h
-    umaxv       h22,    v20.8h
-    st1        {v0.d}[0], [x7], x6
-    st1        {v0.d}[1], [x7], x6
-    st1        {v1.d}[0], [x7], x6
-    st1        {v1.d}[1], [x7], x6
-    st1        {v2.d}[0], [x7], x6
-    st1        {v2.d}[1], [x7], x6
-    st1        {v3.d}[0], [x7], x6
-    st1        {v3.d}[1], [x7]
-    st1        {v4.8h,v5.8h},   [x0], #32
-    st1        {v6.8h,v7.8h},   [x0], #32
-    st1        {v16.8h,v17.8h}, [x0], #32
-    st1        {v18.8h,v19.8h}, [x0]
-    fmov        w9,  s22
-    cmp         w9, #0
-    cset        w0, ne
-    ret
-endfunc
-.endm
-
-zigzag_sub8x8 field
-zigzag_sub8x8 frame
-
-#undef T
-#define T(x,y) Z(x*8+y)
-const scan8x8_field, align=5
-    .byte T(0,0), T(0,1), T(0,2), T(1,0)
-    .byte T(1,1), T(0,3), T(0,4), T(1,2)
-    .byte T(2,0), T(1,3), T(0,5), T(0,6)
-    .byte T(0,7), T(1,4), T(2,1), T(3,0)
-#undef T
-#define T(x,y) Z((x-1)*8+y)
-    .byte T(2,2), T(1,5), T(1,6), T(1,7)
-    .byte T(2,3), T(3,1), T(4,0), T(3,2)
-#undef T
-#define T(x,y) Z((x-2)*8+y)
-    .byte T(2,4), T(2,5), T(2,6), T(2,7)
-    .byte T(3,3), T(4,1), T(5,0), T(4,2)
-#undef T
-#define T(x,y) Z((x-3)*8+y)
-    .byte T(3,4), T(3,5), T(3,6), T(3,7)
-    .byte T(4,3), T(5,1), T(6,0), T(5,2)
-#undef T
-#define T(x,y) Z((x-4)*8+y)
-    .byte T(4,4), T(4,5), T(4,6), T(4,7)
-    .byte T(5,3), T(6,1), T(6,2), T(5,4)
-#undef T
-#define T(x,y) Z((x-5)*8+y)
-    .byte T(5,5), T(5,6), T(5,7), T(6,3)
-    .byte T(7,0), T(7,1), T(6,4), T(6,5)
-endconst
-
-
-#undef T
-#define T(y,x) x*8+y
-const sub8x8_frame, align=5
-    .byte T(0,0), T(1,0), T(0,1), T(0,2)
-    .byte T(1,1), T(2,0), T(3,0), T(2,1)
-    .byte T(1,2), T(0,3), T(0,4), T(1,3)
-    .byte T(2,2), T(3,1), T(4,0), T(5,0)
-    .byte T(4,1), T(3,2), T(2,3), T(1,4)
-    .byte T(0,5), T(0,6), T(1,5), T(2,4)
-    .byte T(3,3), T(4,2), T(5,1), T(6,0)
-    .byte T(7,0), T(6,1), T(5,2), T(4,3)
-    .byte T(3,4), T(2,5), T(1,6), T(0,7)
-    .byte T(1,7), T(2,6), T(3,5), T(4,4)
-    .byte T(5,3), T(6,2), T(7,1), T(7,2)
-    .byte T(6,3), T(5,4), T(4,5), T(3,6)
-    .byte T(2,7), T(3,7), T(4,6), T(5,5)
-    .byte T(6,4), T(7,3), T(7,4), T(6,5)
-    .byte T(5,6), T(4,7), T(5,7), T(6,6)
-    .byte T(7,5), T(7,6), T(6,7), T(7,7)
-endconst
-
-const sub8x8_field, align=5
-    .byte T(0,0), T(0,1), T(0,2), T(1,0)
-    .byte T(1,1), T(0,3), T(0,4), T(1,2)
-    .byte T(2,0), T(1,3), T(0,5), T(0,6)
-    .byte T(0,7), T(1,4), T(2,1), T(3,0)
-    .byte T(2,2), T(1,5), T(1,6), T(1,7)
-    .byte T(2,3), T(3,1), T(4,0), T(3,2)
-    .byte T(2,4), T(2,5), T(2,6), T(2,7)
-    .byte T(3,3), T(4,1), T(5,0), T(4,2)
-    .byte T(3,4), T(3,5), T(3,6), T(3,7)
-    .byte T(4,3), T(5,1), T(6,0), T(5,2)
-    .byte T(4,4), T(4,5), T(4,6), T(4,7)
-    .byte T(5,3), T(6,1), T(6,2), T(5,4)
-    .byte T(5,5), T(5,6), T(5,7), T(6,3)
-    .byte T(7,0), T(7,1), T(6,4), T(6,5)
-    .byte T(6,6), T(6,7), T(7,2), T(7,3)
-    .byte T(7,4), T(7,5), T(7,6), T(7,7)
-endconst
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/dct.h b/android/src/main/libenc/jni/libx264/common/aarch64/dct.h
deleted file mode 100755
index 6c812ac..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/dct.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*****************************************************************************
- * dct.h: aarch64 transform and zigzag
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_AARCH64_DCT_H
-#define X264_AARCH64_DCT_H
-
-void x264_dct4x4dc_neon( int16_t d[16] );
-void x264_idct4x4dc_neon( int16_t d[16] );
-
-void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
-void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
-
-void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
-void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
-void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
-
-void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
-
-int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
-int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
-int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
-int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
-
-int x264_zigzag_sub_8x8_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
-int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
-
-void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/deblock-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/deblock-a.S
deleted file mode 100755
index 1310f3b..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/deblock-a.S
+++ /dev/null
@@ -1,813 +0,0 @@
-/*****************************************************************************
- * deblock.S: aarch64 deblocking
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Mans Rullgard <mans@mansr.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.macro h264_loop_filter_start
-    cmp             w2,  #0
-    ldr             w6,  [x4]
-    ccmp            w3,  #0, #0, ne
-    mov             v24.s[0], w6
-    and             w8,  w6,  w6,  lsl #16
-    b.eq            1f
-    ands            w8,  w8,  w8,  lsl #8
-    b.ge            2f
-1:
-    ret
-2:
-.endm
-
-.macro h264_loop_filter_luma
-    dup             v22.16b, w2                     // alpha
-    uxtl            v24.8h,  v24.8b
-    uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
-    uxtl            v24.4s,  v24.4h
-    uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
-    sli             v24.8h,  v24.8h,  #8
-    uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
-    sli             v24.4s,  v24.4s,  #16
-    cmhi            v21.16b, v22.16b, v21.16b       // < alpha
-    dup             v22.16b, w3                     // beta
-    cmlt            v23.16b, v24.16b, #0
-    cmhi            v28.16b, v22.16b, v28.16b       // < beta
-    cmhi            v30.16b, v22.16b, v30.16b       // < beta
-    bic             v21.16b, v21.16b, v23.16b
-    uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
-    and             v21.16b, v21.16b, v28.16b
-    uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
-    cmhi            v17.16b, v22.16b, v17.16b       // < beta
-    and             v21.16b, v21.16b, v30.16b
-    cmhi            v19.16b, v22.16b, v19.16b       // < beta
-    and             v17.16b, v17.16b, v21.16b
-    and             v19.16b, v19.16b, v21.16b
-    and             v24.16b, v24.16b, v21.16b
-    urhadd          v28.16b, v16.16b,  v0.16b
-    sub             v21.16b, v24.16b, v17.16b
-    uqadd           v23.16b, v18.16b, v24.16b
-    uhadd           v20.16b, v20.16b, v28.16b
-    sub             v21.16b, v21.16b, v19.16b
-    uhadd           v28.16b,  v4.16b, v28.16b
-    umin            v23.16b, v23.16b, v20.16b
-    uqsub           v22.16b, v18.16b, v24.16b
-    uqadd           v4.16b,   v2.16b, v24.16b
-    umax            v23.16b, v23.16b, v22.16b
-    uqsub           v22.16b,  v2.16b, v24.16b
-    umin            v28.16b,  v4.16b, v28.16b
-    uxtl            v4.8h,    v0.8b
-    umax            v28.16b, v28.16b, v22.16b
-    uxtl2           v20.8h,   v0.16b
-    usubw           v4.8h,    v4.8h,  v16.8b
-    usubw2          v20.8h,  v20.8h,  v16.16b
-    shl             v4.8h,    v4.8h,  #2
-    shl             v20.8h,  v20.8h,  #2
-    uaddw           v4.8h,    v4.8h,  v18.8b
-    uaddw2          v20.8h,  v20.8h,  v18.16b
-    usubw           v4.8h,    v4.8h,   v2.8b
-    usubw2          v20.8h,  v20.8h,   v2.16b
-    rshrn           v4.8b,    v4.8h,  #3
-    rshrn2          v4.16b,  v20.8h,  #3
-    bsl             v17.16b, v23.16b, v18.16b
-    bsl             v19.16b, v28.16b,  v2.16b
-    neg             v23.16b, v21.16b
-    uxtl            v28.8h,  v16.8b
-    smin            v4.16b,   v4.16b, v21.16b
-    uxtl2           v21.8h,  v16.16b
-    smax            v4.16b,   v4.16b, v23.16b
-    uxtl            v22.8h,   v0.8b
-    uxtl2           v24.8h,   v0.16b
-    saddw           v28.8h,  v28.8h,  v4.8b
-    saddw2          v21.8h,  v21.8h,  v4.16b
-    ssubw           v22.8h,  v22.8h,  v4.8b
-    ssubw2          v24.8h,  v24.8h,  v4.16b
-    sqxtun          v16.8b,  v28.8h
-    sqxtun2         v16.16b, v21.8h
-    sqxtun          v0.8b,   v22.8h
-    sqxtun2         v0.16b,  v24.8h
-.endm
-
-function x264_deblock_v_luma_neon, export=1
-    h264_loop_filter_start
-
-    ld1             {v0.16b},  [x0], x1
-    ld1             {v2.16b},  [x0], x1
-    ld1             {v4.16b},  [x0], x1
-    sub             x0,  x0,  x1, lsl #2
-    sub             x0,  x0,  x1, lsl #1
-    ld1             {v20.16b},  [x0], x1
-    ld1             {v18.16b},  [x0], x1
-    ld1             {v16.16b},  [x0], x1
-
-    h264_loop_filter_luma
-
-    sub             x0,  x0,  x1, lsl #1
-    st1             {v17.16b}, [x0], x1
-    st1             {v16.16b}, [x0], x1
-    st1             {v0.16b},  [x0], x1
-    st1             {v19.16b}, [x0]
-
-    ret
-endfunc
-
-function x264_deblock_h_luma_neon, export=1
-    h264_loop_filter_start
-
-    sub             x0,  x0,  #4
-    ld1             {v6.8b},  [x0], x1
-    ld1             {v20.8b}, [x0], x1
-    ld1             {v18.8b}, [x0], x1
-    ld1             {v16.8b}, [x0], x1
-    ld1             {v0.8b},  [x0], x1
-    ld1             {v2.8b},  [x0], x1
-    ld1             {v4.8b},  [x0], x1
-    ld1             {v26.8b}, [x0], x1
-    ld1             {v6.d}[1],  [x0], x1
-    ld1             {v20.d}[1], [x0], x1
-    ld1             {v18.d}[1], [x0], x1
-    ld1             {v16.d}[1], [x0], x1
-    ld1             {v0.d}[1],  [x0], x1
-    ld1             {v2.d}[1],  [x0], x1
-    ld1             {v4.d}[1],  [x0], x1
-    ld1             {v26.d}[1], [x0], x1
-
-    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
-
-    h264_loop_filter_luma
-
-    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
-
-    sub             x0,  x0,  x1, lsl #4
-    add             x0,  x0,  #2
-    st1             {v17.s}[0],  [x0], x1
-    st1             {v16.s}[0], [x0], x1
-    st1             {v0.s}[0],  [x0], x1
-    st1             {v19.s}[0], [x0], x1
-    st1             {v17.s}[1],  [x0], x1
-    st1             {v16.s}[1], [x0], x1
-    st1             {v0.s}[1],  [x0], x1
-    st1             {v19.s}[1], [x0], x1
-    st1             {v17.s}[2],  [x0], x1
-    st1             {v16.s}[2], [x0], x1
-    st1             {v0.s}[2],  [x0], x1
-    st1             {v19.s}[2], [x0], x1
-    st1             {v17.s}[3],  [x0], x1
-    st1             {v16.s}[3], [x0], x1
-    st1             {v0.s}[3],  [x0], x1
-    st1             {v19.s}[3], [x0], x1
-
-    ret
-endfunc
-
-.macro h264_loop_filter_start_intra
-    orr             w4,  w2,  w3
-    cmp             w4,  #0
-    b.ne            1f
-    ret
-1:
-    dup             v30.16b, w2                // alpha
-    dup             v31.16b, w3                // beta
-.endm
-
-.macro h264_loop_filter_luma_intra
-    uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
-    uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
-    uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
-    cmhi            v19.16b, v30.16b, v16.16b       // < alpha
-    cmhi            v17.16b, v31.16b, v17.16b       // < beta
-    cmhi            v18.16b, v31.16b, v18.16b       // < beta
-
-    movi            v29.16b, #2
-    ushr            v30.16b, v30.16b, #2            // alpha >> 2
-    add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
-    cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2
-
-    and             v19.16b, v19.16b, v17.16b
-    and             v19.16b, v19.16b, v18.16b
-    shrn            v20.8b,  v19.8h,  #4
-    mov             x4, v20.d[0]
-    cbz             x4, 9f
-
-    ushll           v20.8h,  v6.8b,   #1
-    ushll           v22.8h,  v1.8b,   #1
-    ushll2          v21.8h,  v6.16b,  #1
-    ushll2          v23.8h,  v1.16b,  #1
-    uaddw           v20.8h,  v20.8h,  v7.8b
-    uaddw           v22.8h,  v22.8h,  v0.8b
-    uaddw2          v21.8h,  v21.8h,  v7.16b
-    uaddw2          v23.8h,  v23.8h,  v0.16b
-    uaddw           v20.8h,  v20.8h,  v1.8b
-    uaddw           v22.8h,  v22.8h,  v6.8b
-    uaddw2          v21.8h,  v21.8h,  v1.16b
-    uaddw2          v23.8h,  v23.8h,  v6.16b
-
-    rshrn           v24.8b,  v20.8h,  #2 // p0'_1
-    rshrn           v25.8b,  v22.8h,  #2 // q0'_1
-    rshrn2          v24.16b, v21.8h,  #2 // p0'_1
-    rshrn2          v25.16b, v23.8h,  #2 // q0'_1
-
-    uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
-    uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
-    cmhi            v17.16b, v31.16b, v17.16b       // < beta
-    cmhi            v18.16b, v31.16b, v18.16b       // < beta
-
-    and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
-    and             v18.16b, v16.16b, v18.16b  // if_2 && if_4
-
-    not             v30.16b, v17.16b
-    not             v31.16b, v18.16b
-
-    and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
-    and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)
-
-    and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
-    and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4
-
-    //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
-    uaddl           v26.8h,  v5.8b,   v7.8b
-    uaddl2          v27.8h,  v5.16b,  v7.16b
-    uaddw           v26.8h,  v26.8h,  v0.8b
-    uaddw2          v27.8h,  v27.8h,  v0.16b
-    add             v20.8h,  v20.8h,  v26.8h
-    add             v21.8h,  v21.8h,  v27.8h
-    uaddw           v20.8h,  v20.8h,  v0.8b
-    uaddw2          v21.8h,  v21.8h,  v0.16b
-    rshrn           v20.8b,  v20.8h,  #3 // p0'_2
-    rshrn2          v20.16b, v21.8h,  #3 // p0'_2
-    uaddw           v26.8h,  v26.8h,  v6.8b
-    uaddw2          v27.8h,  v27.8h,  v6.16b
-    rshrn           v21.8b,  v26.8h,  #2 // p1'_2
-    rshrn2          v21.16b, v27.8h,  #2 // p1'_2
-    uaddl           v28.8h,  v4.8b,   v5.8b
-    uaddl2          v29.8h,  v4.16b,  v5.16b
-    shl             v28.8h,  v28.8h,  #1
-    shl             v29.8h,  v29.8h,  #1
-    add             v28.8h,  v28.8h,  v26.8h
-    add             v29.8h,  v29.8h,  v27.8h
-    rshrn           v19.8b,  v28.8h,  #3 // p2'_2
-    rshrn2          v19.16b, v29.8h,  #3 // p2'_2
-
-    //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
-    uaddl           v26.8h,  v2.8b,   v0.8b
-    uaddl2          v27.8h,  v2.16b,  v0.16b
-    uaddw           v26.8h,  v26.8h,  v7.8b
-    uaddw2          v27.8h,  v27.8h,  v7.16b
-    add             v22.8h,  v22.8h,  v26.8h
-    add             v23.8h,  v23.8h,  v27.8h
-    uaddw           v22.8h,  v22.8h,  v7.8b
-    uaddw2          v23.8h,  v23.8h,  v7.16b
-    rshrn           v22.8b,  v22.8h,  #3 // q0'_2
-    rshrn2          v22.16b, v23.8h,  #3 // q0'_2
-    uaddw           v26.8h,  v26.8h,  v1.8b
-    uaddw2          v27.8h,  v27.8h,  v1.16b
-    rshrn           v23.8b,  v26.8h,  #2 // q1'_2
-    rshrn2          v23.16b, v27.8h,  #2 // q1'_2
-    uaddl           v28.8h,  v2.8b,   v3.8b
-    uaddl2          v29.8h,  v2.16b,  v3.16b
-    shl             v28.8h,  v28.8h,  #1
-    shl             v29.8h,  v29.8h,  #1
-    add             v28.8h,  v28.8h,  v26.8h
-    add             v29.8h,  v29.8h,  v27.8h
-    rshrn           v26.8b,  v28.8h,  #3 // q2'_2
-    rshrn2          v26.16b, v29.8h,  #3 // q2'_2
-
-    bit             v7.16b,  v24.16b, v30.16b  // p0'_1
-    bit             v0.16b,  v25.16b, v31.16b  // q0'_1
-    bit             v7.16b, v20.16b,  v17.16b  // p0'_2
-    bit             v6.16b, v21.16b,  v17.16b  // p1'_2
-    bit             v5.16b, v19.16b,  v17.16b  // p2'_2
-    bit             v0.16b, v22.16b,  v18.16b  // q0'_2
-    bit             v1.16b, v23.16b,  v18.16b  // q1'_2
-    bit             v2.16b, v26.16b,  v18.16b  // q2'_2
-.endm
-
-function x264_deblock_v_luma_intra_neon, export=1
-    h264_loop_filter_start_intra
-
-    ld1             {v0.16b},  [x0], x1 // q0
-    ld1             {v1.16b},  [x0], x1 // q1
-    ld1             {v2.16b},  [x0], x1 // q2
-    ld1             {v3.16b},  [x0], x1 // q3
-    sub             x0,  x0,  x1, lsl #3
-    ld1             {v4.16b},  [x0], x1 // p3
-    ld1             {v5.16b},  [x0], x1 // p2
-    ld1             {v6.16b},  [x0], x1 // p1
-    ld1             {v7.16b},  [x0]     // p0
-
-    h264_loop_filter_luma_intra
-
-    sub             x0,  x0,  x1, lsl #1
-    st1             {v5.16b}, [x0], x1  // p2
-    st1             {v6.16b}, [x0], x1  // p1
-    st1             {v7.16b}, [x0], x1  // p0
-    st1             {v0.16b}, [x0], x1  // q0
-    st1             {v1.16b}, [x0], x1  // q1
-    st1             {v2.16b}, [x0]      // q2
-9:
-    ret
-endfunc
-
-function x264_deblock_h_luma_intra_neon, export=1
-    h264_loop_filter_start_intra
-
-    sub             x0,  x0,  #4
-    ld1             {v4.8b},  [x0], x1
-    ld1             {v5.8b},  [x0], x1
-    ld1             {v6.8b},  [x0], x1
-    ld1             {v7.8b},  [x0], x1
-    ld1             {v0.8b},  [x0], x1
-    ld1             {v1.8b},  [x0], x1
-    ld1             {v2.8b},  [x0], x1
-    ld1             {v3.8b},  [x0], x1
-    ld1             {v4.d}[1],  [x0], x1
-    ld1             {v5.d}[1],  [x0], x1
-    ld1             {v6.d}[1],  [x0], x1
-    ld1             {v7.d}[1],  [x0], x1
-    ld1             {v0.d}[1],  [x0], x1
-    ld1             {v1.d}[1],  [x0], x1
-    ld1             {v2.d}[1],  [x0], x1
-    ld1             {v3.d}[1],  [x0], x1
-
-    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
-
-    h264_loop_filter_luma_intra
-
-    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
-
-    sub             x0,  x0,  x1, lsl #4
-    st1             {v4.8b},  [x0], x1
-    st1             {v5.8b},  [x0], x1
-    st1             {v6.8b},  [x0], x1
-    st1             {v7.8b},  [x0], x1
-    st1             {v0.8b},  [x0], x1
-    st1             {v1.8b},  [x0], x1
-    st1             {v2.8b},  [x0], x1
-    st1             {v3.8b},  [x0], x1
-    st1             {v4.d}[1],  [x0], x1
-    st1             {v5.d}[1],  [x0], x1
-    st1             {v6.d}[1],  [x0], x1
-    st1             {v7.d}[1],  [x0], x1
-    st1             {v0.d}[1],  [x0], x1
-    st1             {v1.d}[1],  [x0], x1
-    st1             {v2.d}[1],  [x0], x1
-    st1             {v3.d}[1],  [x0], x1
-9:
-    ret
-endfunc
-
-.macro h264_loop_filter_chroma
-    dup             v22.16b, w2              // alpha
-    uxtl            v24.8h,  v24.8b
-    uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
-    uxtl            v4.8h,   v0.8b
-    uxtl2           v5.8h,   v0.16b
-    uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
-    usubw           v4.8h,   v4.8h,   v16.8b
-    usubw2          v5.8h,   v5.8h,   v16.16b
-    sli             v24.8h,  v24.8h,  #8
-    shl             v4.8h,   v4.8h,   #2
-    shl             v5.8h,   v5.8h,   #2
-    uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
-    uxtl            v24.4s,  v24.4h
-    uaddw           v4.8h,   v4.8h,   v18.8b
-    uaddw2          v5.8h,   v5.8h,   v18.16b
-    cmhi            v26.16b, v22.16b, v26.16b  // < alpha
-    usubw           v4.8h,   v4.8h,   v2.8b
-    usubw2          v5.8h,   v5.8h,   v2.16b
-    sli             v24.4s,  v24.4s,  #16
-    dup             v22.16b, w3              // beta
-    rshrn           v4.8b,   v4.8h,   #3
-    rshrn2          v4.16b,  v5.8h,   #3
-    cmhi            v28.16b, v22.16b, v28.16b  // < beta
-    cmhi            v30.16b, v22.16b, v30.16b  // < beta
-    smin            v4.16b,  v4.16b,  v24.16b
-    neg             v25.16b, v24.16b
-    and             v26.16b, v26.16b, v28.16b
-    smax            v4.16b,  v4.16b,  v25.16b
-    and             v26.16b, v26.16b, v30.16b
-    uxtl            v22.8h,  v0.8b
-    uxtl2           v23.8h,  v0.16b
-    and             v4.16b,  v4.16b,  v26.16b
-    uxtl            v28.8h,  v16.8b
-    uxtl2           v29.8h,  v16.16b
-    saddw           v28.8h,  v28.8h,  v4.8b
-    saddw2          v29.8h,  v29.8h,  v4.16b
-    ssubw           v22.8h,  v22.8h,  v4.8b
-    ssubw2          v23.8h,  v23.8h,  v4.16b
-    sqxtun          v16.8b,  v28.8h
-    sqxtun          v0.8b,   v22.8h
-    sqxtun2         v16.16b, v29.8h
-    sqxtun2         v0.16b,  v23.8h
-.endm
-
-function x264_deblock_v_chroma_neon, export=1
-    h264_loop_filter_start
-
-    sub             x0,  x0,  x1, lsl #1
-    ld1             {v18.16b}, [x0], x1
-    ld1             {v16.16b}, [x0], x1
-    ld1             {v0.16b},  [x0], x1
-    ld1             {v2.16b},  [x0]
-
-    h264_loop_filter_chroma
-
-    sub             x0,  x0,  x1, lsl #1
-    st1             {v16.16b}, [x0], x1
-    st1             {v0.16b},  [x0], x1
-
-    ret
-endfunc
-
-function x264_deblock_h_chroma_neon, export=1
-    h264_loop_filter_start
-
-    sub             x0,  x0,  #4
-deblock_h_chroma:
-    ld1             {v18.d}[0], [x0], x1
-    ld1             {v16.d}[0], [x0], x1
-    ld1             {v0.d}[0],  [x0], x1
-    ld1             {v2.d}[0],  [x0], x1
-    ld1             {v18.d}[1], [x0], x1
-    ld1             {v16.d}[1], [x0], x1
-    ld1             {v0.d}[1],  [x0], x1
-    ld1             {v2.d}[1],  [x0], x1
-
-    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31
-
-    h264_loop_filter_chroma
-
-    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31
-
-    sub             x0,  x0,  x1, lsl #3
-    st1             {v18.d}[0], [x0], x1
-    st1             {v16.d}[0], [x0], x1
-    st1             {v0.d}[0],  [x0], x1
-    st1             {v2.d}[0],  [x0], x1
-    st1             {v18.d}[1], [x0], x1
-    st1             {v16.d}[1], [x0], x1
-    st1             {v0.d}[1],  [x0], x1
-    st1             {v2.d}[1],  [x0], x1
-
-    ret
-endfunc
-
-function x264_deblock_h_chroma_422_neon, export=1
-    add             x5,  x0,  x1
-    sub             x0,  x0,  #4
-    add             x1,  x1,  x1
-    h264_loop_filter_start
-    mov             x7,  x30
-    bl              deblock_h_chroma
-    mov             x30, x7
-    sub             x0,  x5,  #4
-    mov             v24.s[0], w6
-    b               deblock_h_chroma
-endfunc
-
-.macro h264_loop_filter_chroma8
-    dup             v22.8b,  w2                 // alpha
-    uxtl            v24.8h,  v24.8b
-    uabd            v26.8b,  v16.8b,  v17.8b    // abs(p0 - q0)
-    uxtl            v4.8h,   v17.8b
-    uabd            v28.8b,  v18.8b,  v16.8b    // abs(p1 - p0)
-    usubw           v4.8h,   v4.8h,   v16.8b
-    sli             v24.8h,  v24.8h,  #8
-    shl             v4.8h,   v4.8h,   #2
-    uabd            v30.8b,  v19.8b,  v17.8b    // abs(q1 - q0)
-    uaddw           v4.8h,   v4.8h,   v18.8b
-    cmhi            v26.8b,  v22.8b,  v26.8b    // < alpha
-    usubw           v4.8h,   v4.8h,   v19.8b
-    dup             v22.8b,  w3                 // beta
-    rshrn           v4.8b,   v4.8h,   #3
-    cmhi            v28.8b,  v22.8b,  v28.8b    // < beta
-    cmhi            v30.8b,  v22.8b,  v30.8b    // < beta
-    smin            v4.8b,   v4.8b,   v24.8b
-    neg             v25.8b,  v24.8b
-    and             v26.8b,  v26.8b,  v28.8b
-    smax            v4.8b,   v4.8b,   v25.8b
-    and             v26.8b,  v26.8b,  v30.8b
-    uxtl            v22.8h,  v17.8b
-    and             v4.8b,   v4.8b,   v26.8b
-    uxtl            v28.8h,  v16.8b
-    saddw           v28.8h,  v28.8h,  v4.8b
-    ssubw           v22.8h,  v22.8h,  v4.8b
-    sqxtun          v16.8b,  v28.8h
-    sqxtun          v17.8b,  v22.8h
-.endm
-
-function x264_deblock_h_chroma_mbaff_neon, export=1
-    h264_loop_filter_start
-
-    sub             x4,  x0,  #4
-    sub             x0,  x0,  #2
-
-    ld1             {v18.8b}, [x4], x1
-    ld1             {v16.8b}, [x4], x1
-    ld1             {v17.8b},  [x4], x1
-    ld1             {v19.8b},  [x4]
-
-    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31
-
-    h264_loop_filter_chroma8
-
-    st2             {v16.h,v17.h}[0], [x0], x1
-    st2             {v16.h,v17.h}[1], [x0], x1
-    st2             {v16.h,v17.h}[2], [x0], x1
-    st2             {v16.h,v17.h}[3], [x0]
-
-    ret
-endfunc
-
-.macro h264_loop_filter_chroma_intra width=16
-    uabd            v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
-    uabd            v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
-    uabd            v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
-    cmhi            v26.16b, v30.16b, v26.16b  // < alpha
-    cmhi            v27.16b, v31.16b, v27.16b  // < beta
-    cmhi            v28.16b, v31.16b, v28.16b  // < beta
-    and             v26.16b, v26.16b, v27.16b
-    and             v26.16b, v26.16b, v28.16b
-
-    ushll           v4.8h,   v18.8b,  #1
-    ushll           v6.8h,   v19.8b,  #1
-.ifc \width, 16
-    ushll2          v5.8h,   v18.16b, #1
-    ushll2          v7.8h,   v19.16b, #1
-    uaddl2          v21.8h,  v16.16b, v19.16b
-    uaddl2          v23.8h,  v17.16b, v18.16b
-.endif
-    uaddl           v20.8h,  v16.8b,  v19.8b
-    uaddl           v22.8h,  v17.8b,  v18.8b
-    add             v20.8h,  v20.8h,  v4.8h     // mlal?
-    add             v22.8h,  v22.8h,  v6.8h
-.ifc \width, 16
-    add             v21.8h,  v21.8h,  v5.8h
-    add             v23.8h,  v23.8h,  v7.8h
-.endif
-    uqrshrn         v24.8b,  v20.8h,  #2
-    uqrshrn         v25.8b,  v22.8h,  #2
-.ifc \width, 16
-    uqrshrn2        v24.16b, v21.8h,  #2
-    uqrshrn2        v25.16b, v23.8h,  #2
-.endif
-    bit             v16.16b, v24.16b, v26.16b
-    bit             v17.16b, v25.16b, v26.16b
-.endm
-
-function x264_deblock_v_chroma_intra_neon, export=1
-    h264_loop_filter_start_intra
-
-    sub             x0,  x0,  x1, lsl #1
-    ld1             {v18.16b}, [x0], x1
-    ld1             {v16.16b}, [x0], x1
-    ld1             {v17.16b}, [x0], x1
-    ld1             {v19.16b}, [x0]
-
-    h264_loop_filter_chroma_intra
-
-    sub             x0,  x0,  x1, lsl #1
-    st1             {v16.16b}, [x0], x1
-    st1             {v17.16b}, [x0], x1
-
-    ret
-endfunc
-
-function x264_deblock_h_chroma_intra_mbaff_neon, export=1
-    h264_loop_filter_start_intra
-
-    sub             x4,  x0,  #4
-    sub             x0,  x0,  #2
-    ld1             {v18.8b}, [x4], x1
-    ld1             {v16.8b}, [x4], x1
-    ld1             {v17.8b}, [x4], x1
-    ld1             {v19.8b}, [x4], x1
-
-    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29
-
-    h264_loop_filter_chroma_intra width=8
-
-    st2             {v16.h,v17.h}[0], [x0], x1
-    st2             {v16.h,v17.h}[1], [x0], x1
-    st2             {v16.h,v17.h}[2], [x0], x1
-    st2             {v16.h,v17.h}[3], [x0], x1
-
-    ret
-endfunc
-
-function x264_deblock_h_chroma_intra_neon, export=1
-    h264_loop_filter_start_intra
-
-    sub             x4,  x0,  #4
-    sub             x0,  x0,  #2
-    ld1             {v18.d}[0], [x4], x1
-    ld1             {v16.d}[0], [x4], x1
-    ld1             {v17.d}[0], [x4], x1
-    ld1             {v19.d}[0], [x4], x1
-    ld1             {v18.d}[1], [x4], x1
-    ld1             {v16.d}[1], [x4], x1
-    ld1             {v17.d}[1], [x4], x1
-    ld1             {v19.d}[1], [x4], x1
-
-    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
-
-    h264_loop_filter_chroma_intra
-
-    st2             {v16.h,v17.h}[0], [x0], x1
-    st2             {v16.h,v17.h}[1], [x0], x1
-    st2             {v16.h,v17.h}[2], [x0], x1
-    st2             {v16.h,v17.h}[3], [x0], x1
-    st2             {v16.h,v17.h}[4], [x0], x1
-    st2             {v16.h,v17.h}[5], [x0], x1
-    st2             {v16.h,v17.h}[6], [x0], x1
-    st2             {v16.h,v17.h}[7], [x0], x1
-
-    ret
-endfunc
-
-function x264_deblock_h_chroma_422_intra_neon, export=1
-    h264_loop_filter_start_intra
-
-    sub             x4,  x0,  #4
-    sub             x0,  x0,  #2
-    ld1             {v18.d}[0], [x4], x1
-    ld1             {v16.d}[0], [x4], x1
-    ld1             {v17.d}[0], [x4], x1
-    ld1             {v19.d}[0], [x4], x1
-    ld1             {v18.d}[1], [x4], x1
-    ld1             {v16.d}[1], [x4], x1
-    ld1             {v17.d}[1], [x4], x1
-    ld1             {v19.d}[1], [x4], x1
-
-    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
-
-    h264_loop_filter_chroma_intra
-
-    st2             {v16.h,v17.h}[0], [x0], x1
-    st2             {v16.h,v17.h}[1], [x0], x1
-    st2             {v16.h,v17.h}[2], [x0], x1
-    st2             {v16.h,v17.h}[3], [x0], x1
-    st2             {v16.h,v17.h}[4], [x0], x1
-    st2             {v16.h,v17.h}[5], [x0], x1
-    st2             {v16.h,v17.h}[6], [x0], x1
-    st2             {v16.h,v17.h}[7], [x0], x1
-
-    ld1             {v18.d}[0], [x4], x1
-    ld1             {v16.d}[0], [x4], x1
-    ld1             {v17.d}[0], [x4], x1
-    ld1             {v19.d}[0], [x4], x1
-    ld1             {v18.d}[1], [x4], x1
-    ld1             {v16.d}[1], [x4], x1
-    ld1             {v17.d}[1], [x4], x1
-    ld1             {v19.d}[1], [x4], x1
-
-    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
-
-    h264_loop_filter_chroma_intra
-
-    st2             {v16.h,v17.h}[0], [x0], x1
-    st2             {v16.h,v17.h}[1], [x0], x1
-    st2             {v16.h,v17.h}[2], [x0], x1
-    st2             {v16.h,v17.h}[3], [x0], x1
-    st2             {v16.h,v17.h}[4], [x0], x1
-    st2             {v16.h,v17.h}[5], [x0], x1
-    st2             {v16.h,v17.h}[6], [x0], x1
-    st2             {v16.h,v17.h}[7], [x0], x1
-
-    ret
-endfunc
-
-//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
-//                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-//                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
-//                                uint8_t bs[2][8][4], int mvy_limit,
-//                                int bframe )
-function x264_deblock_strength_neon, export=1
-    movi        v4.16b, #0
-    lsl         w4,  w4,  #8
-    add         x3,  x3,  #32
-    sub         w4,  w4,  #(1<<8)-3
-    movi        v5.16b, #0
-    dup         v6.8h,  w4
-    mov         x6,  #-32
-
-bframe:
-    // load bytes ref
-    add         x2,  x2,  #16
-    ld1        {v31.d}[1], [x1], #8
-    ld1        {v1.16b}, [x1], #16
-    movi        v0.16b,  #0
-    ld1        {v2.16b}, [x1], #16
-    ext         v3.16b,  v0.16b,  v1.16b,  #15
-    ext         v0.16b,  v0.16b,  v2.16b,  #15
-    unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
-    unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
-    ext         v21.16b, v31.16b, v22.16b, #12
-
-    eor         v0.16b,  v20.16b, v22.16b
-    eor         v1.16b,  v21.16b, v22.16b
-    orr         v4.16b,  v4.16b,  v0.16b
-    orr         v5.16b,  v5.16b,  v1.16b
-
-    ld1        {v21.8h}, [x2], #16      // mv + 0x10
-    ld1        {v19.8h}, [x2], #16      // mv + 0x20
-    ld1        {v22.8h}, [x2], #16      // mv + 0x30
-    ld1        {v18.8h}, [x2], #16      // mv + 0x40
-    ld1        {v23.8h}, [x2], #16      // mv + 0x50
-    ext         v19.16b, v19.16b, v22.16b, #12
-    ext         v18.16b, v18.16b, v23.16b, #12
-    sabd        v0.8h,   v22.8h,  v19.8h
-    ld1        {v19.8h}, [x2], #16      // mv + 0x60
-    sabd        v1.8h,   v23.8h,  v18.8h
-    ld1        {v24.8h}, [x2], #16      // mv + 0x70
-    uqxtn       v0.8b,   v0.8h
-    ld1        {v18.8h}, [x2], #16      // mv + 0x80
-    ld1        {v25.8h}, [x2], #16      // mv + 0x90
-    uqxtn2      v0.16b,  v1.8h
-    ext         v19.16b, v19.16b, v24.16b, #12
-    ext         v18.16b, v18.16b, v25.16b, #12
-    sabd        v1.8h,   v24.8h,  v19.8h
-    sabd        v2.8h,   v25.8h,  v18.8h
-    uqxtn       v1.8b,   v1.8h
-    uqxtn2      v1.16b,  v2.8h
-
-    uqsub       v0.16b,  v0.16b,  v6.16b
-    uqsub       v1.16b,  v1.16b,  v6.16b
-    uqxtn       v0.8b,   v0.8h
-    uqxtn2      v0.16b,  v1.8h
-
-    sabd        v1.8h,   v22.8h,  v23.8h
-    orr         v4.16b,  v4.16b,  v0.16b
-
-    sabd        v0.8h,   v21.8h,  v22.8h
-    sabd        v2.8h,   v23.8h,  v24.8h
-    sabd        v3.8h,   v24.8h,  v25.8h
-    uqxtn       v0.8b,   v0.8h
-    uqxtn2      v0.16b,  v1.8h
-    uqxtn       v1.8b,   v2.8h
-    uqxtn2      v1.16b,  v3.8h
-
-    uqsub       v0.16b,  v0.16b,  v6.16b
-    uqsub       v1.16b,  v1.16b,  v6.16b
-    uqxtn       v0.8b,   v0.8h
-    uqxtn2      v0.16b,  v1.8h
-    subs        w5,  w5,  #1
-    orr         v5.16b,  v5.16b,  v0.16b
-    b.eq        bframe
-
-    movi        v6.16b, #1
-    // load bytes nnz
-    ld1        {v31.d}[1], [x0], #8
-    ld1        {v1.16b}, [x0], #16
-    movi        v0.16b,  #0
-    ld1        {v2.16b}, [x0], #16
-    ext         v3.16b,  v0.16b,  v1.16b,  #15
-    ext         v0.16b,  v0.16b,  v2.16b,  #15
-    unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
-    unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
-    ext         v21.16b, v31.16b, v22.16b, #12
-
-    movrel      x7,  transpose_table
-    ld1        {v7.16b}, [x7]
-    orr         v0.16b,  v20.16b, v22.16b
-    orr         v1.16b,  v21.16b, v22.16b
-    umin        v0.16b,  v0.16b,  v6.16b
-    umin        v1.16b,  v1.16b,  v6.16b
-    umin        v4.16b,  v4.16b,  v6.16b        // mv ? 1 : 0
-    umin        v5.16b,  v5.16b,  v6.16b
-    add         v0.16b,  v0.16b,  v0.16b        // nnz ? 2 : 0
-    add         v1.16b,  v1.16b,  v1.16b
-    umax        v4.16b,  v4.16b,  v0.16b
-    umax        v5.16b,  v5.16b,  v1.16b
-    tbl         v6.16b, {v4.16b}, v7.16b
-    st1        {v5.16b}, [x3], x6       // bs[1]
-    st1        {v6.16b}, [x3]           // bs[0]
-    ret
-endfunc
-
-const transpose_table
-    .byte 0, 4,  8, 12
-    .byte 1, 5,  9, 13
-    .byte 2, 6, 10, 14
-    .byte 3, 7, 11, 15
-endconst
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/mc-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/mc-a.S
deleted file mode 100755
index 3a99fbe..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/mc-a.S
+++ /dev/null
@@ -1,1754 +0,0 @@
-/*****************************************************************************
- * mc.S: aarch64 motion compensation
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *          Mans Rullgard <mans@mansr.com>
- *          Stefan Groenroos <stefan.gronroos@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-// note: prefetch stuff assumes 64-byte cacheline
-
-// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
-function x264_prefetch_ref_aarch64, export=1
-    cmp         w2,  #1
-    csel        x2,  xzr, x1, eq
-    add         x0,  x0,  #64
-    add         x0,  x0,  x2,  lsl #3
-
-    lsl         x2,  x1,  #1
-    add         x3,  x1,  x1,  lsl #1
-    add         x4,  x0,  x1,  lsl #2
-
-    prfm        pldl1strm, [x0]
-    prfm        pldl1strm, [x0,  x1]
-    prfm        pldl1strm, [x0,  x2]
-    prfm        pldl1strm, [x0,  x3]
-    prfm        pldl1strm, [x4]
-    prfm        pldl1strm, [x4,  x1]
-    prfm        pldl1strm, [x4,  x2]
-    prfm        pldl1strm, [x4,  x3]
-    ret
-endfunc
-
-// void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
-//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
-.macro x264_prefetch_fenc sub
-function x264_prefetch_fenc_\sub\()_aarch64, export=1
-    and         w6,  w5,  #3
-    and         w7,  w5,  #3
-    mul         x6,  x6,  x1
-    mul         x7,  x7,  x3
-    add         x0,  x0,  #64
-    add         x2,  x2,  #64
-
-    add         x0,  x0,  x6,  lsl #2
-    add         x6,  x0,  x1,  lsl #1
-    prfm        pldl1strm, [x0]
-    prfm        pldl1strm, [x0,  x1]
-    prfm        pldl1strm, [x6]
-    prfm        pldl1strm, [x6, x1]
-
-    add         x2,  x2,  x7,  lsl #1
-    prfm        pldl1strm, [x2]
-    prfm        pldl1strm, [x2,  x3]
-.ifc \sub, 422
-    add         x7,  x2,  x3,  lsl #1
-    prfm        pldl1strm, [x7]
-    prfm        pldl1strm, [x7,  x3]
-.endif
-    ret
-endfunc
-.endm
-
-x264_prefetch_fenc 420
-x264_prefetch_fenc 422
-
-// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
-//                 uint8_t *src1, intptr_t src1_stride,
-//                 uint8_t *src2, intptr_t src2_stride, int weight );
-.macro AVGH w h
-function x264_pixel_avg_\w\()x\h\()_neon, export=1
-    mov         w10, #64
-    cmp         w6,  #32
-    mov         w9, #\h
-    b.eq        pixel_avg_w\w\()_neon
-    subs        w7,  w10,  w6
-    b.lt        pixel_avg_weight_w\w\()_add_sub_neon     // weight > 64
-    cmp         w6,  #0
-    b.ge        pixel_avg_weight_w\w\()_add_add_neon
-    b           pixel_avg_weight_w\w\()_sub_add_neon     // weight < 0
-endfunc
-.endm
-
-AVGH  4, 2
-AVGH  4, 4
-AVGH  4, 8
-AVGH  4, 16
-AVGH  8, 4
-AVGH  8, 8
-AVGH  8, 16
-AVGH 16, 8
-AVGH 16, 16
-
-// 0 < weight < 64
-.macro load_weights_add_add
-    mov         w6,  w6
-.endm
-.macro weight_add_add dst, s1, s2, h=
-.ifc \h, 2
-    umull2      \dst, \s1, v30.16b
-    umlal2      \dst, \s2, v31.16b
-.else
-    umull       \dst, \s1, v30.8b
-    umlal       \dst, \s2, v31.8b
-.endif
-.endm
-
-// weight > 64
-.macro load_weights_add_sub
-    neg         w7,  w7
-.endm
-.macro weight_add_sub dst, s1, s2, h=
-.ifc \h, 2
-    umull2      \dst, \s1, v30.16b
-    umlsl2      \dst, \s2, v31.16b
-.else
-    umull       \dst, \s1, v30.8b
-    umlsl       \dst, \s2, v31.8b
-.endif
-.endm
-
-// weight < 0
-.macro load_weights_sub_add
-    neg         w6,  w6
-.endm
-.macro weight_sub_add dst, s1, s2, h=
-.ifc \h, 2
-    umull2      \dst, \s2, v31.16b
-    umlsl2      \dst, \s1, v30.16b
-.else
-    umull       \dst, \s2, v31.8b
-    umlsl       \dst, \s1, v30.8b
-.endif
-.endm
-
-.macro AVG_WEIGHT ext
-function pixel_avg_weight_w4_\ext\()_neon
-    load_weights_\ext
-    dup         v30.8b, w6
-    dup         v31.8b, w7
-1:  // height loop
-    subs        w9,  w9,  #2
-    ld1        {v0.s}[0], [x2], x3
-    ld1        {v1.s}[0], [x4], x5
-    weight_\ext v4.8h,  v0.8b,  v1.8b
-    ld1        {v2.s}[0], [x2], x3
-    ld1        {v3.s}[0], [x4], x5
-    sqrshrun    v0.8b,  v4.8h,  #6
-    weight_\ext v5.8h,  v2.8b,  v3.8b
-    st1        {v0.s}[0], [x0], x1
-    sqrshrun    v1.8b,  v5.8h,  #6
-    st1        {v1.s}[0], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function pixel_avg_weight_w8_\ext\()_neon
-    load_weights_\ext
-    dup         v30.8b, w6
-    dup         v31.8b, w7
-1:  // height loop
-    subs        w9,  w9,  #4
-    ld1        {v0.8b}, [x2], x3
-    ld1        {v1.8b}, [x4], x5
-    weight_\ext v16.8h, v0.8b,  v1.8b
-    ld1        {v2.8b}, [x2], x3
-    ld1        {v3.8b}, [x4], x5
-    weight_\ext v17.8h, v2.8b,  v3.8b
-    ld1        {v4.8b}, [x2], x3
-    ld1        {v5.8b}, [x4], x5
-    weight_\ext v18.8h, v4.8b,  v5.8b
-    ld1        {v6.8b}, [x2], x3
-    ld1        {v7.8b}, [x4], x5
-    weight_\ext v19.8h, v6.8b,  v7.8b
-    sqrshrun    v0.8b,  v16.8h, #6
-    sqrshrun    v1.8b,  v17.8h, #6
-    sqrshrun    v2.8b,  v18.8h, #6
-    sqrshrun    v3.8b,  v19.8h, #6
-    st1        {v0.8b}, [x0], x1
-    st1        {v1.8b}, [x0], x1
-    st1        {v2.8b}, [x0], x1
-    st1        {v3.8b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function pixel_avg_weight_w16_\ext\()_neon
-    load_weights_\ext
-    dup         v30.16b, w6
-    dup         v31.16b, w7
-1:  // height loop
-    subs        w9,  w9,  #2
-    ld1        {v0.16b}, [x2], x3
-    ld1        {v1.16b}, [x4], x5
-    weight_\ext v16.8h, v0.8b,  v1.8b
-    weight_\ext v17.8h, v0.16b, v1.16b, 2
-    ld1        {v2.16b}, [x2], x3
-    ld1        {v3.16b}, [x4], x5
-    weight_\ext v18.8h, v2.8b,  v3.8b
-    weight_\ext v19.8h, v2.16b, v3.16b, 2
-    sqrshrun    v0.8b,  v16.8h, #6
-    sqrshrun    v1.8b,  v18.8h, #6
-    sqrshrun2   v0.16b, v17.8h, #6
-    sqrshrun2   v1.16b, v19.8h, #6
-    st1        {v0.16b}, [x0], x1
-    st1        {v1.16b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-.endm
-
-AVG_WEIGHT add_add
-AVG_WEIGHT add_sub
-AVG_WEIGHT sub_add
-
-function pixel_avg_w4_neon
-1:  subs        w9,  w9,  #2
-    ld1        {v0.s}[0], [x2], x3
-    ld1        {v2.s}[0], [x4], x5
-    urhadd      v0.8b,  v0.8b,  v2.8b
-    ld1        {v1.s}[0], [x2], x3
-    ld1        {v3.s}[0], [x4], x5
-    urhadd      v1.8b,  v1.8b,  v3.8b
-    st1        {v0.s}[0], [x0], x1
-    st1        {v1.s}[0], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function pixel_avg_w8_neon
-1:  subs        w9,  w9,  #4
-    ld1        {v0.8b}, [x2], x3
-    ld1        {v1.8b}, [x4], x5
-    ld1        {v2.8b}, [x2], x3
-    urhadd      v0.8b,  v0.8b,  v1.8b
-    ld1        {v3.8b}, [x4], x5
-    st1        {v0.8b}, [x0], x1
-    ld1        {v4.8b}, [x2], x3
-    urhadd      v1.8b,  v2.8b,  v3.8b
-    ld1        {v5.8b}, [x4], x5
-    st1        {v1.8b}, [x0], x1
-    ld1        {v6.8b}, [x2], x3
-    ld1        {v7.8b}, [x4], x5
-    urhadd      v2.8b,  v4.8b,  v5.8b
-    urhadd      v3.8b,  v6.8b,  v7.8b
-    st1        {v2.8b}, [x0], x1
-    st1        {v3.8b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function pixel_avg_w16_neon
-1:  subs        w9,  w9,  #4
-    ld1        {v0.16b}, [x2], x3
-    ld1        {v1.16b}, [x4], x5
-    ld1        {v2.16b}, [x2], x3
-    urhadd      v0.16b, v0.16b, v1.16b
-    ld1        {v3.16b}, [x4], x5
-    st1        {v0.16b}, [x0], x1
-    ld1        {v4.16b}, [x2], x3
-    urhadd      v1.16b, v2.16b, v3.16b
-    ld1        {v5.16b}, [x4], x5
-    st1        {v1.16b}, [x0], x1
-    ld1        {v6.16b}, [x2], x3
-    ld1        {v7.16b}, [x4], x5
-    urhadd      v2.16b, v4.16b, v5.16b
-    urhadd      v3.16b, v6.16b, v7.16b
-    st1        {v2.16b}, [x0], x1
-    st1        {v3.16b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_pixel_avg2_w4_neon, export=1
-1:
-    subs        w5,  w5,  #2
-    ld1        {v0.s}[0],  [x2], x3
-    ld1        {v2.s}[0],  [x4], x3
-    urhadd      v0.8b,  v0.8b,  v2.8b
-    ld1        {v1.s}[0],  [x2], x3
-    ld1        {v3.s}[0],  [x4], x3
-    urhadd      v1.8b,  v1.8b,  v3.8b
-    st1        {v0.s}[0], [x0], x1
-    st1        {v1.s}[0], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_pixel_avg2_w8_neon, export=1
-1:
-    subs        w5,  w5,  #2
-    ld1        {v0.8b}, [x2], x3
-    ld1        {v2.8b}, [x4], x3
-    urhadd      v0.8b,  v0.8b,  v2.8b
-    ld1        {v1.8b}, [x2], x3
-    ld1        {v3.8b}, [x4], x3
-    urhadd      v1.8b,  v1.8b,  v3.8b
-    st1        {v0.8b}, [x0], x1
-    st1        {v1.8b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_pixel_avg2_w16_neon, export=1
-1:
-    subs        w5,  w5,  #2
-    ld1        {v0.16b}, [x2], x3
-    ld1        {v2.16b}, [x4], x3
-    urhadd      v0.16b, v0.16b, v2.16b
-    ld1        {v1.16b}, [x2], x3
-    ld1        {v3.16b}, [x4], x3
-    urhadd      v1.16b, v1.16b, v3.16b
-    st1        {v0.16b}, [x0], x1
-    st1        {v1.16b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_pixel_avg2_w20_neon, export=1
-    sub         x1,  x1,  #16
-1:
-    subs        w5,  w5,  #2
-    ld1        {v0.16b,v1.16b}, [x2], x3
-    ld1        {v2.16b,v3.16b}, [x4], x3
-    urhadd      v0.16b, v0.16b, v2.16b
-    urhadd      v1.8b,  v1.8b,  v3.8b
-    ld1        {v4.16b,v5.16b}, [x2], x3
-    ld1        {v6.16b,v7.16b}, [x4], x3
-    urhadd      v4.16b, v4.16b, v6.16b
-    urhadd      v5.8b,  v5.8b,  v7.8b
-    st1        {v0.16b},  [x0], #16
-    st1        {v1.s}[0], [x0], x1
-    st1        {v4.16b},  [x0], #16
-    st1        {v5.s}[0], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-.macro weight_prologue type
-    mov         w9,  w5                 // height
-.ifc \type, full
-    ldr         w12, [x4, #32]          // denom
-.endif
-    ldp         w4,  w5,  [x4, #32+4]   // scale, offset
-    dup         v0.16b, w4
-    dup         v1.8h,  w5
-.ifc \type, full
-    neg         w12, w12
-    dup         v2.8h,  w12
-.endif
-.endm
-
-// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
-//                 intptr_t dst_stride, const x264_weight_t *weight, int h )
-function x264_mc_weight_w20_neon, export=1
-    weight_prologue full
-    sub         x1,  x1,  #16
-1:
-    subs        w9,  w9,  #2
-    ld1        {v16.8b,v17.8b,v18.8b}, [x2], x3
-    ld1        {v19.8b,v20.8b,v21.8b}, [x2], x3
-    umull       v22.8h, v16.8b, v0.8b
-    umull       v23.8h, v17.8b, v0.8b
-    zip1        v18.2s, v18.2s, v21.2s
-    umull       v25.8h, v19.8b, v0.8b
-    umull       v26.8h, v20.8b, v0.8b
-    umull       v24.8h, v18.8b, v0.8b
-    srshl       v22.8h, v22.8h, v2.8h
-    srshl       v23.8h, v23.8h, v2.8h
-    srshl       v24.8h, v24.8h, v2.8h
-    srshl       v25.8h, v25.8h, v2.8h
-    srshl       v26.8h, v26.8h, v2.8h
-    add         v22.8h, v22.8h, v1.8h
-    add         v23.8h, v23.8h, v1.8h
-    add         v24.8h, v24.8h, v1.8h
-    add         v25.8h, v25.8h, v1.8h
-    add         v26.8h, v26.8h, v1.8h
-    sqxtun      v4.8b,  v22.8h
-    sqxtun2     v4.16b, v23.8h
-    sqxtun      v6.8b,  v24.8h
-    sqxtun      v5.8b,  v25.8h
-    sqxtun2     v5.16b, v26.8h
-    st1        {v4.16b},  [x0], #16
-    st1        {v6.s}[0], [x0], x1
-    st1        {v5.16b},  [x0], #16
-    st1        {v6.s}[1], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w16_neon, export=1
-    weight_prologue full
-weight16_loop:
-1:
-    subs        w9,  w9,  #2
-    ld1        {v4.16b}, [x2], x3
-    ld1        {v5.16b}, [x2], x3
-    umull       v22.8h, v4.8b,  v0.8b
-    umull2      v23.8h, v4.16b, v0.16b
-    umull       v24.8h, v5.8b,  v0.8b
-    umull2      v25.8h, v5.16b, v0.16b
-    srshl       v22.8h, v22.8h, v2.8h
-    srshl       v23.8h, v23.8h, v2.8h
-    srshl       v24.8h, v24.8h, v2.8h
-    srshl       v25.8h, v25.8h, v2.8h
-    add         v22.8h, v22.8h, v1.8h
-    add         v23.8h, v23.8h, v1.8h
-    add         v24.8h, v24.8h, v1.8h
-    add         v25.8h, v25.8h, v1.8h
-    sqxtun      v4.8b,  v22.8h
-    sqxtun2     v4.16b, v23.8h
-    sqxtun      v5.8b,  v24.8h
-    sqxtun2     v5.16b, v25.8h
-    st1        {v4.16b}, [x0], x1
-    st1        {v5.16b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w8_neon, export=1
-    weight_prologue full
-1:
-    subs        w9,  w9,  #2
-    ld1        {v16.8b}, [x2], x3
-    ld1        {v17.8b}, [x2], x3
-    umull       v4.8h,  v16.8b, v0.8b
-    umull       v5.8h,  v17.8b, v0.8b
-    srshl       v4.8h,  v4.8h,  v2.8h
-    srshl       v5.8h,  v5.8h,  v2.8h
-    add         v4.8h,  v4.8h,  v1.8h
-    add         v5.8h,  v5.8h,  v1.8h
-    sqxtun      v16.8b, v4.8h
-    sqxtun      v17.8b, v5.8h
-    st1        {v16.8b}, [x0], x1
-    st1        {v17.8b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w4_neon, export=1
-    weight_prologue full
-1:
-    subs        w9,  w9,  #2
-    ld1        {v16.s}[0], [x2], x3
-    ld1        {v16.s}[1], [x2], x3
-    umull       v4.8h,  v16.8b, v0.8b
-    srshl       v4.8h,  v4.8h,  v2.8h
-    add         v4.8h,  v4.8h,  v1.8h
-    sqxtun      v16.8b, v4.8h
-    st1        {v16.s}[0], [x0], x1
-    st1        {v16.s}[1], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w20_nodenom_neon, export=1
-    weight_prologue nodenom
-    sub         x1,  x1,  #16
-1:
-    subs        w9,  w9,  #2
-    ld1        {v16.8b,v17.8b,v18.8b}, [x2], x3
-    mov         v27.16b, v1.16b
-    mov         v28.16b, v1.16b
-    ld1        {v19.8b,v20.8b,v21.8b}, [x2], x3
-    mov         v31.16b, v1.16b
-    mov         v29.16b, v1.16b
-    mov         v30.16b, v1.16b
-    zip1        v18.2s, v18.2s, v21.2s
-    umlal       v27.8h, v16.8b, v0.8b
-    umlal       v28.8h, v17.8b, v0.8b
-    umlal       v31.8h, v18.8b, v0.8b
-    umlal       v29.8h, v19.8b, v0.8b
-    umlal       v30.8h, v20.8b, v0.8b
-    sqxtun      v4.8b,  v27.8h
-    sqxtun2     v4.16b, v28.8h
-    sqxtun      v5.8b,  v29.8h
-    sqxtun2     v5.16b, v30.8h
-    sqxtun      v6.8b,  v31.8h
-    st1        {v4.16b},  [x0], #16
-    st1        {v6.s}[0], [x0], x1
-    st1        {v5.16b},  [x0], #16
-    st1        {v6.s}[1], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w16_nodenom_neon, export=1
-    weight_prologue nodenom
-1:
-    subs        w9,  w9,  #2
-    ld1        {v6.16b},  [x2], x3
-    mov         v27.16b, v1.16b
-    mov         v28.16b, v1.16b
-    ld1        {v7.16b},  [x2], x3
-    mov         v29.16b, v1.16b
-    mov         v30.16b, v1.16b
-    umlal       v27.8h, v6.8b,  v0.8b
-    umlal2      v28.8h, v6.16b, v0.16b
-    umlal       v29.8h, v7.8b,  v0.8b
-    umlal2      v30.8h, v7.16b, v0.16b
-    sqxtun      v4.8b,  v27.8h
-    sqxtun2     v4.16b, v28.8h
-    sqxtun      v5.8b,  v29.8h
-    sqxtun2     v5.16b, v30.8h
-    st1        {v4.16b},  [x0], x1
-    st1        {v5.16b},  [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w8_nodenom_neon, export=1
-    weight_prologue nodenom
-1:
-    subs        w9,  w9,  #2
-    ld1        {v16.8b}, [x2], x3
-    mov         v27.16b, v1.16b
-    ld1        {v17.8b}, [x2], x3
-    mov         v29.16b, v1.16b
-    umlal       v27.8h, v16.8b, v0.8b
-    umlal       v29.8h, v17.8b, v0.8b
-    sqxtun      v4.8b,  v27.8h
-    sqxtun      v5.8b,  v29.8h
-    st1        {v4.8b},  [x0], x1
-    st1        {v5.8b},  [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w4_nodenom_neon, export=1
-    weight_prologue nodenom
-1:
-    subs        w9,  w9,  #2
-    ld1        {v16.s}[0], [x2], x3
-    ld1        {v16.s}[1], [x2], x3
-    mov         v27.16b, v1.16b
-    umlal       v27.8h, v16.8b, v0.8b
-    sqxtun      v4.8b,  v27.8h
-    st1        {v4.s}[0],  [x0], x1
-    st1        {v4.s}[1],  [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-.macro weight_simple_prologue
-    ldr         w6,  [x4]               // offset
-    dup         v1.16b,  w6
-.endm
-
-.macro weight_simple name op
-function x264_mc_weight_w20_\name\()_neon, export=1
-    weight_simple_prologue
-1:
-    subs        w5,  w5,  #2
-    ldr         s18, [x2, #16]
-    ld1        {v16.16b}, [x2], x3
-    ldr         s19, [x2, #16]
-    ld1        {v17.16b}, [x2], x3
-    \op         v18.8b,  v18.8b,  v1.8b
-    \op         v16.16b, v16.16b, v1.16b
-    \op         v19.8b,  v19.8b,  v1.8b
-    \op         v17.16b, v17.16b, v1.16b
-    str         s18, [x0, #16]
-    st1        {v16.16b}, [x0], x1
-    str         s19, [x0, #16]
-    st1        {v17.16b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w16_\name\()_neon, export=1
-    weight_simple_prologue
-1:
-    subs        w5,  w5,  #2
-    ld1        {v16.16b}, [x2], x3
-    ld1        {v17.16b}, [x2], x3
-    \op         v16.16b, v16.16b, v1.16b
-    \op         v17.16b, v17.16b, v1.16b
-    st1        {v16.16b}, [x0], x1
-    st1        {v17.16b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w8_\name\()_neon, export=1
-    weight_simple_prologue
-1:
-    subs        w5,  w5,  #2
-    ld1        {v16.8b}, [x2], x3
-    ld1        {v17.8b}, [x2], x3
-    \op         v16.8b, v16.8b, v1.8b
-    \op         v17.8b, v17.8b, v1.8b
-    st1        {v16.8b}, [x0], x1
-    st1        {v17.8b}, [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_weight_w4_\name\()_neon, export=1
-    weight_simple_prologue
-1:
-    subs        w5,  w5,  #2
-    ld1        {v16.s}[0], [x2], x3
-    ld1        {v16.s}[1], [x2], x3
-    \op         v16.8b, v16.8b, v1.8b
-    st1        {v16.s}[0], [x0], x1
-    st1        {v16.s}[1], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-.endm
-
-weight_simple offsetadd, uqadd
-weight_simple offsetsub, uqsub
-
-
-// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
-function x264_mc_copy_w4_neon, export=1
-1:
-    subs        w4,  w4,  #4
-    ld1        {v0.s}[0],  [x2],  x3
-    ld1        {v1.s}[0],  [x2],  x3
-    ld1        {v2.s}[0],  [x2],  x3
-    ld1        {v3.s}[0],  [x2],  x3
-    st1        {v0.s}[0],  [x0],  x1
-    st1        {v1.s}[0],  [x0],  x1
-    st1        {v2.s}[0],  [x0],  x1
-    st1        {v3.s}[0],  [x0],  x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_copy_w8_neon, export=1
-1:  subs        w4,  w4,  #4
-    ld1        {v0.8b},  [x2],  x3
-    ld1        {v1.8b},  [x2],  x3
-    ld1        {v2.8b},  [x2],  x3
-    ld1        {v3.8b},  [x2],  x3
-    st1        {v0.8b},  [x0],  x1
-    st1        {v1.8b},  [x0],  x1
-    st1        {v2.8b},  [x0],  x1
-    st1        {v3.8b},  [x0],  x1
-    b.gt        1b
-    ret
-endfunc
-
-function x264_mc_copy_w16_neon, export=1
-1:  subs        w4,  w4,  #4
-    ld1        {v0.16b}, [x2],  x3
-    ld1        {v1.16b}, [x2],  x3
-    ld1        {v2.16b}, [x2],  x3
-    ld1        {v3.16b}, [x2],  x3
-    st1        {v0.16b}, [x0],  x1
-    st1        {v1.16b}, [x0],  x1
-    st1        {v2.16b}, [x0],  x1
-    st1        {v3.16b}, [x0],  x1
-    b.gt        1b
-    ret
-endfunc
-
-// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
-//                           intptr_t i_dst_stride,
-//                           uint8_t *src, intptr_t i_src_stride,
-//                           int dx, int dy, int i_width, int i_height );
-function x264_mc_chroma_neon, export=1
-    ldr         w15, [sp]               // height
-    sbfx        x12, x6,  #3,  #29      // asr(3) and sign extend
-    sbfx        x11, x5,  #3,  #29      // asr(3) and sign extend
-    cmp         w7,  #4
-    mul         x12, x12, x4
-    add         x3,  x3,  x11, lsl #1
-
-    and         w5,  w5,  #7
-    and         w6,  w6,  #7
-
-    add         x3,  x3,  x12
-
-    //pld             [x3]
-    //pld             [x3, x4]
-
-    b.gt        mc_chroma_w8_neon
-    b.eq        mc_chroma_w4_neon
-endfunc
-
-.macro CHROMA_MC_START r00, r01, r10, r11
-    mul         w12, w5,  w6            // cD = d8x    *d8y
-    lsl         w13, w5,  #3
-    add         w9,  w12,  #64
-    lsl         w14, w6,  #3
-    tst         w12, w12
-    sub         w9,  w9,  w13
-    sub         w10, w13, w12           // cB = d8x    *(8-d8y);
-    sub         w11, w14, w12           // cC = (8-d8x)*d8y
-    sub         w9,  w9,  w14           // cA = (8-d8x)*(8-d8y);
-.endm
-
-.macro CHROMA_MC width, vsize
-function mc_chroma_w\width\()_neon
-// since the element size varies, there's a different index for the 2nd store
-.if \width == 4
-    .set st2, 1
-.else
-    .set st2, 2
-.endif
-    CHROMA_MC_START
-    b.eq        2f
-
-    ld2        {v28.8b,v29.8b}, [x3], x4
-    dup         v0.8b,  w9               // cA
-    dup         v1.8b,  w10              // cB
-
-    ext         v6.8b, v28.8b, v6.8b,  #1
-    ext         v7.8b, v29.8b, v7.8b,  #1
-
-    ld2        {v30.8b,v31.8b}, [x3], x4
-    dup         v2.8b,  w11              // cC
-    dup         v3.8b,  w12              // cD
-
-    ext         v22.8b, v30.8b, v22.8b,  #1
-    ext         v23.8b, v31.8b, v23.8b,  #1
-
-    trn1        v0.2s,  v0.2s,  v1.2s
-    trn1        v2.2s,  v2.2s,  v3.2s
-
-    trn1        v4.2s,  v28.2s, v6.2s
-    trn1        v5.2s,  v29.2s, v7.2s
-    trn1        v20.2s, v30.2s, v22.2s
-    trn1        v21.2s, v31.2s, v23.2s
-1:  // height loop, interpolate xy
-    subs        w15, w15, #2
-    umull       v16.8h, v4.8b,  v0.8b
-    umlal       v16.8h, v20.8b, v2.8b
-    umull       v17.8h, v5.8b,  v0.8b
-    umlal       v17.8h, v21.8b, v2.8b
-
-    ld2        {v28.8b,v29.8b}, [x3], x4
-    transpose   v24.2d, v25.2d, v16.2d, v17.2d
-
-    ext         v6.8b, v28.8b, v6.8b,  #1
-    ext         v7.8b, v29.8b, v7.8b,  #1
-
-    trn1        v4.2s,  v28.2s, v6.2s
-    trn1        v5.2s,  v29.2s, v7.2s
-
-    add         v16.8h, v24.8h, v25.8h
-
-    umull       v18.8h, v20.8b, v0.8b
-    umlal       v18.8h, v4.8b,  v2.8b
-    umull       v19.8h, v21.8b, v0.8b
-    umlal       v19.8h, v5.8b,  v2.8b
-
-    ld2        {v30.8b,v31.8b}, [x3], x4
-    transpose   v26.2d, v27.2d, v18.2d, v19.2d
-
-    ext         v22.8b, v30.8b, v22.8b,  #1
-    ext         v23.8b, v31.8b, v23.8b,  #1
-    trn1        v20.2s, v30.2s, v22.2s
-    trn1        v21.2s, v31.2s, v23.2s
-
-    add         v17.8h, v26.8h, v27.8h
-
-    rshrn       v16.8b, v16.8h, #6
-    rshrn       v17.8b, v17.8h, #6
-
-    //pld         [x3]
-    //pld         [x3, x4]
-
-    st1        {v16.\vsize}[0],   [x0], x2
-    st1        {v16.\vsize}[st2], [x1], x2
-    st1        {v17.\vsize}[0],   [x0], x2
-    st1        {v17.\vsize}[st2], [x1], x2
-    b.gt        1b
-
-    ret
-2:  // dx or dy are 0
-    tst         w11, w11
-    add         w10, w10,  w11
-    dup         v0.8b,  w9
-    dup         v1.8b,  w10
-
-    b.eq        4f
-
-    ld1        {v4.8b}, [x3], x4
-    ld1        {v6.8b}, [x3], x4
-3:  // vertical interpolation loop
-    subs        w15, w15, #2
-    umull       v16.8h, v4.8b,  v0.8b
-    ld1        {v4.8b}, [x3], x4
-    umlal       v16.8h, v6.8b,  v1.8b
-    umull       v17.8h, v6.8b,  v0.8b
-    ld1        {v6.8b}, [x3], x4
-    umlal       v17.8h, v4.8b,  v1.8b
-
-    rshrn       v20.8b, v16.8h, #6      // uvuvuvuv
-    rshrn       v21.8b, v17.8h, #6      // uvuvuvuv
-
-    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
-    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
-
-    //pld         [x3]
-    //pld         [x3, x4]
-
-    st1        {v16.\vsize}[0],   [x0], x2
-    st1        {v16.\vsize}[st2], [x0], x2
-    st1        {v17.\vsize}[0],   [x1], x2
-    st1        {v17.\vsize}[st2], [x1], x2
-    b.gt        3b
-
-    ret
-
-4:  // dy is 0
-    ld1        {v4.8b,v5.8b}, [x3], x4
-    ld1        {v6.8b,v7.8b}, [x3], x4
-
-    ext         v5.8b,  v4.8b,  v5.8b,  #2
-    ext         v7.8b,  v6.8b,  v7.8b,  #2
-5:  // horizontal interpolation loop
-    subs        w15, w15, #2
-    umull       v16.8h, v4.8b,  v0.8b
-    umlal       v16.8h, v5.8b,  v1.8b
-    umull       v17.8h, v6.8b,  v0.8b
-    umlal       v17.8h, v7.8b,  v1.8b
-
-    ld1        {v4.8b,v5.8b}, [x3], x4
-    ld1        {v6.8b,v7.8b}, [x3], x4
-    rshrn       v20.8b, v16.8h, #6
-    rshrn       v21.8b, v17.8h, #6
-    ext         v5.8b,  v4.8b,  v5.8b,  #2
-    ext         v7.8b,  v6.8b,  v7.8b,  #2
-    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
-    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
-
-    //pld         [x3]
-    //pld         [x3, x4]
-
-    st1        {v16.\vsize}[0],   [x0], x2
-    st1        {v16.\vsize}[st2], [x0], x2
-    st1        {v17.\vsize}[0],   [x1], x2
-    st1        {v17.\vsize}[st2], [x1], x2
-    b.gt        5b
-
-    ret
-endfunc
-.endm
-
-    CHROMA_MC 2, h
-    CHROMA_MC 4, s
-
-function mc_chroma_w8_neon
-    CHROMA_MC_START
-    b.eq        2f
-    ld2        {v4.16b,v5.16b}, [x3], x4
-    ld2        {v20.16b,v21.16b}, [x3], x4
-    dup         v0.8b, w9               // cA
-    dup         v1.8b, w10              // cB
-
-    ext         v6.16b, v4.16b, v4.16b, #1
-    ext         v7.16b, v5.16b, v5.16b, #1
-
-    dup         v2.8b, w11              // cC
-    dup         v3.8b, w12              // cD
-
-    ext         v22.16b, v20.16b, v20.16b, #1
-    ext         v23.16b, v21.16b, v21.16b, #1
-
-1:  // height loop, interpolate xy
-    subs        w15, w15, #2
-    umull       v16.8h, v4.8b,  v0.8b
-    umlal       v16.8h, v6.8b,  v1.8b
-    umlal       v16.8h, v20.8b, v2.8b
-    umlal       v16.8h, v22.8b, v3.8b
-
-    umull       v17.8h, v5.8b,  v0.8b
-    umlal       v17.8h, v7.8b,  v1.8b
-    umlal       v17.8h, v21.8b, v2.8b
-    umlal       v17.8h, v23.8b, v3.8b
-
-    ld2        {v4.16b,v5.16b}, [x3], x4
-
-    ext         v6.16b, v4.16b, v4.16b, #1
-    ext         v7.16b, v5.16b, v5.16b, #1
-
-    umull       v18.8h, v20.8b, v0.8b
-    umlal       v18.8h, v22.8b, v1.8b
-    umlal       v18.8h, v4.8b,  v2.8b
-    umlal       v18.8h, v6.8b,  v3.8b
-
-    umull       v19.8h, v21.8b, v0.8b
-    umlal       v19.8h, v23.8b, v1.8b
-    umlal       v19.8h, v5.8b,  v2.8b
-    umlal       v19.8h, v7.8b,  v3.8b
-
-    ld2        {v20.16b,v21.16b}, [x3], x4
-
-    rshrn       v16.8b, v16.8h, #6
-    rshrn       v17.8b, v17.8h, #6
-    rshrn       v18.8b, v18.8h, #6
-    rshrn       v19.8b, v19.8h, #6
-
-    ext         v22.16b, v20.16b, v20.16b, #1
-    ext         v23.16b, v21.16b, v21.16b, #1
-
-    //pld         [x3]
-    //pld         [x3, x4]
-
-    st1        {v16.8b}, [x0], x2
-    st1        {v17.8b}, [x1], x2
-    st1        {v18.8b}, [x0], x2
-    st1        {v19.8b}, [x1], x2
-    b.gt        1b
-
-    ret
-2:  // dx or dy are 0
-    tst         w11, w11
-    add         w10, w10, w11
-    dup         v0.8b, w9
-    dup         v1.8b, w10
-
-    b.eq        4f
-
-    ld2        {v4.8b,v5.8b}, [x3], x4
-    ld2        {v6.8b,v7.8b}, [x3], x4
-3:  // vertical interpolation loop
-    subs        w15, w15, #2
-    umull       v16.8h, v4.8b,  v0.8b //U
-    umlal       v16.8h, v6.8b,  v1.8b
-    umull       v17.8h, v5.8b,  v0.8b //V
-    umlal       v17.8h, v7.8b,  v1.8b
-
-    ld2        {v4.8b,v5.8b}, [x3], x4
-
-    umull       v18.8h, v6.8b,  v0.8b
-    umlal       v18.8h, v4.8b,  v1.8b
-    umull       v19.8h, v7.8b,  v0.8b
-    umlal       v19.8h, v5.8b,  v1.8b
-
-    ld2        {v6.8b,v7.8b}, [x3], x4
-
-    rshrn       v16.8b, v16.8h, #6
-    rshrn       v17.8b, v17.8h, #6
-    rshrn       v18.8b, v18.8h, #6
-    rshrn       v19.8b, v19.8h, #6
-
-    //pld         [x3]
-    //pld         [x3, x4]
-
-    st1        {v16.8b}, [x0], x2
-    st1        {v17.8b}, [x1], x2
-    st1        {v18.8b}, [x0], x2
-    st1        {v19.8b}, [x1], x2
-    b.gt        3b
-
-    ret
-4:  // dy is 0
-    ld2        {v4.16b,v5.16b}, [x3], x4
-    ext         v6.16b, v4.16b, v4.16b, #1
-    ext         v7.16b, v5.16b, v5.16b, #1
-    ld2        {v20.16b,v21.16b}, [x3], x4
-    ext         v22.16b, v20.16b, v20.16b, #1
-    ext         v23.16b, v21.16b, v21.16b, #1
-5:  // horizontal interpolation loop
-    subs        w15, w15, #2
-    umull       v16.8h, v4.8b,  v0.8b //U
-    umlal       v16.8h, v6.8b,  v1.8b
-    umull       v17.8h, v5.8b,  v0.8b //V
-    umlal       v17.8h, v7.8b,  v1.8b
-
-    ld2        {v4.16b,v5.16b}, [x3], x4
-
-    umull       v18.8h, v20.8b, v0.8b
-    umlal       v18.8h, v22.8b, v1.8b
-    umull       v19.8h, v21.8b, v0.8b
-    umlal       v19.8h, v23.8b, v1.8b
-
-    ld2        {v20.16b,v21.16b}, [x3], x4
-
-    rshrn       v16.8b, v16.8h, #6
-    rshrn       v17.8b, v17.8h, #6
-    rshrn       v18.8b, v18.8h, #6
-    rshrn       v19.8b, v19.8h, #6
-
-    ext         v6.16b, v4.16b, v4.16b, #1
-    ext         v7.16b, v5.16b, v5.16b, #1
-    ext         v22.16b, v20.16b, v20.16b, #1
-    ext         v23.16b, v21.16b, v21.16b, #1
-
-    //pld         [x3]
-    //pld         [x3, x4]
-
-    st1        {v16.8b}, [x0], x2
-    st1        {v17.8b}, [x1], x2
-    st1        {v18.8b}, [x0], x2
-    st1        {v19.8b}, [x1], x2
-    b.gt        5b
-
-    ret
-endfunc
-
-//void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-//                  intptr_t stride, int width, int height, int16_t *buf )
-function x264_hpel_filter_neon, export=1
-    ubfm        x9,  x3,  #0,  #3
-    add         w15, w5,  w9
-    sub         x13, x3,  x9            // align src
-    sub         x10, x0,  x9
-    sub         x11, x1,  x9
-    sub         x12, x2,  x9
-    movi        v30.16b,  #5
-    movi        v31.16b,  #20
-1:  // line start
-    mov         x3,  x13
-    mov         x2,  x12
-    mov         x1,  x11
-    mov         x0,  x10
-    add         x7,  x3,  #16           // src pointer next 16b for horiz filter
-    mov         x5,  x15                // restore width
-    sub         x3,  x3,  x4,  lsl #1   // src - 2*stride
-    ld1        {v28.16b}, [x7], #16     // src[16:31]
-
-    add         x9,  x3,  x5            // holds src - 2*stride + width
-
-    ld1        {v16.16b}, [x3], x4      // src-2*stride[0:15]
-    ld1        {v17.16b}, [x3], x4      // src-1*stride[0:15]
-    ld1        {v18.16b}, [x3], x4      // src+0*stride[0:15]
-    ld1        {v19.16b}, [x3], x4      // src+1*stride[0:15]
-    ld1        {v20.16b}, [x3], x4      // src+2*stride[0:15]
-    ld1        {v21.16b}, [x3], x4      // src+3*stride[0:15]
-
-    ext         v22.16b, v7.16b,  v18.16b, #14
-    uaddl       v1.8h,   v16.8b,  v21.8b
-    ext         v26.16b, v18.16b, v28.16b, #3
-    umlsl       v1.8h,   v17.8b,  v30.8b
-    ext         v23.16b, v7.16b,  v18.16b, #15
-    umlal       v1.8h,   v18.8b,  v31.8b
-    ext         v24.16b, v18.16b, v28.16b, #1
-    umlal       v1.8h,   v19.8b,  v31.8b
-    ext         v25.16b, v18.16b, v28.16b, #2
-    umlsl       v1.8h,   v20.8b,  v30.8b
-2:  // next 16 pixel of line
-    subs        x5,  x5,  #16
-    sub         x3,  x9,  x5            // src - 2*stride += 16
-
-    uaddl       v4.8h,  v22.8b,  v26.8b
-    uaddl2      v5.8h,  v22.16b, v26.16b
-    sqrshrun    v6.8b,  v1.8h,   #5
-    umlsl       v4.8h,  v23.8b,  v30.8b
-    umlsl2      v5.8h,  v23.16b, v30.16b
-    umlal       v4.8h,  v18.8b,  v31.8b
-    umlal2      v5.8h,  v18.16b, v31.16b
-    umlal       v4.8h,  v24.8b,  v31.8b
-    umlal2      v5.8h,  v24.16b, v31.16b
-    umlsl       v4.8h,  v25.8b,  v30.8b
-    umlsl2      v5.8h,  v25.16b, v30.16b
-
-    uaddl2      v2.8h,  v16.16b, v21.16b
-    sqrshrun    v4.8b,  v4.8h,   #5
-    mov         v7.16b, v18.16b
-    sqrshrun2   v4.16b, v5.8h,   #5
-
-    umlsl2      v2.8h,  v17.16b, v30.16b
-    ld1        {v16.16b}, [x3],  x4      // src-2*stride[0:15]
-    umlal2      v2.8h,  v18.16b, v31.16b
-    ld1        {v17.16b}, [x3],  x4      // src-1*stride[0:15]
-    umlal2      v2.8h,  v19.16b, v31.16b
-    ld1        {v18.16b}, [x3],  x4      // src+0*stride[0:15]
-    umlsl2      v2.8h,  v20.16b, v30.16b
-    ld1        {v19.16b}, [x3],  x4      // src+1*stride[0:15]
-    st1        {v4.16b},  [x0],  #16
-    sqrshrun2   v6.16b, v2.8h,   #5
-    ld1        {v20.16b}, [x3],  x4      // src+2*stride[0:15]
-    ld1        {v21.16b}, [x3],  x4      // src+3*stride[0:15]
-
-    ext         v22.16b, v0.16b, v1.16b, #12
-    ext         v26.16b, v1.16b, v2.16b, #6
-    ext         v23.16b, v0.16b, v1.16b, #14
-    st1        {v6.16b},  [x1],  #16
-    uaddl       v3.8h,   v16.8b, v21.8b
-    ext         v25.16b, v1.16b, v2.16b, #4
-    umlsl       v3.8h,   v17.8b, v30.8b
-    ext         v24.16b, v1.16b, v2.16b, #2
-
-    umlal       v3.8h,  v18.8b, v31.8b
-    add         v4.8h,  v22.8h, v26.8h
-    umlal       v3.8h,  v19.8b, v31.8b
-    add         v5.8h,  v23.8h, v25.8h
-    umlsl       v3.8h,  v20.8b, v30.8b
-    add         v6.8h,  v24.8h, v1.8h
-
-    ext         v22.16b, v1.16b, v2.16b, #12
-    ext         v26.16b, v2.16b, v3.16b, #6
-    ext         v23.16b, v1.16b, v2.16b, #14
-    ext         v25.16b, v2.16b, v3.16b, #4
-    ext         v24.16b, v2.16b, v3.16b, #2
-
-    add         v22.8h, v22.8h, v26.8h
-    add         v23.8h, v23.8h, v25.8h
-    add         v24.8h, v24.8h, v2.8h
-
-    sub         v4.8h,  v4.8h,  v5.8h   // a-b
-    sub         v5.8h,  v5.8h,  v6.8h   // b-c
-
-    sub         v22.8h, v22.8h, v23.8h  // a-b
-    sub         v23.8h, v23.8h, v24.8h  // b-c
-
-    sshr        v4.8h,  v4.8h,  #2      // (a-b)/4
-    sshr        v22.8h, v22.8h, #2      // (a-b)/4
-    sub         v4.8h,  v4.8h,  v5.8h   // (a-b)/4-b+c
-    sub         v22.8h, v22.8h, v23.8h  // (a-b)/4-b+c
-    sshr        v4.8h,  v4.8h,  #2      // ((a-b)/4-b+c)/4
-    sshr        v22.8h, v22.8h, #2      // ((a-b)/4-b+c)/4
-    add         v4.8h,  v4.8h,  v6.8h   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    add         v22.8h, v22.8h, v24.8h  // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-
-    sqrshrun    v4.8b,   v4.8h,   #6
-    ld1        {v28.16b}, [x7],   #16   // src[16:31]
-    mov         v0.16b,  v2.16b
-    ext         v23.16b, v7.16b,  v18.16b, #15
-    sqrshrun2   v4.16b,  v22.8h,  #6
-    mov         v1.16b,  v3.16b
-    ext         v22.16b, v7.16b,  v18.16b, #14
-    ext         v24.16b, v18.16b, v28.16b, #1
-    ext         v25.16b, v18.16b, v28.16b, #2
-    ext         v26.16b, v18.16b, v28.16b, #3
-
-    st1        {v4.16b}, [x2], #16
-    b.gt        2b
-
-    subs        w6,  w6,  #1
-    add         x10,  x10,  x4
-    add         x11,  x11,  x4
-    add         x12,  x12,  x4
-    add         x13,  x13,  x4
-    b.gt        1b
-
-    ret
-endfunc
-
-// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
-//                         uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
-//                         intptr_t dst_stride, int width, int height )
-function x264_frame_init_lowres_core_neon, export=1
-    ldr         w8,  [sp]
-    sub         x10, x6,  w7, uxtw      // dst_stride - width
-    and         x10, x10, #~15
-
-1:
-    mov         w9,  w7                 // width
-    mov         x11, x0                 // src0
-    add         x12, x0,  x5            // src1 = src0 + src_stride
-    add         x13, x0,  x5,  lsl #1   // src2 = src1 + src_stride
-
-    ld2        {v0.16b,v1.16b}, [x11], #32
-    ld2        {v2.16b,v3.16b}, [x12], #32
-    ld2        {v4.16b,v5.16b}, [x13], #32
-
-    urhadd      v20.16b, v0.16b,  v2.16b    // s0[2x]   + s1[2x]
-    urhadd      v22.16b, v2.16b,  v4.16b    // s1[2x]   + s2[2x]
-2:
-    subs        w9,  w9,  #16
-    urhadd      v21.16b, v1.16b,  v3.16b    // s0[2x+1] + s1[2x+1]
-    urhadd      v23.16b, v3.16b,  v5.16b    // s1[2x+1] + s2[2x+1]
-
-    ld2        {v0.16b,v1.16b}, [x11], #32
-    ld2        {v2.16b,v3.16b}, [x12], #32
-    ld2        {v4.16b,v5.16b}, [x13], #32
-    urhadd      v30.16b, v0.16b,  v2.16b    // loop: s0[2x]   + s1[2x]
-    urhadd      v31.16b, v2.16b,  v4.16b    // loop: s1[2x]   + s2[2x]
-    ext         v24.16b, v20.16b, v30.16b, #1   // s0[2x+2] + s1[2x+2]
-    ext         v25.16b, v22.16b, v31.16b, #1   // s1[2x+2] + s2[2x+2]
-
-    urhadd      v16.16b, v20.16b, v21.16b
-    urhadd      v18.16b, v22.16b, v23.16b
-    urhadd      v17.16b, v21.16b, v24.16b
-    urhadd      v19.16b, v23.16b, v25.16b
-
-    st1        {v16.16b},   [x1],  #16
-    st1        {v18.16b},   [x3],  #16
-    st1        {v17.16b},   [x2],  #16
-    st1        {v19.16b},   [x4],  #16
-    b.le        3f
-
-    subs        w9,  w9,  #16
-    urhadd      v21.16b, v1.16b,  v3.16b    // s0[2x+1] + s1[2x+1]
-    urhadd      v23.16b, v3.16b,  v5.16b    // s1[2x+1] + s2[2x+1]
-
-    ld2        {v0.16b,v1.16b}, [x11], #32
-    ld2        {v2.16b,v3.16b}, [x12], #32
-    ld2        {v4.16b,v5.16b}, [x13], #32
-    urhadd      v20.16b, v0.16b,  v2.16b    // loop: s0[2x]   + s1[2x]
-    urhadd      v22.16b, v2.16b,  v4.16b    // loop: s1[2x]   + s2[2x]
-    ext         v24.16b, v30.16b, v20.16b, #1   // s0[2x+2] + s1[2x+2]
-    ext         v25.16b, v31.16b, v22.16b, #1   // s1[2x+2] + s2[2x+2]
-
-    urhadd      v16.16b, v30.16b, v21.16b
-    urhadd      v18.16b, v31.16b, v23.16b
-    urhadd      v17.16b, v21.16b, v24.16b
-    urhadd      v19.16b, v23.16b, v25.16b
-
-    st1        {v16.16b},   [x1],  #16
-    st1        {v18.16b},   [x3],  #16
-    st1        {v17.16b},   [x2],  #16
-    st1        {v19.16b},   [x4],  #16
-    b.gt        2b
-3:
-    subs        w8,  w8,  #1
-    add         x0,  x0,  x5,  lsl #1
-    add         x1,  x1,  x10
-    add         x2,  x2,  x10
-    add         x3,  x3,  x10
-    add         x4,  x4,  x10
-    b.gt        1b
-
-    ret
-endfunc
-
-function x264_load_deinterleave_chroma_fenc_neon, export=1
-    mov         x4,  #FENC_STRIDE/2
-    b           load_deinterleave_chroma
-endfunc
-
-function x264_load_deinterleave_chroma_fdec_neon, export=1
-    mov         x4,  #FDEC_STRIDE/2
-load_deinterleave_chroma:
-    ld2        {v0.8b,v1.8b}, [x1], x2
-    ld2        {v2.8b,v3.8b}, [x1], x2
-    subs        w3,  w3,  #2
-    st1        {v0.8b}, [x0], x4
-    st1        {v1.8b}, [x0], x4
-    st1        {v2.8b}, [x0], x4
-    st1        {v3.8b}, [x0], x4
-    b.gt        load_deinterleave_chroma
-
-    ret
-endfunc
-
-function x264_plane_copy_core_neon, export=1
-    add         x8,  x4,  #15
-    and         x4,  x8,  #~15
-    sub         x1,  x1,  x4
-    sub         x3,  x3,  x4
-1:
-    mov         w8,  w4
-16:
-    tst         w8,  #16
-    b.eq        32f
-    subs        w8,  w8,  #16
-    ldr         q0,  [x2], #16
-    str         q0,  [x0], #16
-    b.eq        0f
-32:
-    subs        w8,  w8,  #32
-    ldp         q0,  q1,  [x2], #32
-    stp         q0,  q1,  [x0], #32
-    b.gt        32b
-0:
-    subs        w5,  w5,  #1
-    add         x2,  x2,  x3
-    add         x0,  x0,  x1
-    b.gt        1b
-
-    ret
-endfunc
-
-function x264_plane_copy_swap_core_neon, export=1
-    lsl         w4,  w4,  #1
-    sub         x1,  x1,  x4
-    sub         x3,  x3,  x4
-1:
-    mov         w8,  w4
-    tbz         w4,  #4,  32f
-    subs        w8,  w8,  #16
-    ld1         {v0.16b}, [x2], #16
-    rev16       v0.16b, v0.16b
-    st1         {v0.16b}, [x0], #16
-    b.eq        0f
-32:
-    subs        w8,  w8,  #32
-    ld1         {v0.16b,v1.16b}, [x2], #32
-    rev16       v0.16b, v0.16b
-    rev16       v1.16b, v1.16b
-    st1         {v0.16b,v1.16b}, [x0], #32
-    b.gt        32b
-0:
-    subs        w5,  w5,  #1
-    add         x2,  x2,  x3
-    add         x0,  x0,  x1
-    b.gt        1b
-
-    ret
-endfunc
-
-function x264_plane_copy_deinterleave_neon, export=1
-    add         w9,  w6,  #15
-    and         w9,  w9,  #0xfffffff0
-    sub         x1,  x1,  x9
-    sub         x3,  x3,  x9
-    sub         x5,  x5,  x9, lsl #1
-1:
-    ld2        {v0.16b,v1.16b}, [x4], #32
-    subs        w9,  w9,  #16
-    st1        {v0.16b}, [x0],  #16
-    st1        {v1.16b}, [x2],  #16
-    b.gt        1b
-
-    add         x4,  x4,  x5
-    subs        w7,  w7,  #1
-    add         x0,  x0,  x1
-    add         x2,  x2,  x3
-    mov         w9,  w6
-    b.gt       1b
-
-    ret
-endfunc
-
-.macro deinterleave_rgb
-    subs            x11, x11, #8
-    st1            {v0.8b},    [x0], #8
-    st1            {v1.8b},    [x2], #8
-    st1            {v2.8b},    [x4], #8
-    b.gt            1b
-
-    subs            w10, w10, #1
-    add             x0,  x0,  x1
-    add             x2,  x2,  x3
-    add             x4,  x4,  x5
-    add             x6,  x6,  x7
-    mov             x11, x9
-    b.gt            1b
-.endm
-
-function x264_plane_copy_deinterleave_rgb_neon, export=1
-#if SYS_MACOSX
-    ldr             w8,  [sp]
-    ldp             w9,  w10, [sp, #4]
-#else
-    ldr             x8,  [sp]
-    ldp             x9,  x10, [sp, #8]
-#endif
-    cmp             w8,  #3
-    uxtw            x9,  w9
-    add             x11, x9,  #7
-    and             x11, x11, #~7
-    sub             x1,  x1,  x11
-    sub             x3,  x3,  x11
-    sub             x5,  x5,  x11
-    b.ne            4f
-    sub             x7,  x7,  x11, lsl #1
-    sub             x7,  x7,  x11
-1:
-    ld3            {v0.8b,v1.8b,v2.8b}, [x6], #24
-    deinterleave_rgb
-
-    ret
-4:
-    sub             x7,  x7,  x11, lsl #2
-1:
-    ld4            {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
-    deinterleave_rgb
-
-    ret
-endfunc
-
-function x264_plane_copy_interleave_core_neon, export=1
-    add         w9,  w6,  #15
-    and         w9,  w9,  #0xfffffff0
-    sub         x1,  x1,  x9,  lsl #1
-    sub         x3,  x3,  x9
-    sub         x5,  x5,  x9
-1:
-    ld1        {v0.16b}, [x2],  #16
-    ld1        {v1.16b}, [x4],  #16
-    subs        w9,  w9,  #16
-    st2        {v0.16b,v1.16b}, [x0],  #32
-    b.gt        1b
-
-    subs        w7,  w7,  #1
-    add         x0,  x0,  x1
-    add         x2,  x2,  x3
-    add         x4,  x4,  x5
-    mov         w9,  w6
-    b.gt        1b
-
-    ret
-endfunc
-
-function x264_store_interleave_chroma_neon, export=1
-    mov             x5,  #FDEC_STRIDE
-1:
-    ld1        {v0.8b}, [x2], x5
-    ld1        {v1.8b}, [x3], x5
-    ld1        {v2.8b}, [x2], x5
-    ld1        {v3.8b}, [x3], x5
-    subs        w4,  w4,  #2
-    zip1        v4.16b,  v0.16b,  v1.16b
-    zip1        v5.16b,  v2.16b,  v3.16b
-    st1        {v4.16b}, [x0], x1
-    st1        {v5.16b}, [x0], x1
-    b.gt        1b
-
-    ret
-endfunc
-
-.macro integral4h p1, p2
-    ext         v1.8b,  \p1\().8b,  \p2\().8b,  #1
-    ext         v2.8b,  \p1\().8b,  \p2\().8b,  #2
-    ext         v3.8b,  \p1\().8b,  \p2\().8b,  #3
-    uaddl       v0.8h,  \p1\().8b,  v1.8b
-    uaddl       v4.8h,  v2.8b,  v3.8b
-    add         v0.8h,  v0.8h,  v4.8h
-    add         v0.8h,  v0.8h,  v5.8h
-.endm
-
-function integral_init4h_neon, export=1
-    sub         x3,  x0,  x2, lsl #1
-    ld1        {v6.8b,v7.8b}, [x1], #16
-1:
-    subs        x2,  x2,  #16
-    ld1        {v5.8h},  [x3], #16
-    integral4h  v6, v7
-    ld1        {v6.8b},  [x1], #8
-    ld1        {v5.8h},  [x3], #16
-    st1        {v0.8h},  [x0], #16
-    integral4h  v7, v6
-    ld1        {v7.8b},  [x1], #8
-    st1        {v0.8h},  [x0], #16
-    b.gt        1b
-    ret
-endfunc
-
-.macro integral8h p1, p2, s
-    ext         v1.8b,  \p1\().8b,  \p2\().8b,  #1
-    ext         v2.8b,  \p1\().8b,  \p2\().8b,  #2
-    ext         v3.8b,  \p1\().8b,  \p2\().8b,  #3
-    ext         v4.8b,  \p1\().8b,  \p2\().8b,  #4
-    ext         v5.8b,  \p1\().8b,  \p2\().8b,  #5
-    ext         v6.8b,  \p1\().8b,  \p2\().8b,  #6
-    ext         v7.8b,  \p1\().8b,  \p2\().8b,  #7
-    uaddl       v0.8h,  \p1\().8b,  v1.8b
-    uaddl       v2.8h,  v2.8b,  v3.8b
-    uaddl       v4.8h,  v4.8b,  v5.8b
-    uaddl       v6.8h,  v6.8b,  v7.8b
-    add         v0.8h,  v0.8h,  v2.8h
-    add         v4.8h,  v4.8h,  v6.8h
-    add         v0.8h,  v0.8h,  v4.8h
-    add         v0.8h,  v0.8h,  \s\().8h
-.endm
-
-function integral_init8h_neon, export=1
-    sub         x3,  x0,  x2, lsl #1
-    ld1        {v16.8b,v17.8b}, [x1], #16
-1:
-    subs        x2,  x2,  #16
-    ld1        {v18.8h}, [x3], #16
-    integral8h  v16, v17, v18
-    ld1        {v16.8b}, [x1], #8
-    ld1        {v18.8h}, [x3], #16
-    st1        {v0.8h},  [x0], #16
-    integral8h  v17, v16, v18
-    ld1        {v17.8b}, [x1], #8
-    st1        {v0.8h},  [x0], #16
-    b.gt        1b
-    ret
-endfunc
-
-function integral_init4v_neon, export=1
-    mov         x3,  x0
-    add         x4,  x0,  x2,  lsl #3
-    add         x8,  x0,  x2,  lsl #4
-    sub         x2,  x2,  #8
-    ld1        {v20.8h,v21.8h,v22.8h}, [x3], #48
-    ld1        {v16.8h,v17.8h,v18.8h}, [x8], #48
-1:
-    subs        x2,  x2,  #16
-    ld1        {v24.8h,v25.8h}, [x4], #32
-    ext         v0.16b,  v20.16b, v21.16b, #8
-    ext         v1.16b,  v21.16b, v22.16b, #8
-    ext         v2.16b,  v16.16b, v17.16b, #8
-    ext         v3.16b,  v17.16b, v18.16b, #8
-    sub         v24.8h,  v24.8h,  v20.8h
-    sub         v25.8h,  v25.8h,  v21.8h
-    add         v0.8h,   v0.8h,   v20.8h
-    add         v1.8h,   v1.8h,   v21.8h
-    add         v2.8h,   v2.8h,   v16.8h
-    add         v3.8h,   v3.8h,   v17.8h
-    st1        {v24.8h},  [x1], #16
-    st1        {v25.8h},  [x1], #16
-    mov         v20.16b,  v22.16b
-    mov         v16.16b,  v18.16b
-    sub         v0.8h,   v2.8h,   v0.8h
-    sub         v1.8h,   v3.8h,   v1.8h
-    ld1        {v21.8h,v22.8h}, [x3], #32
-    ld1        {v17.8h,v18.8h}, [x8], #32
-    st1        {v0.8h},  [x0], #16
-    st1        {v1.8h},  [x0], #16
-    b.gt        1b
-2:
-    ret
-endfunc
-
-function integral_init8v_neon, export=1
-    add         x2,  x0,  x1,  lsl #4
-    sub         x1,  x1,  #8
-    ands        x3,  x1,  #16 - 1
-    b.eq        1f
-    subs        x1,  x1,  #8
-    ld1        {v0.8h}, [x0]
-    ld1        {v2.8h}, [x2], #16
-    sub         v4.8h,  v2.8h,  v0.8h
-    st1        {v4.8h},  [x0], #16
-    b.le        2f
-1:
-    subs        x1,  x1,  #16
-    ld1        {v0.8h,v1.8h}, [x0]
-    ld1        {v2.8h,v3.8h}, [x2], #32
-    sub         v4.8h,  v2.8h,  v0.8h
-    sub         v5.8h,  v3.8h,  v1.8h
-    st1        {v4.8h},  [x0], #16
-    st1        {v5.8h},  [x0], #16
-    b.gt        1b
-2:
-    ret
-endfunc
-
-function x264_mbtree_propagate_cost_neon, export=1
-    ld1r        {v5.4s},  [x5]
-8:
-    subs        w6,  w6,  #8
-    ld1         {v1.8h},  [x1], #16
-    ld1         {v2.8h},  [x2], #16
-    ld1         {v3.8h},  [x3], #16
-    ld1         {v4.8h},  [x4], #16
-    bic         v3.8h,  #0xc0, lsl #8
-    umin        v3.8h,  v2.8h,  v3.8h
-    umull       v20.4s, v2.4h,  v4.4h   // propagate_intra
-    umull2      v21.4s, v2.8h,  v4.8h   // propagate_intra
-    usubl       v22.4s, v2.4h,  v3.4h   // propagate_num
-    usubl2      v23.4s, v2.8h,  v3.8h   // propagate_num
-    uxtl        v26.4s, v2.4h           // propagate_denom
-    uxtl2       v27.4s, v2.8h           // propagate_denom
-    uxtl        v24.4s, v1.4h
-    uxtl2       v25.4s, v1.8h
-    ucvtf       v20.4s, v20.4s
-    ucvtf       v21.4s, v21.4s
-    ucvtf       v26.4s, v26.4s
-    ucvtf       v27.4s, v27.4s
-    ucvtf       v22.4s, v22.4s
-    ucvtf       v23.4s, v23.4s
-    frecpe      v28.4s, v26.4s
-    frecpe      v29.4s, v27.4s
-    ucvtf       v24.4s, v24.4s
-    ucvtf       v25.4s, v25.4s
-    frecps      v30.4s, v28.4s, v26.4s
-    frecps      v31.4s, v29.4s, v27.4s
-    fmla        v24.4s, v20.4s, v5.4s   // propagate_amount
-    fmla        v25.4s, v21.4s, v5.4s   // propagate_amount
-    fmul        v28.4s, v28.4s, v30.4s
-    fmul        v29.4s, v29.4s, v31.4s
-    fmul        v16.4s, v24.4s, v22.4s
-    fmul        v17.4s, v25.4s, v23.4s
-    fmul        v18.4s, v16.4s, v28.4s
-    fmul        v19.4s, v17.4s, v29.4s
-    fcvtns      v20.4s, v18.4s
-    fcvtns      v21.4s, v19.4s
-    sqxtn       v0.4h,  v20.4s
-    sqxtn2      v0.8h,  v21.4s
-    st1         {v0.8h},  [x0], #16
-    b.gt        8b
-    ret
-endfunc
-
-const pw_0to15, align=5
-    .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-endconst
-
-function x264_mbtree_propagate_list_internal_neon, export=1
-    movrel      x11,  pw_0to15
-    dup         v31.8h,  w4             // bipred_weight
-    movi        v30.8h,  #0xc0, lsl #8
-    ld1         {v29.8h},  [x11] //h->mb.i_mb_x,h->mb.i_mb_y
-    movi        v28.4s,  #4
-    movi        v27.8h,  #31
-    movi        v26.8h,  #32
-    dup         v24.8h,  w5             // mb_y
-    zip1        v29.8h,  v29.8h, v24.8h
-8:
-    subs        w6,  w6,  #8
-    ld1         {v1.8h},  [x1], #16     // propagate_amount
-    ld1         {v2.8h},  [x2], #16     // lowres_cost
-    and         v2.16b, v2.16b, v30.16b
-    cmeq        v25.8h, v2.8h,  v30.8h
-    umull       v16.4s, v1.4h,  v31.4h
-    umull2      v17.4s, v1.8h,  v31.8h
-    rshrn       v16.4h, v16.4s, #6
-    rshrn2      v16.8h, v17.4s, #6
-    bsl         v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
-    //          propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
-    ld1         {v4.8h,v5.8h},  [x0],  #32
-    sshr        v6.8h,  v4.8h,  #5
-    sshr        v7.8h,  v5.8h,  #5
-    add         v6.8h,  v6.8h,  v29.8h
-    add         v29.8h, v29.8h, v28.8h
-    add         v7.8h,  v7.8h,  v29.8h
-    add         v29.8h, v29.8h, v28.8h
-    st1         {v6.8h,v7.8h},  [x3],  #32
-    and         v4.16b, v4.16b, v27.16b
-    and         v5.16b, v5.16b, v27.16b
-    uzp1        v6.8h,  v4.8h,  v5.8h   // x & 31
-    uzp2        v7.8h,  v4.8h,  v5.8h   // y & 31
-    sub         v4.8h,  v26.8h, v6.8h   // 32 - (x & 31)
-    sub         v5.8h,  v26.8h, v7.8h   // 32 - (y & 31)
-    mul         v19.8h, v6.8h,  v7.8h   // idx3weight = y*x;
-    mul         v18.8h, v4.8h,  v7.8h   // idx2weight = y*(32-x);
-    mul         v17.8h, v6.8h,  v5.8h   // idx1weight = (32-y)*x;
-    mul         v16.8h, v4.8h,  v5.8h   // idx0weight = (32-y)*(32-x) ;
-    umull       v6.4s,  v19.4h, v25.4h
-    umull2      v7.4s,  v19.8h, v25.8h
-    umull       v4.4s,  v18.4h, v25.4h
-    umull2      v5.4s,  v18.8h, v25.8h
-    umull       v2.4s,  v17.4h, v25.4h
-    umull2      v3.4s,  v17.8h, v25.8h
-    umull       v0.4s,  v16.4h, v25.4h
-    umull2      v1.4s,  v16.8h, v25.8h
-    rshrn       v19.4h, v6.4s,  #10
-    rshrn2      v19.8h, v7.4s,  #10
-    rshrn       v18.4h, v4.4s,  #10
-    rshrn2      v18.8h, v5.4s,  #10
-    rshrn       v17.4h, v2.4s,  #10
-    rshrn2      v17.8h, v3.4s,  #10
-    rshrn       v16.4h, v0.4s,  #10
-    rshrn2      v16.8h, v1.4s,  #10
-    zip1        v0.8h,  v16.8h, v17.8h
-    zip2        v1.8h,  v16.8h, v17.8h
-    zip1        v2.8h,  v18.8h, v19.8h
-    zip2        v3.8h,  v18.8h, v19.8h
-    st1         {v0.8h,v1.8h},  [x3], #32
-    st1         {v2.8h,v3.8h},  [x3], #32
-    b.ge        8b
-    ret
-endfunc
-
-function x264_memcpy_aligned_neon, export=1
-    tst         x2,  #16
-    b.eq        32f
-    sub         x2,  x2,  #16
-    ldr         q0,  [x1], #16
-    str         q0,  [x0], #16
-32:
-    tst         x2,  #32
-    b.eq        640f
-    sub         x2,  x2,  #32
-    ldp         q0,  q1,  [x1], #32
-    stp         q0,  q1,  [x0], #32
-640:
-    cbz         x2,  1f
-64:
-    subs        x2,  x2,  #64
-    ldp         q0,  q1,  [x1, #32]
-    ldp         q2,  q3,  [x1], #64
-    stp         q0,  q1,  [x0, #32]
-    stp         q2,  q3,  [x0], #64
-    b.gt        64b
-1:
-    ret
-endfunc
-
-function x264_memzero_aligned_neon, export=1
-    movi        v0.16b,  #0
-    movi        v1.16b,  #0
-1:
-    subs        x1,  x1,  #128
-    stp         q0,  q1,  [x0, #96]
-    stp         q0,  q1,  [x0, #64]
-    stp         q0,  q1,  [x0, #32]
-    stp         q0,  q1,  [x0], 128
-    b.gt        1b
-    ret
-endfunc
-
-// void mbtree_fix8_pack( int16_t *dst, float *src, int count )
-function x264_mbtree_fix8_pack_neon, export=1
-    subs        w3,  w2,  #8
-    b.lt        2f
-1:
-    subs        w3,  w3,  #8
-    ld1         {v0.4s,v1.4s}, [x1], #32
-    fcvtzs      v0.4s,  v0.4s,  #8
-    fcvtzs      v1.4s,  v1.4s,  #8
-    sqxtn       v2.4h,  v0.4s
-    sqxtn2      v2.8h,  v1.4s
-    rev16       v3.16b, v2.16b
-    st1         {v3.8h},  [x0], #16
-    b.ge        1b
-2:
-    adds        w3,  w3,  #8
-    b.eq        4f
-3:
-    subs        w3,  w3,  #1
-    ldr         s0, [x1], #4
-    fcvtzs      w4,  s0,  #8
-    rev16       w5,  w4
-    strh        w5, [x0], #2
-    b.gt        3b
-4:
-    ret
-endfunc
-
-// void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
-function x264_mbtree_fix8_unpack_neon, export=1
-    subs        w3,  w2,  #8
-    b.lt        2f
-1:
-    subs        w3,  w3,  #8
-    ld1         {v0.8h}, [x1], #16
-    rev16       v1.16b, v0.16b
-    sxtl        v2.4s,  v1.4h
-    sxtl2       v3.4s,  v1.8h
-    scvtf       v4.4s,  v2.4s,  #8
-    scvtf       v5.4s,  v3.4s,  #8
-    st1         {v4.4s,v5.4s}, [x0], #32
-    b.ge        1b
-2:
-    adds        w3,  w3,  #8
-    b.eq        4f
-3:
-    subs        w3,  w3,  #1
-    ldrh        w4, [x1], #2
-    rev16       w5,  w4
-    sxth        w6,  w5
-    scvtf       s0,  w6,  #8
-    str         s0, [x0], #4
-    b.gt        3b
-4:
-    ret
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/mc-c.c b/android/src/main/libenc/jni/libx264/common/aarch64/mc-c.c
deleted file mode 100755
index 09794d8..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/mc-c.c
+++ /dev/null
@@ -1,281 +0,0 @@
-/*****************************************************************************
- * mc-c.c: aarch64 motion compensation
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "mc.h"
-
-void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
-void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-
-void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
-void x264_memzero_aligned_neon( void *dst, size_t n );
-
-void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-
-void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-
-void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
-                                pixel *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
-                                     pixel *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
-                                         pixel *dstv, intptr_t i_dstv,
-                                         pixel *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
-                                            pixel *dstb, intptr_t i_dstb,
-                                            pixel *dstc, intptr_t i_dstc,
-                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
-                                           pixel *srcu, intptr_t i_srcu,
-                                           pixel *srcv, intptr_t i_srcv, int w, int h );
-
-void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
-
-#define MC_WEIGHT(func)\
-void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-\
-static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
-{\
-    x264_mc_weight_w4##func##_neon,\
-    x264_mc_weight_w4##func##_neon,\
-    x264_mc_weight_w8##func##_neon,\
-    x264_mc_weight_w16##func##_neon,\
-    x264_mc_weight_w16##func##_neon,\
-    x264_mc_weight_w20##func##_neon,\
-};
-
-MC_WEIGHT()
-MC_WEIGHT(_nodenom)
-MC_WEIGHT(_offsetadd)
-MC_WEIGHT(_offsetsub)
-
-void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-
-void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
-void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
-void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
-void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
-void integral_init8v_neon( uint16_t *, intptr_t );
-void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
-
-void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
-
-void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
-void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
-
-#if !HIGH_BIT_DEPTH
-static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
-{
-    if( w->i_scale == 1<<w->i_denom )
-    {
-        if( w->i_offset < 0 )
-        {
-            w->weightfn = x264_mc_offsetsub_wtab_neon;
-            w->cachea[0] = -w->i_offset;
-        }
-        else
-        {
-            w->weightfn = x264_mc_offsetadd_wtab_neon;
-            w->cachea[0] = w->i_offset;
-        }
-    }
-    else if( !w->i_denom )
-        w->weightfn = x264_mc_nodenom_wtab_neon;
-    else
-        w->weightfn = x264_mc_wtab_neon;
-}
-
-static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
-{
-    NULL,
-    x264_pixel_avg2_w4_neon,
-    x264_pixel_avg2_w8_neon,
-    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
-    x264_pixel_avg2_w16_neon,
-    x264_pixel_avg2_w20_neon,
-};
-
-static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
-{
-    NULL,
-    x264_mc_copy_w4_neon,
-    x264_mc_copy_w8_neon,
-    NULL,
-    x264_mc_copy_w16_neon,
-};
-
-static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
-                          uint8_t *src[4], intptr_t i_src_stride,
-                          int mvx, int mvy,
-                          int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if ( (mvy&3) == 3 )             // explict if() to force conditional add
-        src1 += i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_neon[i_width>>2](
-                dst, i_dst_stride, src1, i_src_stride,
-                src2, i_height );
-        if( weight->weightfn )
-            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
-    }
-    else if( weight->weightfn )
-        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
-    else
-        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
-}
-
-static uint8_t *get_ref_neon( uint8_t *dst,   intptr_t *i_dst_stride,
-                              uint8_t *src[4], intptr_t i_src_stride,
-                              int mvx, int mvy,
-                              int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if ( (mvy&3) == 3 )             // explict if() to force conditional add
-        src1 += i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_neon[i_width>>2](
-                dst, *i_dst_stride, src1, i_src_stride,
-                src2, i_height );
-        if( weight->weightfn )
-            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
-        return dst;
-    }
-    else if( weight->weightfn )
-    {
-        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
-        return dst;
-    }
-    else
-    {
-        *i_dst_stride = i_src_stride;
-        return src1;
-    }
-}
-
-void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-                            uint8_t *src, intptr_t stride, int width,
-                            int height, int16_t *buf );
-
-PLANE_COPY(16, neon)
-PLANE_COPY_SWAP(16, neon)
-PLANE_INTERLEAVE(neon)
-#endif // !HIGH_BIT_DEPTH
-
-PROPAGATE_LIST(neon)
-
-void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
-{
-#if !HIGH_BIT_DEPTH
-    if( cpu&X264_CPU_ARMV8 )
-    {
-        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
-        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
-        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
-    }
-
-    if( !(cpu&X264_CPU_NEON) )
-        return;
-
-    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
-    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
-    pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
-    pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
-
-    pf->plane_copy                  = x264_plane_copy_neon;
-    pf->plane_copy_swap             = x264_plane_copy_swap_neon;
-    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
-    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
-    pf->plane_copy_interleave       = x264_plane_copy_interleave_neon;
-
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
-    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
-
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
-    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
-    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
-    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
-    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
-    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
-    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
-    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
-
-    pf->weight       = x264_mc_wtab_neon;
-    pf->offsetadd    = x264_mc_offsetadd_wtab_neon;
-    pf->offsetsub    = x264_mc_offsetsub_wtab_neon;
-    pf->weight_cache = x264_weight_cache_neon;
-
-    pf->mc_chroma = x264_mc_chroma_neon;
-    pf->mc_luma = mc_luma_neon;
-    pf->get_ref = get_ref_neon;
-    pf->hpel_filter = x264_hpel_filter_neon;
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
-
-    pf->integral_init4h = integral_init4h_neon;
-    pf->integral_init8h = integral_init8h_neon;
-    pf->integral_init4v = integral_init4v_neon;
-    pf->integral_init8v = integral_init8v_neon;
-
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
-    pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
-    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
-    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
-
-    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
-    pf->memzero_aligned = x264_memzero_aligned_neon;
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/mc.h b/android/src/main/libenc/jni/libx264/common/aarch64/mc.h
deleted file mode 100755
index 86f2bb9..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/mc.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*****************************************************************************
- * mc.h: aarch64 motion compensation
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_AARCH64_MC_H
-#define X264_AARCH64_MC_H
-
-void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/pixel-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/pixel-a.S
deleted file mode 100755
index 92ec92d..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/pixel-a.S
+++ /dev/null
@@ -1,1406 +0,0 @@
-/*****************************************************************************
- * pixel.S: aarch64 pixel metrics
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-const mask
-.rept 16
-.byte 0xff
-.endr
-.rept 16
-.byte 0x00
-.endr
-endconst
-
-const mask_ac_4_8
-.short 0, -1, -1, -1,  0, -1, -1, -1
-.short 0, -1, -1, -1, -1, -1, -1, -1
-endconst
-
-.macro SAD_START_4
-    ld1        {v1.s}[0], [x2], x3
-    ld1        {v0.s}[0], [x0], x1
-    ld1        {v1.s}[1], [x2], x3
-    ld1        {v0.s}[1], [x0], x1
-    uabdl       v16.8h,  v0.8b,  v1.8b
-.endm
-
-.macro SAD_4
-    ld1        {v1.s}[0], [x2], x3
-    ld1        {v0.s}[0], [x0], x1
-    ld1        {v1.s}[1], [x2], x3
-    ld1        {v0.s}[1], [x0], x1
-    uabal       v16.8h,  v0.8b,  v1.8b
-.endm
-
-.macro SAD_START_8
-    ld1         {v1.8b}, [x2], x3
-    ld1         {v0.8b}, [x0], x1
-    ld1         {v3.8b}, [x2], x3
-    ld1         {v2.8b}, [x0], x1
-    uabdl       v16.8h,  v0.8b,  v1.8b
-    uabdl       v17.8h,  v2.8b,  v3.8b
-.endm
-
-.macro SAD_8
-    ld1         {v1.8b}, [x2], x3
-    ld1         {v0.8b}, [x0], x1
-    ld1         {v3.8b}, [x2], x3
-    ld1         {v2.8b}, [x0], x1
-    uabal       v16.8h,  v0.8b,  v1.8b
-    uabal       v17.8h,  v2.8b,  v3.8b
-.endm
-
-.macro SAD_START_16
-    ld1         {v1.16b}, [x2], x3
-    ld1         {v0.16b}, [x0], x1
-    ld1         {v3.16b}, [x2], x3
-    ld1         {v2.16b}, [x0], x1
-    uabdl       v16.8h,  v0.8b,  v1.8b
-    uabdl2      v17.8h,  v0.16b, v1.16b
-    uabal       v16.8h,  v2.8b,  v3.8b
-    uabal2      v17.8h,  v2.16b, v3.16b
-.endm
-
-.macro SAD_16
-    ld1         {v1.16b}, [x2], x3
-    ld1         {v0.16b}, [x0], x1
-    ld1         {v3.16b}, [x2], x3
-    ld1         {v2.16b}, [x0], x1
-    uabal       v16.8h,  v0.8b,  v1.8b
-    uabal2      v17.8h,  v0.16b, v1.16b
-    uabal       v16.8h,  v2.8b,  v3.8b
-    uabal2      v17.8h,  v2.16b, v3.16b
-.endm
-
-.macro SAD_FUNC w, h, name
-function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
-    SAD_START_\w
-
-.rept \h / 2 - 1
-    SAD_\w
-.endr
-.if \w > 4
-    add         v16.8h,  v16.8h,  v17.8h
-.endif
-    uaddlv      s0,  v16.8h
-    fmov        w0,  s0
-    ret
-endfunc
-.endm
-
-SAD_FUNC  4,  4
-SAD_FUNC  4,  8
-SAD_FUNC  4,  16
-SAD_FUNC  8,  4
-SAD_FUNC  8,  8
-SAD_FUNC  8,  16
-SAD_FUNC  16, 8
-SAD_FUNC  16, 16
-
-.macro SAD_X_4 x, first=uabal
-    ld1        {v0.s}[0], [x0], x7
-    ld1        {v1.s}[0], [x1], x5
-    ld1        {v0.s}[1], [x0], x7
-    ld1        {v1.s}[1], [x1], x5
-    \first      v16.8h,  v1.8b,  v0.8b
-    ld1        {v2.s}[0], [x2], x5
-    ld1        {v2.s}[1], [x2], x5
-    \first      v17.8h,  v2.8b,  v0.8b
-    ld1        {v3.s}[0], [x3], x5
-    ld1        {v3.s}[1], [x3], x5
-    \first      v18.8h,  v3.8b,  v0.8b
-.if \x == 4
-    ld1        {v4.s}[0], [x4], x5
-    ld1        {v4.s}[1], [x4], x5
-    \first      v19.8h,  v4.8b,  v0.8b
-.endif
-.endm
-
-.macro SAD_X_8 x, first=uabal
-    ld1        {v0.8b}, [x0], x7
-    ld1        {v1.8b}, [x1], x5
-    \first      v16.8h,  v1.8b,  v0.8b
-    ld1        {v2.8b}, [x2], x5
-    ld1        {v5.8b}, [x0], x7
-    \first      v17.8h,  v2.8b,  v0.8b
-    ld1        {v3.8b}, [x3], x5
-    ld1        {v1.8b}, [x1], x5
-    \first      v18.8h,  v3.8b,  v0.8b
-    uabal       v16.8h,  v1.8b,  v5.8b
-    ld1        {v2.8b}, [x2], x5
-    ld1        {v3.8b}, [x3], x5
-    uabal       v17.8h,  v2.8b,  v5.8b
-    uabal       v18.8h,  v3.8b,  v5.8b
-.if \x == 4
-    ld1        {v4.8b}, [x4], x5
-    \first      v19.8h,  v4.8b,  v0.8b
-    ld1        {v4.8b}, [x4], x5
-    uabal       v19.8h,  v4.8b,  v5.8b
-.endif
-.endm
-
-.macro SAD_X_16 x, first=uabal
-    ld1        {v0.16b}, [x0], x7
-    ld1        {v1.16b}, [x1], x5
-    \first      v16.8h,  v1.8b,  v0.8b
-    \first\()2  v20.8h,  v1.16b, v0.16b
-    ld1        {v2.16b}, [x2], x5
-    ld1        {v5.16b}, [x0], x7
-    \first      v17.8h,  v2.8b,  v0.8b
-    \first\()2  v21.8h,  v2.16b, v0.16b
-    ld1        {v3.16b}, [x3], x5
-    ld1        {v1.16b}, [x1], x5
-    \first      v18.8h,  v3.8b,  v0.8b
-    \first\()2  v22.8h,  v3.16b, v0.16b
-    uabal       v16.8h,  v1.8b,  v5.8b
-    uabal2      v20.8h,  v1.16b, v5.16b
-    ld1        {v2.16b}, [x2], x5
-    ld1        {v3.16b}, [x3], x5
-    uabal       v17.8h,  v2.8b,  v5.8b
-    uabal2      v21.8h,  v2.16b, v5.16b
-    uabal       v18.8h,  v3.8b,  v5.8b
-    uabal2      v22.8h,  v3.16b, v5.16b
-.if \x == 4
-    ld1        {v4.16b}, [x4], x5
-    \first      v19.8h,  v4.8b,  v0.8b
-    \first\()2  v23.8h,  v4.16b, v0.16b
-    ld1        {v4.16b}, [x4], x5
-    uabal       v19.8h,  v4.8b,  v5.8b
-    uabal2      v23.8h,  v4.16b, v5.16b
-.endif
-.endm
-
-.macro SAD_X_FUNC x, w, h
-function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
-.if \x == 3
-    mov         x6,  x5
-    mov         x5,  x4
-.endif
-    mov         x7,  #FENC_STRIDE
-
-    SAD_X_\w \x, uabdl
-
-.rept \h / 2 - 1
-    SAD_X_\w \x
-.endr
-
-.if \w > 8
-    add         v16.8h, v16.8h, v20.8h
-    add         v17.8h, v17.8h, v21.8h
-    add         v18.8h, v18.8h, v22.8h
-.if \x == 4
-    add         v19.8h, v19.8h, v23.8h
-.endif
-.endif
-// add up the sads
-    uaddlv      s0,  v16.8h
-    uaddlv      s1,  v17.8h
-    uaddlv      s2,  v18.8h
-
-    stp         s0,  s1,  [x6], #8
-.if \x == 3
-    str         s2,  [x6]
-.else
-    uaddlv      s3,  v19.8h
-    stp         s2,  s3,  [x6]
-.endif
-    ret
-endfunc
-.endm
-
-SAD_X_FUNC  3, 4,  4
-SAD_X_FUNC  3, 4,  8
-SAD_X_FUNC  3, 8,  4
-SAD_X_FUNC  3, 8,  8
-SAD_X_FUNC  3, 8,  16
-SAD_X_FUNC  3, 16, 8
-SAD_X_FUNC  3, 16, 16
-
-SAD_X_FUNC  4, 4,  4
-SAD_X_FUNC  4, 4,  8
-SAD_X_FUNC  4, 8,  4
-SAD_X_FUNC  4, 8,  8
-SAD_X_FUNC  4, 8,  16
-SAD_X_FUNC  4, 16, 8
-SAD_X_FUNC  4, 16, 16
-
-
-function x264_pixel_vsad_neon, export=1
-    subs        w2,  w2,  #2
-    ld1        {v0.16b},  [x0],  x1
-    ld1        {v1.16b},  [x0],  x1
-    uabdl       v6.8h,  v0.8b,  v1.8b
-    uabdl2      v7.8h,  v0.16b, v1.16b
-    b.le        2f
-1:
-    subs        w2,  w2,  #2
-    ld1        {v0.16b},  [x0],  x1
-    uabal       v6.8h,  v1.8b,  v0.8b
-    uabal2      v7.8h,  v1.16b, v0.16b
-    ld1        {v1.16b},  [x0],  x1
-    b.lt        2f
-    uabal       v6.8h,  v0.8b,  v1.8b
-    uabal2      v7.8h,  v0.16b, v1.16b
-    b.gt        1b
-2:
-    add         v5.8h,  v6.8h,  v7.8h
-    uaddlv      s0,  v5.8h
-    fmov        w0,  s0
-    ret
-endfunc
-
-function x264_pixel_asd8_neon, export=1
-    sub         w4,  w4,  #2
-    ld1        {v0.8b}, [x0], x1
-    ld1        {v1.8b}, [x2], x3
-    ld1        {v2.8b}, [x0], x1
-    ld1        {v3.8b}, [x2], x3
-    usubl       v16.8h, v0.8b,  v1.8b
-1:
-    subs        w4,  w4,  #2
-    ld1        {v4.8b}, [x0], x1
-    ld1        {v5.8b}, [x2], x3
-    usubl       v17.8h, v2.8b,  v3.8b
-    usubl       v18.8h, v4.8b,  v5.8b
-    add         v16.8h, v16.8h, v17.8h
-    ld1        {v2.8b}, [x0], x1
-    ld1        {v3.8b}, [x2], x3
-    add         v16.8h, v16.8h, v18.8h
-    b.gt        1b
-    usubl       v17.8h, v2.8b,  v3.8b
-    add         v16.8h, v16.8h, v17.8h
-    saddlv      s0,  v16.8h
-    abs         v0.2s,  v0.2s
-    fmov        w0,  s0
-    ret
-endfunc
-
-.macro SSD_START_4
-    ld1        {v16.s}[0], [x0], x1
-    ld1        {v17.s}[0], [x2], x3
-    usubl       v2.8h,  v16.8b,  v17.8b
-    ld1        {v16.s}[0], [x0], x1
-    ld1        {v17.s}[0], [x2], x3
-    smull       v0.4s,  v2.4h,   v2.4h
-.endm
-
-.macro SSD_4
-    usubl       v2.8h,  v16.8b,  v17.8b
-    ld1        {v16.s}[0], [x0], x1
-    ld1        {v17.s}[0], [x2], x3
-    smlal       v0.4s,  v2.4h,   v2.4h
-.endm
-
-.macro SSD_END_4
-    usubl       v2.8h,  v16.8b,  v17.8b
-    smlal       v0.4s,  v2.4h,   v2.4h
-.endm
-
-.macro SSD_START_8
-    ld1        {v16.8b}, [x0], x1
-    ld1        {v17.8b}, [x2], x3
-    usubl       v2.8h,  v16.8b,  v17.8b
-    ld1        {v16.8b}, [x0], x1
-    smull       v0.4s,  v2.4h,   v2.4h
-    ld1        {v17.8b}, [x2], x3
-    smlal2      v0.4s,  v2.8h,   v2.8h
-.endm
-
-.macro SSD_8
-    usubl       v2.8h,  v16.8b,  v17.8b
-    ld1        {v16.8b}, [x0], x1
-    smlal       v0.4s,  v2.4h,   v2.4h
-    ld1        {v17.8b}, [x2], x3
-    smlal2      v0.4s,  v2.8h,   v2.8h
-.endm
-
-.macro SSD_END_8
-    usubl       v2.8h,  v16.8b,  v17.8b
-    smlal       v0.4s,  v2.4h,   v2.4h
-    smlal2      v0.4s,  v2.8h,   v2.8h
-.endm
-
-.macro SSD_START_16
-    ld1        {v16.16b}, [x0], x1
-    ld1        {v17.16b}, [x2], x3
-    usubl       v2.8h,  v16.8b,  v17.8b
-    usubl2      v3.8h,  v16.16b, v17.16b
-    ld1         {v16.16b}, [x0], x1
-    smull       v0.4s,  v2.4h,   v2.4h
-    smull2      v1.4s,  v2.8h,   v2.8h
-    ld1         {v17.16b}, [x2], x3
-    smlal       v0.4s,  v3.4h,   v3.4h
-    smlal2      v1.4s,  v3.8h,   v3.8h
-.endm
-
-.macro SSD_16
-    usubl       v2.8h,  v16.8b,  v17.8b
-    usubl2      v3.8h,  v16.16b, v17.16b
-    ld1         {v16.16b}, [x0], x1
-    smlal       v0.4s,  v2.4h,   v2.4h
-    smlal2      v1.4s,  v2.8h,   v2.8h
-    ld1         {v17.16b}, [x2], x3
-    smlal       v0.4s,  v3.4h,   v3.4h
-    smlal2      v1.4s,  v3.8h,   v3.8h
-.endm
-
-.macro SSD_END_16
-    usubl       v2.8h,  v16.8b,  v17.8b
-    usubl2      v3.8h,  v16.16b, v17.16b
-    smlal       v0.4s,  v2.4h,   v2.4h
-    smlal2      v1.4s,  v2.8h,   v2.8h
-    smlal       v0.4s,  v3.4h,   v3.4h
-    smlal2      v1.4s,  v3.8h,   v3.8h
-    add         v0.4s,  v0.4s,   v1.4s
-.endm
-
-.macro SSD_FUNC w h
-function x264_pixel_ssd_\w\()x\h\()_neon, export=1
-    SSD_START_\w
-.rept \h-2
-    SSD_\w
-.endr
-    SSD_END_\w
-
-    addv        s0,  v0.4s
-    mov         w0,  v0.s[0]
-    ret
-endfunc
-.endm
-
-SSD_FUNC   4, 4
-SSD_FUNC   4, 8
-SSD_FUNC   4, 16
-SSD_FUNC   8, 4
-SSD_FUNC   8, 8
-SSD_FUNC   8, 16
-SSD_FUNC  16, 8
-SSD_FUNC  16, 16
-
-
-function x264_pixel_ssd_nv12_core_neon, export=1
-    sxtw        x8,  w4
-    add         x8,  x8,  #8
-    and         x8,  x8,  #~15
-    movi        v6.2d,  #0
-    movi        v7.2d,  #0
-    sub         x1,  x1,  x8, lsl #1
-    sub         x3,  x3,  x8, lsl #1
-1:
-    subs        w8,  w4,  #16
-    ld2        {v0.8b,v1.8b},   [x0],  #16
-    ld2        {v2.8b,v3.8b},   [x2],  #16
-    ld2        {v24.8b,v25.8b}, [x0],  #16
-    ld2        {v26.8b,v27.8b}, [x2],  #16
-
-    usubl       v16.8h, v0.8b,  v2.8b
-    usubl       v17.8h, v1.8b,  v3.8b
-    smull       v20.4s, v16.4h, v16.4h
-    smull       v21.4s, v17.4h, v17.4h
-    usubl       v18.8h, v24.8b, v26.8b
-    usubl       v19.8h, v25.8b, v27.8b
-    smlal2      v20.4s, v16.8h, v16.8h
-    smlal2      v21.4s, v17.8h, v17.8h
-
-    b.lt        4f
-    b.eq        3f
-2:
-    smlal       v20.4s, v18.4h, v18.4h
-    smlal       v21.4s, v19.4h, v19.4h
-    ld2        {v0.8b,v1.8b}, [x0],  #16
-    ld2        {v2.8b,v3.8b}, [x2],  #16
-    smlal2      v20.4s, v18.8h, v18.8h
-    smlal2      v21.4s, v19.8h, v19.8h
-
-    subs        w8,  w8,  #16
-    usubl       v16.8h, v0.8b,  v2.8b
-    usubl       v17.8h, v1.8b,  v3.8b
-    smlal       v20.4s, v16.4h, v16.4h
-    smlal       v21.4s, v17.4h, v17.4h
-    ld2        {v24.8b,v25.8b}, [x0],  #16
-    ld2        {v26.8b,v27.8b}, [x2],  #16
-    smlal2      v20.4s, v16.8h, v16.8h
-    smlal2      v21.4s, v17.8h, v17.8h
-    b.lt        4f
-
-    usubl       v18.8h, v24.8b, v26.8b
-    usubl       v19.8h, v25.8b, v27.8b
-    b.gt        2b
-3:
-    smlal       v20.4s, v18.4h, v18.4h
-    smlal       v21.4s, v19.4h, v19.4h
-    smlal2      v20.4s, v18.8h, v18.8h
-    smlal2      v21.4s, v19.8h, v19.8h
-4:
-    subs        w5,  w5,  #1
-    uaddw       v6.2d,  v6.2d,  v20.2s
-    uaddw       v7.2d,  v7.2d,  v21.2s
-    add         x0,  x0,  x1
-    add         x2,  x2,  x3
-    uaddw2      v6.2d,  v6.2d,  v20.4s
-    uaddw2      v7.2d,  v7.2d,  v21.4s
-    b.gt        1b
-
-    addp        v6.2d,  v6.2d,  v7.2d
-    st1        {v6.d}[0], [x6]
-    st1        {v6.d}[1], [x7]
-
-    ret
-endfunc
-
-.macro pixel_var_8 h
-function x264_pixel_var_8x\h\()_neon, export=1
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v17.8b}, [x0], x1
-    mov             x2,  \h - 4
-    umull           v1.8h,  v16.8b, v16.8b
-    uxtl            v0.8h,  v16.8b
-    umull           v2.8h,  v17.8b, v17.8b
-    uaddw           v0.8h,  v0.8h,  v17.8b
-    ld1            {v18.8b}, [x0], x1
-    uaddlp          v1.4s,  v1.8h
-    uaddlp          v2.4s,  v2.8h
-    ld1            {v19.8b}, [x0], x1
-
-1:  subs            x2,  x2,  #4
-    uaddw           v0.8h,  v0.8h,  v18.8b
-    umull           v24.8h, v18.8b, v18.8b
-    ld1            {v20.8b}, [x0], x1
-    uaddw           v0.8h,  v0.8h,  v19.8b
-    umull           v25.8h, v19.8b, v19.8b
-    uadalp          v1.4s,  v24.8h
-    ld1            {v21.8b}, [x0], x1
-    uaddw           v0.8h,  v0.8h,  v20.8b
-    umull           v26.8h, v20.8b, v20.8b
-    uadalp          v2.4s,  v25.8h
-    ld1            {v18.8b}, [x0], x1
-    uaddw           v0.8h,  v0.8h,  v21.8b
-    umull           v27.8h, v21.8b, v21.8b
-    uadalp          v1.4s,  v26.8h
-    ld1            {v19.8b}, [x0], x1
-    uadalp          v2.4s,  v27.8h
-    b.gt            1b
-
-    uaddw           v0.8h,  v0.8h,  v18.8b
-    umull           v28.8h, v18.8b, v18.8b
-    uaddw           v0.8h,  v0.8h,  v19.8b
-    umull           v29.8h, v19.8b, v19.8b
-    uadalp          v1.4s,  v28.8h
-    uadalp          v2.4s,  v29.8h
-
-    b               x264_var_end
-endfunc
-.endm
-
-pixel_var_8  8
-pixel_var_8 16
-
-function x264_pixel_var_16x16_neon, export=1
-    ld1            {v16.16b}, [x0],  x1
-    ld1            {v17.16b}, [x0],  x1
-    mov             x2,  #14
-    umull           v1.8h,  v16.8b,  v16.8b
-    umull2          v2.8h,  v16.16b, v16.16b
-    uxtl            v0.8h,  v16.8b
-    uaddlp          v1.4s,  v1.8h
-    uaddlp          v2.4s,  v2.8h
-    uaddw2          v0.8h,  v0.8h,   v16.16b
-
-1:  subs            x2,  x2,  #2
-    ld1            {v18.16b}, [x0],  x1
-    uaddw           v0.8h,  v0.8h,   v17.8b
-    umull           v3.8h,  v17.8b,  v17.8b
-    uaddw2          v0.8h,  v0.8h,   v17.16b
-    umull2          v4.8h,  v17.16b, v17.16b
-    uadalp          v1.4s,  v3.8h
-    uadalp          v2.4s,  v4.8h
-
-    ld1            {v17.16b}, [x0],  x1
-    uaddw           v0.8h,  v0.8h,   v18.8b
-    umull           v5.8h,  v18.8b,  v18.8b
-    uaddw2          v0.8h,  v0.8h,   v18.16b
-    umull2          v6.8h,  v18.16b, v18.16b
-    uadalp          v1.4s,  v5.8h
-    uadalp          v2.4s,  v6.8h
-    b.gt            1b
-
-    uaddw           v0.8h,  v0.8h,   v17.8b
-    umull           v3.8h,  v17.8b,  v17.8b
-    uaddw2          v0.8h,  v0.8h,   v17.16b
-    umull2          v4.8h,  v17.16b, v17.16b
-    uadalp          v1.4s,  v3.8h
-    uadalp          v2.4s,  v4.8h
-endfunc
-
-function x264_var_end
-    add             v1.4s,  v1.4s,  v2.4s
-    uaddlv          s0,  v0.8h
-    uaddlv          d1,  v1.4s
-    mov             w0,  v0.s[0]
-    mov             x1,  v1.d[0]
-    orr             x0,  x0,  x1,  lsl #32
-    ret
-endfunc
-
-
-.macro pixel_var2_8 h
-function x264_pixel_var2_8x\h\()_neon, export=1
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
-    mov             x5,  \h - 4
-    usubl           v6.8h,  v16.8b, v18.8b
-    usubl           v7.8h,  v17.8b, v19.8b
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smull           v2.4s,  v6.4h,  v6.4h
-    smull2          v3.4s,  v6.8h,  v6.8h
-    add             v0.8h,  v6.8h,  v7.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
-
-    usubl           v6.8h,  v16.8b, v18.8b
-
-1:  subs            x5,  x5,  #2
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
-    smlal           v2.4s,  v6.4h,  v6.4h
-    smlal2          v3.4s,  v6.8h,  v6.8h
-    usubl           v7.8h,  v17.8b, v19.8b
-    add             v0.8h,  v0.8h,  v6.8h
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
-    usubl           v6.8h,  v16.8b, v18.8b
-    add             v0.8h,  v0.8h,  v7.8h
-    b.gt            1b
-
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
-    smlal           v2.4s,  v6.4h,  v6.4h
-    smlal2          v3.4s,  v6.8h,  v6.8h
-    usubl           v7.8h,  v17.8b, v19.8b
-    add             v0.8h,  v0.8h,  v6.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    add             v0.8h,  v0.8h,  v7.8h
-    smlal2          v3.4s,  v7.8h,  v7.8h
-
-    saddlv          s0,  v0.8h
-    add             v2.4s,  v2.4s,  v3.4s
-    mov             w0,  v0.s[0]
-    addv            s1,  v2.4s
-    sxtw            x0,  w0
-    mov             w1,  v1.s[0]
-    mul             x0,  x0,  x0
-    str             w1,  [x4]
-    sub             x0,  x1,  x0,  lsr # 6 + (\h >> 4)
-
-    ret
-endfunc
-.endm
-
-pixel_var2_8  8
-pixel_var2_8 16
-
-
-function x264_pixel_satd_4x4_neon, export=1
-    ld1        {v1.s}[0],  [x2], x3
-    ld1        {v0.s}[0],  [x0], x1
-    ld1        {v3.s}[0],  [x2], x3
-    ld1        {v2.s}[0],  [x0], x1
-    ld1        {v1.s}[1],  [x2], x3
-    ld1        {v0.s}[1],  [x0], x1
-    ld1        {v3.s}[1],  [x2], x3
-    ld1        {v2.s}[1],  [x0], x1
-
-    usubl       v0.8h,  v0.8b,  v1.8b
-    usubl       v1.8h,  v2.8b,  v3.8b
-    SUMSUB_AB   v2.8h,  v3.8h,  v0.8h,  v1.8h
-
-    zip1        v0.2d,  v2.2d,  v3.2d
-    zip2        v1.2d,  v2.2d,  v3.2d
-    SUMSUB_AB   v2.8h,  v3.8h,  v0.8h,  v1.8h
-
-    trn1        v0.8h,  v2.8h,  v3.8h
-    trn2        v1.8h,  v2.8h,  v3.8h
-    SUMSUB_AB   v2.8h,  v3.8h,  v0.8h,  v1.8h
-
-    trn1        v0.4s,  v2.4s,  v3.4s
-    trn2        v1.4s,  v2.4s,  v3.4s
-    abs         v0.8h,  v0.8h
-    abs         v1.8h,  v1.8h
-    umax        v0.8h,  v0.8h,  v1.8h
-
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret
-endfunc
-
-function x264_pixel_satd_4x8_neon, export=1
-    ld1        {v1.s}[0],  [x2], x3
-    ld1        {v0.s}[0],  [x0], x1
-    ld1        {v3.s}[0],  [x2], x3
-    ld1        {v2.s}[0],  [x0], x1
-    ld1        {v5.s}[0],  [x2], x3
-    ld1        {v4.s}[0],  [x0], x1
-    ld1        {v7.s}[0],  [x2], x3
-    ld1        {v6.s}[0],  [x0], x1
-    ld1        {v1.s}[1],  [x2], x3
-    ld1        {v0.s}[1],  [x0], x1
-    ld1        {v3.s}[1],  [x2], x3
-    ld1        {v2.s}[1],  [x0], x1
-    ld1        {v5.s}[1],  [x2], x3
-    ld1        {v4.s}[1],  [x0], x1
-    ld1        {v7.s}[1],  [x2], x3
-    ld1        {v6.s}[1],  [x0], x1
-    b           x264_satd_4x8_8x4_end_neon
-endfunc
-
-function x264_pixel_satd_8x4_neon, export=1
-    ld1        {v1.8b},  [x2], x3
-    ld1        {v0.8b},  [x0], x1
-    ld1        {v3.8b},  [x2], x3
-    ld1        {v2.8b},  [x0], x1
-    ld1        {v5.8b},  [x2], x3
-    ld1        {v4.8b},  [x0], x1
-    ld1        {v7.8b},  [x2], x3
-    ld1        {v6.8b},  [x0], x1
-endfunc
-
-function x264_satd_4x8_8x4_end_neon
-    usubl       v0.8h,  v0.8b,  v1.8b
-    usubl       v1.8h,  v2.8b,  v3.8b
-    usubl       v2.8h,  v4.8b,  v5.8b
-    usubl       v3.8h,  v6.8b,  v7.8b
-
-    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
-    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
-
-    SUMSUB_AB   v4.8h,  v6.8h,  v16.8h, v18.8h
-    SUMSUB_AB   v5.8h,  v7.8h,  v17.8h, v19.8h
-
-    trn1        v0.8h,  v4.8h,  v5.8h
-    trn2        v1.8h,  v4.8h,  v5.8h
-    trn1        v2.8h,  v6.8h,  v7.8h
-    trn2        v3.8h,  v6.8h,  v7.8h
-
-    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
-    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
-
-    trn1        v0.4s,  v16.4s, v18.4s
-    trn2        v1.4s,  v16.4s, v18.4s
-    trn1        v2.4s,  v17.4s, v19.4s
-    trn2        v3.4s,  v17.4s, v19.4s
-    abs         v0.8h,  v0.8h
-    abs         v1.8h,  v1.8h
-    abs         v2.8h,  v2.8h
-    abs         v3.8h,  v3.8h
-    umax        v0.8h,  v0.8h,  v1.8h
-    umax        v1.8h,  v2.8h,  v3.8h
-    add         v0.8h,  v0.8h,  v1.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret
-endfunc
-
-function x264_pixel_satd_8x8_neon, export=1
-    mov         x4,  x30
-
-    bl x264_satd_8x8_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v0.8h,  v0.8h,  v1.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret         x4
-endfunc
-
-function x264_pixel_satd_8x16_neon, export=1
-    mov         x4,  x30
-
-    bl x264_satd_8x8_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v30.8h, v0.8h,  v1.8h
-
-    bl x264_satd_8x8_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v31.8h, v0.8h,  v1.8h
-    add         v0.8h,  v30.8h, v31.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret         x4
-endfunc
-
-.macro SUMSUBL_AB  sum, sub, a, b
-    uaddl      \sum,  \a,  \b
-    usubl      \sub,  \a,  \b
-.endm
-
-.macro load_diff_fly_8x8
-    ld1        {v1.8b},  [x2], x3
-    ld1        {v0.8b},  [x0], x1
-    ld1        {v3.8b},  [x2], x3
-    ld1        {v2.8b},  [x0], x1
-    usubl       v16.8h, v0.8b,  v1.8b
-    ld1        {v5.8b},  [x2], x3
-    ld1        {v4.8b},  [x0], x1
-    usubl       v17.8h, v2.8b,  v3.8b
-    ld1        {v7.8b},  [x2], x3
-    ld1        {v6.8b},  [x0], x1
-    usubl       v18.8h, v4.8b,  v5.8b
-    ld1        {v1.8b},  [x2], x3
-    ld1        {v0.8b},  [x0], x1
-    usubl       v19.8h, v6.8b,  v7.8b
-    ld1        {v3.8b},  [x2], x3
-    ld1        {v2.8b},  [x0], x1
-    usubl       v20.8h, v0.8b,  v1.8b
-    ld1        {v5.8b},  [x2], x3
-    ld1        {v4.8b},  [x0], x1
-    usubl       v21.8h, v2.8b,  v3.8b
-    ld1        {v7.8b},  [x2], x3
-    ld1        {v6.8b},  [x0], x1
-
-    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
-    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
-
-    usubl       v22.8h, v4.8b,  v5.8b
-    usubl       v23.8h, v6.8b,  v7.8b
-.endm
-
-.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
-    SUMSUB_AB   \s1, \d1, \a, \b
-    SUMSUB_AB   \s2, \d2, \c, \d
-.endm
-
-.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
-    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
-    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
-.endm
-
-function x264_satd_8x8_neon
-    load_diff_fly_8x8
-endfunc
-
-// one vertical hadamard pass and two horizontal
-function x264_satd_8x4v_8x8h_neon
-    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
-    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
-
-    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h,  v1.8h, v2.8h, v3.8h
-
-    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h
-    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
-    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
-    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h
-
-    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
-    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
-    SUMSUB_AB   v20.8h, v21.8h, v4.8h,  v5.8h
-    SUMSUB_AB   v22.8h, v23.8h, v6.8h,  v7.8h
-
-    transpose   v0.4s,  v2.4s,  v16.4s, v18.4s
-    transpose   v1.4s,  v3.4s,  v17.4s, v19.4s
-    transpose   v4.4s,  v6.4s,  v20.4s, v22.4s
-    transpose   v5.4s,  v7.4s,  v21.4s, v23.4s
-
-    abs         v0.8h,  v0.8h
-    abs         v1.8h,  v1.8h
-    abs         v2.8h,  v2.8h
-    abs         v3.8h,  v3.8h
-    abs         v4.8h,  v4.8h
-    abs         v5.8h,  v5.8h
-    abs         v6.8h,  v6.8h
-    abs         v7.8h,  v7.8h
-
-    umax        v0.8h,  v0.8h,  v2.8h
-    umax        v1.8h,  v1.8h,  v3.8h
-    umax        v2.8h,  v4.8h,  v6.8h
-    umax        v3.8h,  v5.8h,  v7.8h
-
-    ret
-endfunc
-
-function x264_pixel_satd_16x8_neon, export=1
-    mov         x4,  x30
-
-    bl          x264_satd_16x4_neon
-    add         v30.8h, v0.8h,  v1.8h
-    add         v31.8h, v2.8h,  v3.8h
-
-    bl          x264_satd_16x4_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v30.8h, v30.8h, v0.8h
-    add         v31.8h, v31.8h, v1.8h
-
-    add         v0.8h,  v30.8h, v31.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret         x4
-endfunc
-
-function x264_pixel_satd_16x16_neon, export=1
-    mov         x4,  x30
-
-    bl          x264_satd_16x4_neon
-    add         v30.8h, v0.8h,  v1.8h
-    add         v31.8h, v2.8h,  v3.8h
-
-    bl          x264_satd_16x4_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v30.8h, v30.8h, v0.8h
-    add         v31.8h, v31.8h, v1.8h
-
-    bl          x264_satd_16x4_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v30.8h, v30.8h, v0.8h
-    add         v31.8h, v31.8h, v1.8h
-
-    bl          x264_satd_16x4_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    add         v1.8h,  v2.8h,  v3.8h
-    add         v30.8h, v30.8h, v0.8h
-    add         v31.8h, v31.8h, v1.8h
-
-    add         v0.8h,  v30.8h, v31.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret         x4
-endfunc
-
-function x264_satd_16x4_neon
-    ld1        {v1.16b},  [x2], x3
-    ld1        {v0.16b},  [x0], x1
-    ld1        {v3.16b},  [x2], x3
-    ld1        {v2.16b},  [x0], x1
-    usubl       v16.8h, v0.8b,  v1.8b
-    usubl2      v20.8h, v0.16b, v1.16b
-    ld1        {v5.16b},  [x2], x3
-    ld1        {v4.16b},  [x0], x1
-    usubl       v17.8h, v2.8b,  v3.8b
-    usubl2      v21.8h, v2.16b, v3.16b
-    ld1        {v7.16b},  [x2], x3
-    ld1        {v6.16b},  [x0], x1
-
-    usubl       v18.8h, v4.8b,  v5.8b
-    usubl2      v22.8h, v4.16b, v5.16b
-    usubl       v19.8h, v6.8b,  v7.8b
-    usubl2      v23.8h, v6.16b, v7.16b
-
-    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
-    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
-
-    b           x264_satd_8x4v_8x8h_neon
-endfunc
-
-function x264_pixel_satd_4x16_neon, export=1
-    mov         x4,  x30
-    ld1        {v1.s}[0],  [x2], x3
-    ld1        {v0.s}[0],  [x0], x1
-    ld1        {v3.s}[0],  [x2], x3
-    ld1        {v2.s}[0],  [x0], x1
-    ld1        {v5.s}[0],  [x2], x3
-    ld1        {v4.s}[0],  [x0], x1
-    ld1        {v7.s}[0],  [x2], x3
-    ld1        {v6.s}[0],  [x0], x1
-    ld1        {v1.s}[1],  [x2], x3
-    ld1        {v0.s}[1],  [x0], x1
-    ld1        {v3.s}[1],  [x2], x3
-    ld1        {v2.s}[1],  [x0], x1
-    ld1        {v5.s}[1],  [x2], x3
-    ld1        {v4.s}[1],  [x0], x1
-    ld1        {v7.s}[1],  [x2], x3
-    ld1        {v6.s}[1],  [x0], x1
-    usubl       v16.8h, v0.8b,  v1.8b
-    usubl       v17.8h, v2.8b,  v3.8b
-    usubl       v18.8h, v4.8b,  v5.8b
-    usubl       v19.8h, v6.8b,  v7.8b
-    ld1        {v1.s}[0],  [x2], x3
-    ld1        {v0.s}[0],  [x0], x1
-    ld1        {v3.s}[0],  [x2], x3
-    ld1        {v2.s}[0],  [x0], x1
-    ld1        {v5.s}[0],  [x2], x3
-    ld1        {v4.s}[0],  [x0], x1
-    ld1        {v7.s}[0],  [x2], x3
-    ld1        {v6.s}[0],  [x0], x1
-    ld1        {v1.s}[1],  [x2], x3
-    ld1        {v0.s}[1],  [x0], x1
-    ld1        {v3.s}[1],  [x2], x3
-    ld1        {v2.s}[1],  [x0], x1
-    ld1        {v5.s}[1],  [x2], x3
-    ld1        {v4.s}[1],  [x0], x1
-    ld1        {v7.s}[1],  [x2], x3
-    ld1        {v6.s}[1],  [x0], x1
-    usubl       v20.8h, v0.8b,  v1.8b
-    usubl       v21.8h, v2.8b,  v3.8b
-    usubl       v22.8h, v4.8b,  v5.8b
-    usubl       v23.8h, v6.8b,  v7.8b
-
-    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
-    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
-
-    bl          x264_satd_8x4v_8x8h_neon
-
-    add         v30.8h, v0.8h,  v1.8h
-    add         v31.8h, v2.8h,  v3.8h
-    add         v0.8h,  v30.8h, v31.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    ret         x4
-endfunc
-
-function x264_pixel_sa8d_8x8_neon, export=1
-    mov         x4,  x30
-    bl          pixel_sa8d_8x8_neon
-    add         v0.8h,  v0.8h,  v1.8h
-    uaddlv      s0,  v0.8h
-    mov         w0,  v0.s[0]
-    add         w0,  w0,  #1
-    lsr         w0,  w0,  #1
-    ret         x4
-endfunc
-
-function x264_pixel_sa8d_16x16_neon, export=1
-    mov         x4,  x30
-    bl          pixel_sa8d_8x8_neon
-    uaddlp      v30.4s, v0.8h
-    uaddlp      v31.4s, v1.8h
-    bl          pixel_sa8d_8x8_neon
-    uadalp      v30.4s, v0.8h
-    uadalp      v31.4s, v1.8h
-    sub         x0,  x0,  x1,  lsl #4
-    sub         x2,  x2,  x3,  lsl #4
-    add         x0,  x0,  #8
-    add         x2,  x2,  #8
-    bl          pixel_sa8d_8x8_neon
-    uadalp      v30.4s, v0.8h
-    uadalp      v31.4s, v1.8h
-    bl          pixel_sa8d_8x8_neon
-    uadalp      v30.4s, v0.8h
-    uadalp      v31.4s, v1.8h
-    add         v0.4s,  v30.4s, v31.4s
-    addv        s0,  v0.4s
-    mov         w0,  v0.s[0]
-    add         w0,  w0,  #1
-    lsr         w0,  w0,  #1
-    ret         x4
-endfunc
-
-.macro sa8d_satd_8x8 satd=
-function pixel_sa8d_\satd\()8x8_neon
-    load_diff_fly_8x8
-
-    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
-    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
-
-    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h,  v1.8h, v2.8h, v3.8h
-.ifc \satd, satd_
-    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h
-    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
-    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
-    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h
-
-    SUMSUB_AB   v24.8h, v25.8h, v0.8h,  v1.8h
-    SUMSUB_AB   v26.8h, v27.8h, v2.8h,  v3.8h
-    SUMSUB_AB   v0.8h,  v1.8h,  v4.8h,  v5.8h
-    SUMSUB_AB   v2.8h,  v3.8h,  v6.8h,  v7.8h
-
-    transpose   v4.4s,  v6.4s,  v24.4s, v26.4s
-    transpose   v5.4s,  v7.4s,  v25.4s, v27.4s
-    transpose   v24.4s, v26.4s, v0.4s,  v2.4s
-    transpose   v25.4s, v27.4s, v1.4s,  v3.4s
-
-    abs         v0.8h,  v4.8h
-    abs         v1.8h,  v5.8h
-    abs         v2.8h,  v6.8h
-    abs         v3.8h,  v7.8h
-    abs         v4.8h,  v24.8h
-    abs         v5.8h,  v25.8h
-    abs         v6.8h,  v26.8h
-    abs         v7.8h,  v27.8h
-
-    umax        v0.8h,  v0.8h,  v2.8h
-    umax        v1.8h,  v1.8h,  v3.8h
-    umax        v2.8h,  v4.8h,  v6.8h
-    umax        v3.8h,  v5.8h,  v7.8h
-
-    add         v26.8h, v0.8h,  v1.8h
-    add         v27.8h, v2.8h,  v3.8h
-.endif
-
-    SUMSUB_AB   v0.8h,  v16.8h, v16.8h, v20.8h
-    SUMSUB_AB   v1.8h,  v17.8h, v17.8h, v21.8h
-    SUMSUB_AB   v2.8h,  v18.8h, v18.8h, v22.8h
-    SUMSUB_AB   v3.8h,  v19.8h, v19.8h, v23.8h
-
-    transpose   v20.8h, v21.8h, v16.8h, v17.8h
-    transpose   v4.8h,  v5.8h,  v0.8h,  v1.8h
-    transpose   v22.8h, v23.8h, v18.8h, v19.8h
-    transpose   v6.8h,  v7.8h,  v2.8h,  v3.8h
-
-    SUMSUB_AB   v2.8h,  v3.8h,  v20.8h, v21.8h
-    SUMSUB_AB   v24.8h, v25.8h, v4.8h,  v5.8h
-    SUMSUB_AB   v0.8h,  v1.8h,  v22.8h, v23.8h
-    SUMSUB_AB   v4.8h,  v5.8h,  v6.8h,  v7.8h
-
-    transpose   v20.4s, v22.4s, v2.4s,  v0.4s
-    transpose   v21.4s, v23.4s, v3.4s,  v1.4s
-    transpose   v16.4s, v18.4s, v24.4s, v4.4s
-    transpose   v17.4s, v19.4s, v25.4s, v5.4s
-
-    SUMSUB_AB   v0.8h,  v2.8h,  v20.8h, v22.8h
-    SUMSUB_AB   v1.8h,  v3.8h,  v21.8h, v23.8h
-    SUMSUB_AB   v4.8h,  v6.8h,  v16.8h, v18.8h
-    SUMSUB_AB   v5.8h,  v7.8h,  v17.8h, v19.8h
-
-    transpose   v16.2d, v20.2d,  v0.2d,  v4.2d
-    transpose   v17.2d, v21.2d,  v1.2d,  v5.2d
-    transpose   v18.2d, v22.2d,  v2.2d,  v6.2d
-    transpose   v19.2d, v23.2d,  v3.2d,  v7.2d
-
-    abs         v16.8h, v16.8h
-    abs         v20.8h, v20.8h
-    abs         v17.8h, v17.8h
-    abs         v21.8h, v21.8h
-    abs         v18.8h, v18.8h
-    abs         v22.8h, v22.8h
-    abs         v19.8h, v19.8h
-    abs         v23.8h, v23.8h
-
-    umax        v16.8h, v16.8h, v20.8h
-    umax        v17.8h, v17.8h, v21.8h
-    umax        v18.8h, v18.8h, v22.8h
-    umax        v19.8h, v19.8h, v23.8h
-
-    add         v0.8h,  v16.8h, v17.8h
-    add         v1.8h,  v18.8h, v19.8h
-
-    ret
-endfunc
-.endm
-
-sa8d_satd_8x8
-sa8d_satd_8x8 satd_
-
-function x264_pixel_sa8d_satd_16x16_neon, export=1
-    mov         x4,  x30
-    bl          pixel_sa8d_satd_8x8_neon
-    uaddlp      v30.4s, v0.8h
-    uaddlp      v31.4s, v1.8h
-    uaddlp      v28.4s, v26.8h
-    uaddlp      v29.4s, v27.8h
-    bl          pixel_sa8d_satd_8x8_neon
-    uadalp      v30.4s, v0.8h
-    uadalp      v31.4s, v1.8h
-    uadalp      v28.4s, v26.8h
-    uadalp      v29.4s, v27.8h
-    sub         x0,  x0,  x1,  lsl #4
-    sub         x2,  x2,  x3,  lsl #4
-    add         x0,  x0,  #8
-    add         x2,  x2,  #8
-    bl          pixel_sa8d_satd_8x8_neon
-    uadalp      v30.4s, v0.8h
-    uadalp      v31.4s, v1.8h
-    uadalp      v28.4s, v26.8h
-    uadalp      v29.4s, v27.8h
-    bl          pixel_sa8d_satd_8x8_neon
-    uadalp      v30.4s, v0.8h
-    uadalp      v31.4s, v1.8h
-    uadalp      v28.4s, v26.8h
-    uadalp      v29.4s, v27.8h
-    add         v0.4s,  v30.4s, v31.4s  // sa8d
-    add         v1.4s,  v28.4s, v29.4s  // satd
-    addv        s0,  v0.4s
-    addv        s1,  v1.4s
-    urshr       v0.4s,  v0.4s,  #1
-    fmov        w0,  s0
-    fmov        w1,  s1
-    add         x0,  x0,  x1, lsl #32
-    ret         x4
-endfunc
-
-.macro HADAMARD_AC w h
-function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
-    movrel      x5, mask_ac_4_8
-    mov         x4,  x30
-    ld1         {v30.8h,v31.8h}, [x5]
-    movi        v28.16b, #0
-    movi        v29.16b, #0
-
-    bl          x264_hadamard_ac_8x8_neon
-.if \h > 8
-    bl          x264_hadamard_ac_8x8_neon
-.endif
-.if \w > 8
-    sub         x0,  x0,  x1,  lsl #3
-    add         x0,  x0,  #8
-    bl          x264_hadamard_ac_8x8_neon
-.endif
-.if \w * \h == 256
-    sub         x0,  x0,  x1,  lsl #4
-    bl          x264_hadamard_ac_8x8_neon
-.endif
-
-    addv        s1,  v29.4s
-    addv        s0,  v28.4s
-    mov         w1,  v1.s[0]
-    mov         w0,  v0.s[0]
-    lsr         w1,  w1,  #2
-    lsr         w0,  w0,  #1
-    orr         x0,  x0,  x1, lsl #32
-    ret         x4
-endfunc
-.endm
-
-HADAMARD_AC  8, 8
-HADAMARD_AC  8, 16
-HADAMARD_AC 16, 8
-HADAMARD_AC 16, 16
-
-// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
-function x264_hadamard_ac_8x8_neon
-    ld1         {v16.8b}, [x0], x1
-    ld1         {v17.8b}, [x0], x1
-    ld1         {v18.8b}, [x0], x1
-    ld1         {v19.8b}, [x0], x1
-    SUMSUBL_AB  v0.8h,  v1.8h, v16.8b, v17.8b
-    ld1         {v20.8b}, [x0], x1
-    ld1         {v21.8b}, [x0], x1
-    SUMSUBL_AB  v2.8h,  v3.8h, v18.8b, v19.8b
-    ld1         {v22.8b}, [x0], x1
-    ld1         {v23.8b}, [x0], x1
-    SUMSUBL_AB  v4.8h,  v5.8h, v20.8b, v21.8b
-    SUMSUBL_AB  v6.8h,  v7.8h, v22.8b, v23.8b
-
-    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h,  v2.8h,  v1.8h,  v3.8h
-    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h,  v6.8h,  v5.8h,  v7.8h
-
-    transpose   v0.8h,  v1.8h,  v16.8h,  v17.8h
-    transpose   v2.8h,  v3.8h,  v18.8h,  v19.8h
-    transpose   v4.8h,  v5.8h,  v20.8h,  v21.8h
-    transpose   v6.8h,  v7.8h,  v22.8h,  v23.8h
-
-    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
-    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
-    SUMSUB_AB   v20.8h, v21.8h, v4.8h,  v5.8h
-    SUMSUB_AB   v22.8h, v23.8h, v6.8h,  v7.8h
-
-    transpose   v0.4s,  v2.4s,  v16.4s, v18.4s
-    transpose   v1.4s,  v3.4s,  v17.4s, v19.4s
-    transpose   v4.4s,  v6.4s,  v20.4s, v22.4s
-    transpose   v5.4s,  v7.4s,  v21.4s, v23.4s
-
-    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
-    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
-    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h,  v6.8h,  v5.8h,  v7.8h
-
-    abs         v0.8h,  v16.8h
-    abs         v4.8h,  v20.8h
-    abs         v1.8h,  v17.8h
-    abs         v5.8h,  v21.8h
-    abs         v2.8h,  v18.8h
-    abs         v6.8h,  v22.8h
-    abs         v3.8h,  v19.8h
-    abs         v7.8h,  v23.8h
-
-    add         v0.8h,  v0.8h,  v4.8h
-    add         v1.8h,  v1.8h,  v5.8h
-    and         v0.16b, v0.16b, v30.16b
-    add         v2.8h,  v2.8h,  v6.8h
-    add         v3.8h,  v3.8h,  v7.8h
-    add         v0.8h,  v0.8h,  v2.8h
-    add         v1.8h,  v1.8h,  v3.8h
-    uadalp      v28.4s, v0.8h
-    uadalp      v28.4s, v1.8h
-
-    SUMSUB_AB   v6.8h,  v7.8h,  v23.8h, v19.8h
-    SUMSUB_AB   v4.8h,  v5.8h,  v22.8h, v18.8h
-    SUMSUB_AB   v2.8h,  v3.8h,  v21.8h, v17.8h
-    SUMSUB_AB   v1.8h,  v0.8h,  v16.8h,  v20.8h
-
-    transpose   v16.2d, v17.2d,  v6.2d,  v7.2d
-    transpose   v18.2d, v19.2d,  v4.2d,  v5.2d
-    transpose   v20.2d, v21.2d,  v2.2d,  v3.2d
-
-    abs         v16.8h,  v16.8h
-    abs         v17.8h,  v17.8h
-    abs         v18.8h,  v18.8h
-    abs         v19.8h,  v19.8h
-    abs         v20.8h,  v20.8h
-    abs         v21.8h,  v21.8h
-
-    transpose   v7.2d,  v6.2d,  v1.2d,  v0.2d
-
-    umax        v3.8h,  v16.8h,  v17.8h
-    umax        v2.8h,  v18.8h,  v19.8h
-    umax        v1.8h,  v20.8h,  v21.8h
-
-    SUMSUB_AB   v4.8h,  v5.8h,  v7.8h,  v6.8h
-
-    add         v2.8h,  v2.8h,  v3.8h
-    add         v2.8h,  v2.8h,  v1.8h
-    and         v4.16b, v4.16b, v31.16b
-    add         v2.8h,  v2.8h,  v2.8h
-    abs         v5.8h,  v5.8h
-    abs         v4.8h,  v4.8h
-    add         v2.8h,  v2.8h,  v5.8h
-    add         v2.8h,  v2.8h,  v4.8h
-    uadalp      v29.4s, v2.8h
-    ret
-endfunc
-
-
-function x264_pixel_ssim_4x4x2_core_neon, export=1
-    ld1        {v0.8b},  [x0], x1
-    ld1        {v2.8b},  [x2], x3
-    umull       v16.8h, v0.8b,  v0.8b
-    umull       v17.8h, v0.8b,  v2.8b
-    umull       v18.8h, v2.8b,  v2.8b
-
-    ld1        {v28.8b}, [x0], x1
-    ld1        {v29.8b}, [x2], x3
-    umull       v20.8h, v28.8b, v28.8b
-    umull       v21.8h, v28.8b, v29.8b
-    umull       v22.8h, v29.8b, v29.8b
-
-    uaddlp      v16.4s, v16.8h
-    uaddlp      v17.4s, v17.8h
-    uaddl       v0.8h,  v0.8b,  v28.8b
-    uadalp      v16.4s, v18.8h
-    uaddl       v1.8h,  v2.8b,  v29.8b
-
-    ld1        {v26.8b}, [x0], x1
-    ld1        {v27.8b}, [x2], x3
-    umull       v23.8h, v26.8b, v26.8b
-    umull       v24.8h, v26.8b, v27.8b
-    umull       v25.8h, v27.8b, v27.8b
-
-    uadalp      v16.4s, v20.8h
-    uaddw       v0.8h,  v0.8h,  v26.8b
-    uadalp      v17.4s, v21.8h
-    uaddw       v1.8h,  v1.8h,  v27.8b
-    uadalp      v16.4s, v22.8h
-
-    ld1        {v28.8b}, [x0], x1
-    ld1        {v29.8b}, [x2], x3
-    umull       v20.8h, v28.8b, v28.8b
-    umull       v21.8h, v28.8b, v29.8b
-    umull       v22.8h, v29.8b, v29.8b
-
-    uadalp      v16.4s, v23.8h
-    uaddw       v0.8h,  v0.8h,  v28.8b
-    uadalp      v17.4s, v24.8h
-    uaddw       v1.8h,  v1.8h,  v29.8b
-    uadalp      v16.4s, v25.8h
-
-    uadalp      v16.4s, v20.8h
-    uadalp      v17.4s, v21.8h
-    uadalp      v16.4s, v22.8h
-
-    uaddlp      v0.4s,  v0.8h
-    uaddlp      v1.4s,  v1.8h
-
-    addp        v0.4s,  v0.4s,  v0.4s
-    addp        v1.4s,  v1.4s,  v1.4s
-    addp        v2.4s,  v16.4s, v16.4s
-    addp        v3.4s,  v17.4s, v17.4s
-
-    st4        {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
-    ret
-endfunc
-
-function x264_pixel_ssim_end4_neon, export=1
-    mov         x5,  #4
-    ld1        {v16.4s,v17.4s}, [x0], #32
-    ld1        {v18.4s,v19.4s}, [x1], #32
-    mov         w4,  #0x99bb
-    subs        x2,  x5,  w2, uxtw
-    mov         w3,  #416                       // ssim_c1 = .01*.01*255*255*64
-    movk        w4,  #0x03, lsl #16             // ssim_c2 = .03*.03*255*255*64*63
-    add         v0.4s,  v16.4s,  v18.4s
-    add         v1.4s,  v17.4s,  v19.4s
-    add         v0.4s,  v0.4s,  v1.4s
-    ld1        {v20.4s,v21.4s}, [x0], #32
-    ld1        {v22.4s,v23.4s}, [x1], #32
-    add         v2.4s,  v20.4s, v22.4s
-    add         v3.4s,  v21.4s, v23.4s
-    add         v1.4s,  v1.4s,  v2.4s
-    ld1        {v16.4s}, [x0], #16
-    ld1        {v18.4s}, [x1], #16
-    add         v16.4s, v16.4s, v18.4s
-    add         v2.4s,  v2.4s,  v3.4s
-    add         v3.4s,  v3.4s,  v16.4s
-
-    dup         v30.4s, w3
-    dup         v31.4s, w4
-
-    transpose   v4.4s,  v5.4s,  v0.4s,  v1.4s
-    transpose   v6.4s,  v7.4s,  v2.4s,  v3.4s
-    transpose   v0.2d,  v2.2d,  v4.2d,  v6.2d
-    transpose   v1.2d,  v3.2d,  v5.2d,  v7.2d
-
-    mul         v16.4s, v0.4s, v1.4s    // s1*s2
-    mul         v0.4s,  v0.4s, v0.4s
-    mla         v0.4s,  v1.4s, v1.4s    // s1*s1 + s2*s2
-
-    shl         v3.4s,  v3.4s,  #7
-    shl         v2.4s,  v2.4s,  #6
-    add         v1.4s,  v16.4s, v16.4s
-
-    sub         v2.4s,  v2.4s,  v0.4s    // vars
-    sub         v3.4s,  v3.4s,  v1.4s    // covar*2
-    add         v0.4s,  v0.4s,  v30.4s
-    add         v2.4s,  v2.4s,  v31.4s
-    add         v1.4s,  v1.4s,  v30.4s
-    add         v3.4s,  v3.4s,  v31.4s
-
-    scvtf       v0.4s,  v0.4s
-    scvtf       v2.4s,  v2.4s
-    scvtf       v1.4s,  v1.4s
-    scvtf       v3.4s,  v3.4s
-
-    fmul        v0.4s,  v0.4s,  v2.4s
-    fmul        v1.4s,  v1.4s,  v3.4s
-
-    fdiv        v0.4s,  v1.4s,  v0.4s
-
-    b.eq        1f
-    movrel      x3,  mask
-    add         x3,  x3,  x2,  lsl #2
-    ld1        {v29.4s}, [x3]
-    and         v0.16b, v0.16b, v29.16b
-1:
-    faddp       v0.4s,  v0.4s,  v0.4s
-    faddp       s0,  v0.2s
-    ret
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/pixel.h b/android/src/main/libenc/jni/libx264/common/aarch64/pixel.h
deleted file mode 100755
index 5e7eaf0..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/pixel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*****************************************************************************
- * pixel.h: aarch64 pixel metrics
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_AARCH64_PIXEL_H
-#define X264_AARCH64_PIXEL_H
-
-#define DECL_PIXELS( ret, name, suffix, args ) \
-    ret x264_pixel_##name##_16x16_##suffix args;\
-    ret x264_pixel_##name##_16x8_##suffix args;\
-    ret x264_pixel_##name##_8x16_##suffix args;\
-    ret x264_pixel_##name##_8x8_##suffix args;\
-    ret x264_pixel_##name##_8x4_##suffix args;\
-    ret x264_pixel_##name##_4x16_##suffix args;\
-    ret x264_pixel_##name##_4x8_##suffix args;\
-    ret x264_pixel_##name##_4x4_##suffix args;\
-
-#define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
-
-#define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
-
-DECL_X1( sad, neon )
-DECL_X4( sad, neon )
-DECL_X1( satd, neon )
-DECL_X1( ssd, neon )
-
-
-void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
-
-int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
-
-int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
-uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
-
-uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
-uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
-uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-
-uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
-uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
-uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
-uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
-
-void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
-                                      const uint8_t *, intptr_t,
-                                      int sums[2][4] );
-float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
-
-int x264_pixel_asd8_neon( uint8_t *, intptr_t,  uint8_t *, intptr_t, int );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/predict-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/predict-a.S
deleted file mode 100755
index 0bfb9b4..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/predict-a.S
+++ /dev/null
@@ -1,904 +0,0 @@
-/*****************************************************************************
- * predict.S: aarch64 intra prediction
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Mans Rullgard <mans@mansr.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-const p8weight, align=4
-    .short      1, 2, 3, 4, 1, 2, 3, 4
-endconst
-const p16weight, align=4
-    .short      1, 2, 3, 4, 5, 6, 7, 8
-endconst
-
-.macro ldcol.8  vd,  xn,  xm,  n=8,  hi=0
-.if \n == 8 || \hi == 0
-    ld1        {\vd\().b}[0], [\xn], \xm
-    ld1        {\vd\().b}[1], [\xn], \xm
-    ld1        {\vd\().b}[2], [\xn], \xm
-    ld1        {\vd\().b}[3], [\xn], \xm
-.endif
-.if \n == 8 || \hi == 1
-    ld1        {\vd\().b}[4], [\xn], \xm
-    ld1        {\vd\().b}[5], [\xn], \xm
-    ld1        {\vd\().b}[6], [\xn], \xm
-    ld1        {\vd\().b}[7], [\xn], \xm
-.endif
-.endm
-
-.macro ldcol.16  vd,  xn,  xm
-    ldcol.8     \vd, \xn, \xm
-    ld1        {\vd\().b}[ 8], [\xn], \xm
-    ld1        {\vd\().b}[ 9], [\xn], \xm
-    ld1        {\vd\().b}[10], [\xn], \xm
-    ld1        {\vd\().b}[11], [\xn], \xm
-    ld1        {\vd\().b}[12], [\xn], \xm
-    ld1        {\vd\().b}[13], [\xn], \xm
-    ld1        {\vd\().b}[14], [\xn], \xm
-    ld1        {\vd\().b}[15], [\xn], \xm
-.endm
-
-
-function x264_predict_4x4_h_aarch64, export=1
-    ldrb    w1,  [x0, #0*FDEC_STRIDE-1]
-    mov     w5,  #0x01010101
-    ldrb    w2,  [x0, #1*FDEC_STRIDE-1]
-    ldrb    w3,  [x0, #2*FDEC_STRIDE-1]
-    mul     w1,  w1,  w5
-    ldrb    w4,  [x0, #3*FDEC_STRIDE-1]
-    mul     w2,  w2,  w5
-    str     w1,  [x0, #0*FDEC_STRIDE]
-    mul     w3,  w3,  w5
-    str     w2,  [x0, #1*FDEC_STRIDE]
-    mul     w4,  w4,  w5
-    str     w3,  [x0, #2*FDEC_STRIDE]
-    str     w4,  [x0, #3*FDEC_STRIDE]
-    ret
-endfunc
-
-function x264_predict_4x4_v_aarch64, export=1
-    ldr     w1,  [x0, #0 - 1 * FDEC_STRIDE]
-    str     w1,  [x0, #0 + 0 * FDEC_STRIDE]
-    str     w1,  [x0, #0 + 1 * FDEC_STRIDE]
-    str     w1,  [x0, #0 + 2 * FDEC_STRIDE]
-    str     w1,  [x0, #0 + 3 * FDEC_STRIDE]
-    ret
-endfunc
-
-function x264_predict_4x4_dc_neon, export=1
-    sub         x1,  x0,  #FDEC_STRIDE
-    ldrb        w4,  [x0, #-1 + 0 * FDEC_STRIDE]
-    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
-    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
-    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
-    add         w4,  w4,  w5
-    ldr         s0, [x1]
-    add         w6,  w6,  w7
-    uaddlv      h0,  v0.8b
-    add         w4,  w4,  w6
-    dup         v0.4h,  v0.h[0]
-    dup         v1.4h,  w4
-    add         v0.4h,  v0.4h,  v1.4h
-    rshrn       v0.8b,  v0.8h,  #3
-    str         s0,  [x0]
-    str         s0,  [x0, #1 * FDEC_STRIDE]
-    str         s0,  [x0, #2 * FDEC_STRIDE]
-    str         s0,  [x0, #3 * FDEC_STRIDE]
-    ret
-endfunc
-
-function x264_predict_4x4_dc_top_neon, export=1
-    sub         x1,  x0,  #FDEC_STRIDE
-    ldr         s0, [x1]
-    uaddlv      h0,  v0.8b
-    dup         v0.4h,  v0.h[0]
-    rshrn       v0.8b,  v0.8h,  #2
-    str         s0,  [x0]
-    str         s0,  [x0, #1 * FDEC_STRIDE]
-    str         s0,  [x0, #2 * FDEC_STRIDE]
-    str         s0,  [x0, #3 * FDEC_STRIDE]
-    ret
-    ret
-endfunc
-
-function x264_predict_4x4_ddr_neon, export=1
-    sub         x1,  x0,  #FDEC_STRIDE+1
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b}, [x1], x7            // # -FDEC_STRIDE-1
-    ld1r       {v1.8b}, [x1], x7            // #0*FDEC_STRIDE-1
-    ld1r       {v2.8b}, [x1], x7            // #1*FDEC_STRIDE-1
-    ext         v0.8b,  v1.8b,  v0.8b,  #7
-    ld1r       {v3.8b}, [x1], x7            // #2*FDEC_STRIDE-1
-    ext         v0.8b,  v2.8b,  v0.8b,  #7  // a
-    ld1r       {v4.8b}, [x1], x7            // #3*FDEC_STRIDE-1
-    ext         v1.8b,  v3.8b,  v0.8b,  #7  // b
-    ext         v2.8b,  v4.8b,  v1.8b,  #7  // c
-    uaddl       v0.8h,  v0.8b,  v1.8b
-    uaddl       v1.8h,  v1.8b,  v2.8b
-    add         v0.8h,  v0.8h,  v1.8h
-    rshrn       v0.8b,  v0.8h,  #2
-
-    ext         v3.8b,  v0.8b, v0.8b,  #3
-    ext         v2.8b,  v0.8b, v0.8b,  #2
-    ext         v1.8b,  v0.8b, v0.8b,  #1
-
-    str         s3,  [x0], #FDEC_STRIDE
-    str         s2,  [x0], #FDEC_STRIDE
-    str         s1,  [x0], #FDEC_STRIDE
-    str         s0,  [x0]
-    ret
-endfunc
-
-function x264_predict_4x4_ddl_neon, export=1
-    sub         x0,  x0,  #FDEC_STRIDE
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b}, [x0],  x7
-    dup         v3.8b,  v0.b[7]
-    ext         v1.8b,  v0.8b,  v0.8b,  #1
-    ext         v2.8b,  v0.8b,  v3.8b,  #2
-    uhadd       v0.8b,  v0.8b,  v2.8b
-    urhadd      v0.8b,  v0.8b,  v1.8b
-    str         s0,  [x0], #FDEC_STRIDE
-    ext         v1.8b,  v0.8b,  v0.8b,  #1
-    ext         v2.8b,  v0.8b,  v0.8b,  #2
-    str         s1,  [x0], #FDEC_STRIDE
-    ext         v3.8b,  v0.8b,  v0.8b,  #3
-    str         s2,  [x0], #FDEC_STRIDE
-    str         s3,  [x0]
-    ret
-endfunc
-
-function x264_predict_8x8_dc_neon, export=1
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.16b}, [x1], #16
-    ld1        {v1.8b},  [x1]
-    ext         v0.16b, v0.16b, v0.16b, #7
-    uaddlv      h1,  v1.8b
-    uaddlv      h0,  v0.8b
-    add         v0.8h,  v0.8h,  v1.8h
-    dup         v0.8h,  v0.h[0]
-    rshrn       v0.8b,  v0.8h,  #4
-.rept 8
-    st1        {v0.8b}, [x0], x7
-.endr
-    ret
-endfunc
-
-function x264_predict_8x8_h_neon, export=1
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v16.16b}, [x1]
-    dup         v0.8b, v16.b[14]
-    dup         v1.8b, v16.b[13]
-    st1        {v0.8b}, [x0], x7
-    dup         v2.8b, v16.b[12]
-    st1        {v1.8b}, [x0], x7
-    dup         v3.8b, v16.b[11]
-    st1        {v2.8b}, [x0], x7
-    dup         v4.8b, v16.b[10]
-    st1        {v3.8b}, [x0], x7
-    dup         v5.8b, v16.b[9]
-    st1        {v4.8b}, [x0], x7
-    dup         v6.8b, v16.b[8]
-    st1        {v5.8b}, [x0], x7
-    dup         v7.8b, v16.b[7]
-    st1        {v6.8b}, [x0], x7
-    st1        {v7.8b}, [x0], x7
-    ret
-endfunc
-
-function x264_predict_8x8_v_neon, export=1
-    add         x1,  x1,  #16
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b}, [x1]
-.rept 8
-    st1        {v0.8b}, [x0], x7
-.endr
-    ret
-endfunc
-
-function x264_predict_8x8_ddl_neon, export=1
-    add         x1,  x1,  #16
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.16b}, [x1]
-    movi        v3.16b, #0
-    dup         v2.16b, v0.b[15]
-    ext         v4.16b, v3.16b, v0.16b, #15
-    ext         v2.16b, v0.16b, v2.16b, #1
-    uhadd       v4.16b, v4.16b, v2.16b
-    urhadd      v0.16b, v0.16b, v4.16b
-    ext         v1.16b, v0.16b, v0.16b, #1
-    ext         v2.16b, v0.16b, v0.16b, #2
-    st1        {v1.8b}, [x0], x7
-    ext         v3.16b, v0.16b, v0.16b, #3
-    st1        {v2.8b}, [x0], x7
-    ext         v4.16b, v0.16b, v0.16b, #4
-    st1        {v3.8b}, [x0], x7
-    ext         v5.16b, v0.16b, v0.16b, #5
-    st1        {v4.8b}, [x0], x7
-    ext         v6.16b, v0.16b, v0.16b, #6
-    st1        {v5.8b}, [x0], x7
-    ext         v7.16b, v0.16b, v0.16b, #7
-    st1        {v6.8b}, [x0], x7
-    ext         v0.16b, v0.16b, v0.16b, #8
-    st1        {v7.8b}, [x0], x7
-    st1        {v0.8b}, [x0], x7
-    ret
-endfunc
-
-function x264_predict_8x8_ddr_neon, export=1
-    ld1        {v0.16b,v1.16b}, [x1]
-    ext         v2.16b, v0.16b, v1.16b, #7
-    ext         v4.16b, v0.16b, v1.16b, #9
-    ext         v3.16b, v0.16b, v1.16b, #8
-
-    uhadd       v2.16b, v2.16b, v4.16b
-    urhadd      v7.16b, v3.16b, v2.16b
-
-    add         x0,  x0,  #7*FDEC_STRIDE
-    mov         x7,  #-1*FDEC_STRIDE
-
-    ext         v6.16b, v7.16b, v7.16b, #1
-    st1        {v7.8b},  [x0], x7
-    ext         v5.16b, v7.16b, v7.16b, #2
-    st1        {v6.8b},  [x0], x7
-    ext         v4.16b, v7.16b, v7.16b, #3
-    st1        {v5.8b},  [x0], x7
-    ext         v3.16b, v7.16b, v7.16b, #4
-    st1        {v4.8b},  [x0], x7
-    ext         v2.16b, v7.16b, v7.16b, #5
-    st1        {v3.8b},  [x0], x7
-    ext         v1.16b, v7.16b, v7.16b, #6
-    st1        {v2.8b},  [x0], x7
-    ext         v0.16b, v7.16b, v7.16b, #7
-    st1        {v1.8b},  [x0], x7
-    st1        {v0.8b},  [x0], x7
-    ret
-endfunc
-
-function x264_predict_8x8_vl_neon, export=1
-    add         x1,  x1,  #16
-    mov         x7, #FDEC_STRIDE
-
-    ld1        {v0.16b}, [x1]
-    ext         v1.16b, v1.16b, v0.16b, #15
-    ext         v2.16b, v0.16b, v2.16b, #1
-
-    uhadd       v1.16b, v1.16b, v2.16b
-    urhadd      v3.16b, v0.16b, v2.16b
-
-    urhadd      v0.16b, v0.16b, v1.16b
-
-    ext        v4.16b, v0.16b, v0.16b, #1
-    st1        {v3.8b}, [x0], x7
-    ext        v5.16b, v3.16b, v3.16b, #1
-    st1        {v4.8b}, [x0], x7
-    ext        v6.16b, v0.16b, v0.16b, #2
-    st1        {v5.8b}, [x0], x7
-    ext        v7.16b, v3.16b, v3.16b, #2
-    st1        {v6.8b}, [x0], x7
-    ext        v4.16b, v0.16b, v0.16b, #3
-    st1        {v7.8b}, [x0], x7
-    ext        v5.16b, v3.16b, v3.16b, #3
-    st1        {v4.8b}, [x0], x7
-    ext        v6.16b, v0.16b, v0.16b, #4
-    st1        {v5.8b}, [x0], x7
-    st1        {v6.8b}, [x0], x7
-    ret
-endfunc
-
-function x264_predict_8x8_vr_neon, export=1
-    add         x1,  x1,  #8
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v2.16b}, [x1]
-
-    ext         v1.16b, v2.16b, v2.16b, #14
-    ext         v0.16b, v2.16b, v2.16b, #15
-
-    uhadd       v3.16b, v2.16b, v1.16b
-    urhadd      v2.16b, v2.16b, v0.16b
-    urhadd      v0.16b, v0.16b, v3.16b
-
-    ext         v1.16b, v2.16b, v2.16b, #8
-    uzp1        v2.8b,  v0.8b,  v0.8b
-    uzp2        v3.8b,  v0.8b,  v0.8b
-    ext         v0.16b, v0.16b, v0.16b, #8
-
-    st1        {v1.8b}, [x0], x7
-    st1        {v0.8b}, [x0], x7
-    ext         v4.8b, v3.8b, v1.8b, #7
-    ext         v5.8b, v2.8b, v0.8b, #7
-    st1        {v4.8b}, [x0], x7
-    st1        {v5.8b}, [x0], x7
-    ext         v6.8b, v3.8b, v1.8b, #6
-    ext         v7.8b, v2.8b, v0.8b, #6
-    st1        {v6.8b}, [x0], x7
-    st1        {v7.8b}, [x0], x7
-    ext         v1.8b, v3.8b, v1.8b, #5
-    ext         v0.8b, v2.8b, v0.8b, #5
-    st1        {v1.8b}, [x0], x7
-    st1        {v0.8b}, [x0], x7
-    ret
-endfunc
-
-function x264_predict_8x8_hd_neon, export=1
-    add         x1,  x1,  #7
-    mov         x7, #FDEC_STRIDE
-
-    ld1        {v1.16b}, [x1]
-    ext         v3.16b, v1.16b, v1.16b, #1
-    ext         v2.16b, v1.16b, v1.16b, #2
-
-    urhadd      v4.16b, v1.16b, v3.16b
-
-    uhadd       v1.16b, v1.16b, v2.16b
-    urhadd      v0.16b, v1.16b, v3.16b
-
-    zip1        v16.8b, v4.8b,  v0.8b
-    zip2        v17.8b, v4.8b,  v0.8b
-    ext         v7.16b, v0.16b, v0.16b, #8
-
-    ext         v0.8b,  v17.8b, v7.8b,  #6
-    ext         v1.8b,  v17.8b, v7.8b,  #4
-    st1        {v0.8b},  [x0], x7
-    ext         v2.8b,  v17.8b, v7.8b,  #2
-    st1        {v1.8b},  [x0], x7
-    st1        {v2.8b},  [x0], x7
-    ext         v3.8b,  v16.8b, v17.8b, #6
-    st1        {v17.8b}, [x0], x7
-    ext         v4.8b,  v16.8b, v17.8b, #4
-    st1        {v3.8b},  [x0], x7
-    ext         v5.8b,  v16.8b, v17.8b, #2
-    st1        {v4.8b},  [x0], x7
-    st1        {v5.8b},  [x0], x7
-    st1        {v16.8b}, [x0], x7
-
-    ret
-endfunc
-
-function x264_predict_8x8_hu_neon, export=1
-    add         x1,  x1,  #7
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v7.8b}, [x1]
-    dup         v6.8b,  v7.b[0]
-    rev64       v7.8b,  v7.8b
-
-    ext         v4.8b,  v7.8b,  v6.8b,  #2
-    ext         v2.8b,  v7.8b,  v6.8b,  #1
-
-    uhadd       v5.8b,  v7.8b,  v4.8b
-    urhadd      v0.8b,  v2.8b,  v7.8b
-    urhadd      v1.8b,  v5.8b,  v2.8b
-
-    zip1        v16.8b, v0.8b,  v1.8b
-    zip2        v17.8b, v0.8b,  v1.8b
-
-    dup         v18.4h, v17.h[3]
-
-    ext         v0.8b,  v16.8b, v17.8b, #2
-    ext         v1.8b,  v16.8b, v17.8b, #4
-    ext         v2.8b,  v16.8b, v17.8b, #6
-    st1        {v16.8b}, [x0], x7
-    st1        {v0.8b},  [x0], x7
-    st1        {v1.8b},  [x0], x7
-    st1        {v2.8b},  [x0], x7
-
-    ext         v4.8b,  v17.8b, v18.8b, #2
-    ext         v5.8b,  v17.8b, v18.8b, #4
-    ext         v6.8b,  v17.8b, v18.8b, #6
-    st1        {v17.8b}, [x0], x7
-    st1        {v4.8b},  [x0], x7
-    st1        {v5.8b},  [x0], x7
-    st1        {v6.8b},  [x0]
-    ret
-endfunc
-
-
-function x264_predict_8x8c_dc_top_neon, export=1
-    sub         x2,  x0,  #FDEC_STRIDE
-    mov         x1,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x2]
-    uaddlp      v0.4h,  v0.8b
-    addp        v0.4h,  v0.4h,  v0.4h
-    rshrn       v0.8b,  v0.8h,  #2
-    dup         v3.8b,  v0.b[1]
-    dup         v2.8b,  v0.b[0]
-    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
-    b           pred8x8c_dc_end
-endfunc
-
-function x264_predict_8x8c_dc_left_neon, export=1
-    ldrb        w2,  [x0, #0 * FDEC_STRIDE - 1]
-    ldrb        w3,  [x0, #1 * FDEC_STRIDE - 1]
-    ldrb        w4,  [x0, #2 * FDEC_STRIDE - 1]
-    ldrb        w5,  [x0, #3 * FDEC_STRIDE - 1]
-    mov         x1,  #FDEC_STRIDE
-    add         w2,  w2,  w3
-    add         w3,  w4,  w5
-    ldrb        w6,  [x0, #4 * FDEC_STRIDE - 1]
-    ldrb        w7,  [x0, #5 * FDEC_STRIDE - 1]
-    ldrb        w8,  [x0, #6 * FDEC_STRIDE - 1]
-    ldrb        w9,  [x0, #7 * FDEC_STRIDE - 1]
-    add         w6,  w6,  w7
-    add         w7,  w8,  w9
-    add         w2,  w2,  w3
-    add         w6,  w6,  w7
-    dup         v0.8h,  w2
-    dup         v1.8h,  w6
-    rshrn       v0.8b,  v0.8h,  #2
-    rshrn       v1.8b,  v1.8h,  #2
-    b           pred8x8c_dc_end
-endfunc
-
-function x264_predict_8x8c_dc_neon, export=1
-    mov         x1,  #FDEC_STRIDE
-    sub         x2,  x0,  #FDEC_STRIDE
-    ldrb        w10, [x0, #0 * FDEC_STRIDE - 1]
-    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
-    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
-    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
-    add         w10, w10, w11
-    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
-    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
-    add         w12, w12, w13
-    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
-    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
-    add         w4,  w4,  w5
-    add         w6,  w6,  w7
-    add         w10, w10, w12, lsl #16
-    add         w4,  w4,  w6,  lsl #16
-    ld1        {v0.8b},  [x2]
-    add         x10, x10, x4,  lsl #32
-    uaddlp      v0.4h,  v0.8b  // s0, s1
-    mov         v1.d[0],  x10  // s2, s3
-    add         v3.4h,  v0.4h,  v1.4h
-    addp        v0.4h,  v0.4h,  v1.4h // s0, s1, s2, s3
-    addp        v1.4h,  v3.4h,  v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
-    uzp2        v0.4h,  v0.4h,  v0.4h // s1,    s3,    s1,    s3
-    uzp1        v1.2d,  v1.2d,  v1.2d
-    uzp1        v0.2d,  v0.2d,  v0.2d
-    rshrn       v3.8b,  v1.8h,  #3
-    rshrn       v2.8b,  v0.8h,  #2
-    uzp1        v0.8b,  v3.8b,  v2.8b
-    uzp2        v1.8b,  v2.8b,  v3.8b
-pred8x8c_dc_end:
-    add         x2,  x0,  #2 * FDEC_STRIDE
-    add         x4,  x0,  #4 * FDEC_STRIDE
-    add         x5,  x0,  #6 * FDEC_STRIDE
-    st1        {v0.8b}, [x0], x1
-    st1        {v0.8b}, [x2], x1
-    st1        {v0.8b}, [x0]
-    st1        {v0.8b}, [x2]
-    st1        {v1.8b}, [x4], x1
-    st1        {v1.8b}, [x5], x1
-    st1        {v1.8b}, [x4]
-    st1        {v1.8b}, [x5]
-    ret
-endfunc
-
-function x264_predict_8x8c_h_neon, export=1
-    sub         x1,  x0,  #1
-    mov         x7,  #FDEC_STRIDE
-.rept 4
-    ld1r       {v0.8b}, [x1], x7
-    ld1r       {v1.8b}, [x1], x7
-    st1        {v0.8b}, [x0], x7
-    st1        {v1.8b}, [x0], x7
-.endr
-    ret
-endfunc
-
-function x264_predict_8x8c_v_aarch64, export=1
-    ldr         x1,  [x0, #-FDEC_STRIDE]
-.irp c, 0,1,2,3,4,5,6,7
-    str         x1,  [x0, #\c * FDEC_STRIDE]
-.endr
-    ret
-endfunc
-
-function x264_predict_8x8c_p_neon, export=1
-    sub         x3,  x0,  #FDEC_STRIDE
-    mov         x1,  #FDEC_STRIDE
-    add         x2,  x3,  #4
-    sub         x3,  x3,  #1
-    ld1        {v0.s}[0], [x3]
-    ld1        {v2.s}[0], [x2], x1
-    ldcol.8     v0,  x3,  x1,  4,  hi=1
-    add         x3,  x3,  x1
-    ldcol.8     v3,  x3,  x1,  4
-    movrel      x4,  p8weight
-    movrel      x5,  p16weight
-    uaddl       v4.8h,  v2.8b,  v3.8b
-    rev32       v0.8b,  v0.8b
-    trn1        v2.2s,  v2.2s,  v3.2s
-    ld1        {v7.8h}, [x4]
-    usubl       v2.8h,  v2.8b,  v0.8b
-    mul         v2.8h,  v2.8h,  v7.8h
-    ld1        {v0.8h}, [x5]
-    saddlp      v2.4s,  v2.8h
-    addp        v2.4s,  v2.4s,  v2.4s
-    shl         v3.2s,  v2.2s,  #4
-    add         v2.2s,  v2.2s,  v3.2s
-    rshrn       v5.4h,  v2.4s,  #5    // b, c, x, x
-    addp        v2.4h,  v5.4h,  v5.4h
-    shl         v3.4h,  v2.4h,  #2
-    sub         v3.4h,  v3.4h,  v2.4h // 3 * (b + c)
-    rev64       v4.4h,  v4.4h
-    add         v4.4h,  v4.4h,  v0.4h
-    shl         v2.4h,  v4.4h,  #4              // a
-    sub         v2.4h,  v2.4h,  v3.4h           // a - 3 * (b + c) + 16
-    ext         v0.16b, v0.16b, v0.16b, #14
-    sub         v6.4h,  v5.4h,  v3.4h
-    mov         v0.h[0],  wzr
-    mul         v0.8h,  v0.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
-    dup         v1.8h,  v2.h[0]                 // pix
-    dup         v2.8h,  v5.h[1]                 // c
-    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
-    mov         x3,  #8
-1:
-    subs        x3,  x3,  #1
-    sqshrun     v0.8b,  v1.8h,  #5
-    add         v1.8h,  v1.8h,  v2.8h
-    st1        {v0.8b}, [x0], x1
-    b.ne        1b
-    ret
-endfunc
-
-
-.macro loadsum4 wd, t1, t2, t3, x, idx
-    ldrb        \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
-    ldrb        \t1,  [\x, #(\idx + 1) * FDEC_STRIDE - 1]
-    ldrb        \t2,  [\x, #(\idx + 2) * FDEC_STRIDE - 1]
-    ldrb        \t3,  [\x, #(\idx + 3) * FDEC_STRIDE - 1]
-    add         \wd,  \wd,  \t1
-    add         \t1,  \t2,  \t3
-    add         \wd,  \wd,  \t1
-.endm
-
-function x264_predict_8x16c_h_neon, export=1
-    sub         x2,  x0,  #1
-    add         x3,  x0,  #FDEC_STRIDE - 1
-    mov         x7,  #2 * FDEC_STRIDE
-    add         x1,  x0,  #FDEC_STRIDE
-.rept 4
-    ld1r       {v0.8b}, [x2], x7
-    ld1r       {v1.8b}, [x3], x7
-    ld1r       {v2.8b}, [x2], x7
-    ld1r       {v3.8b}, [x3], x7
-    st1        {v0.8b}, [x0], x7
-    st1        {v1.8b}, [x1], x7
-    st1        {v2.8b}, [x0], x7
-    st1        {v3.8b}, [x1], x7
-.endr
-    ret
-endfunc
-
-function x264_predict_8x16c_v_neon, export=1
-    sub         x1,  x0,  #FDEC_STRIDE
-    mov         x2,  #2 * FDEC_STRIDE
-    ld1        {v0.8b}, [x1], x2
-.rept 8
-    st1        {v0.8b}, [x0], x2
-    st1        {v0.8b}, [x1], x2
-.endr
-    ret
-endfunc
-
-function x264_predict_8x16c_p_neon, export=1
-    movrel      x4,  p16weight
-    ld1        {v17.8h}, [x4]
-    sub         x3,  x0,  #FDEC_STRIDE
-    mov         x1,  #FDEC_STRIDE
-    add         x2,  x3,  #4
-    sub         x3,  x3,  #1
-
-    ld1        {v0.8b}, [x3]
-    ld1        {v2.8b}, [x2], x1
-    ldcol.8     v1,  x3,  x1
-    add         x3,  x3,  x1
-    ldcol.8     v3,  x3,  x1
-    ext         v4.8b,  v2.8b,  v2.8b,  #3
-    ext         v5.8b,  v3.8b,  v3.8b,  #7
-    rev32       v0.8b,  v0.8b
-    rev64       v1.8b,  v1.8b
-
-    uaddl       v4.8h,  v5.8b,  v4.8b // a * 1/16
-
-    usubl       v2.8h,  v2.8b,  v0.8b
-    mul         v2.8h,  v2.8h,  v17.8h
-    saddlp      v2.4s,  v2.8h
-    addp        v2.4s,  v2.4s,  v2.4s  // H
-
-    usubl       v3.8h,  v3.8b,  v1.8b
-    mul         v3.8h,  v3.8h,  v17.8h
-    saddlp      v3.4s,  v3.8h
-    addp        v3.4s,  v3.4s,  v3.4s
-    addp        v3.4s,  v3.4s,  v3.4s  // V
-
-    ext         v17.16b, v17.16b, v17.16b, #14
-
-    shl         v4.4h,  v4.4h,  #4     // a
-    shl         v6.2s,  v2.2s,  #4     // 16 * H
-    shl         v7.2s,  v3.2s,  #2     // 4 * V
-    add         v2.2s,  v2.2s,  v6.2s  // 17 * H
-    add         v3.2s,  v3.2s,  v7.2s  // 5 * V
-    rshrn       v2.4h,  v2.4s,  #5     // b
-    rshrn       v3.4h,  v3.4s,  #6     // c
-
-    mov         v17.h[0],  wzr
-
-    sub         v4.4h,  v4.4h,  v2.4h  // a - b
-    shl         v6.4h,  v2.4h,  #1     // 2 * b
-    add         v4.4h,  v4.4h,  v3.4h  // a - b + c
-    shl         v7.4h,  v3.4h,  #3     // 8 * c
-    sub         v4.4h,  v4.4h,  v6.4h  // a - 3b + c
-    sub         v4.4h,  v4.4h,  v7.4h  // a - 3b - 7c
-
-    mul         v0.8h,  v17.8h, v2.h[0]         // 0,1,2,3,4,5,6,7 * b
-    dup         v1.8h,  v4.h[0]                 // i00
-    dup         v2.8h,  v3.h[0]                 // c
-    add         v1.8h,  v1.8h,  v0.8h           // pix + {0..7}*b
-    mov         x3,  #16
-1:
-    subs        x3,  x3,  #2
-    sqrshrun    v4.8b,  v1.8h,  #5
-    add         v1.8h,  v1.8h,  v2.8h
-    sqrshrun    v5.8b,  v1.8h,  #5
-    st1        {v4.8b}, [x0], x1
-    add         v1.8h,  v1.8h,  v2.8h
-    st1        {v5.8b}, [x0], x1
-    b.ne        1b
-    ret
-endfunc
-
-function x264_predict_8x16c_dc_neon, export=1
-    mov         x1,  #FDEC_STRIDE
-    sub         x10, x0,  #FDEC_STRIDE
-    loadsum4    w2, w3, w4, w5, x0, 0
-    ld1        {v6.8b}, [x10]
-    loadsum4    w6, w7, w8, w9, x0, 4
-    uaddlp      v6.4h,  v6.8b
-    dup         v22.8h, w2              // s2
-    dup         v23.8h, w6              // s3
-    loadsum4    w2, w3, w4, w5, x0, 8
-    addp        v6.4h,  v6.4h,  v6.4h   // s0, s1
-    loadsum4    w6, w7, w8, w9, x0, 12
-    dup         v20.8h, v6.h[0]         // s0
-    dup         v21.8h, v6.h[1]         // s1
-    dup         v24.8h, w2              // s4
-    dup         v25.8h, w6              // s5
-
-    ext         v16.16b, v20.16b, v21.16b, #8
-    ext         v17.16b, v22.16b, v21.16b, #8
-    ext         v1.16b,  v23.16b, v21.16b, #8
-    ext         v2.16b,  v24.16b, v21.16b, #8
-    ext         v3.16b,  v25.16b, v21.16b, #8
-
-    add         v0.8h,  v16.8h, v17.8h
-    add         v1.8h,  v1.8h,  v23.8h
-    add         v2.8h,  v2.8h,  v24.8h
-    add         v3.8h,  v3.8h,  v25.8h
-
-    rshrn       v0.8b,  v0.8h,  #3
-    rshrn       v1.8b,  v1.8h,  #3
-    rshrn       v2.8b,  v2.8h,  #3
-    rshrn       v3.8b,  v3.8h,  #3
-
-    add         x11, x0,  #4  * FDEC_STRIDE
-    add         x12, x0,  #8  * FDEC_STRIDE
-    add         x13, x0,  #12 * FDEC_STRIDE
-.rept 4
-    st1        {v0.8b}, [x0],  x1
-    st1        {v1.8b}, [x11], x1
-    st1        {v2.8b}, [x12], x1
-    st1        {v3.8b}, [x13], x1
-.endr
-    ret
-endfunc
-
-function x264_predict_8x16c_dc_left_neon, export=1
-    mov         x1,  #FDEC_STRIDE
-    ldrb        w2,  [x0, # 0 * FDEC_STRIDE - 1]
-    ldrb        w3,  [x0, # 1 * FDEC_STRIDE - 1]
-    ldrb        w4,  [x0, # 2 * FDEC_STRIDE - 1]
-    ldrb        w5,  [x0, # 3 * FDEC_STRIDE - 1]
-    add         w2,  w2,  w3
-
-    ldrb        w6,  [x0, # 4 * FDEC_STRIDE - 1]
-    add         w4,  w4,  w5
-    ldrb        w7,  [x0, # 5 * FDEC_STRIDE - 1]
-    add         w2,  w2,  w4
-    ldrb        w8,  [x0, # 6 * FDEC_STRIDE - 1]
-    ldrb        w9,  [x0, # 7 * FDEC_STRIDE - 1]
-    dup         v0.8h,  w2
-    add         w6,  w6,  w7
-    rshrn       v0.8b,  v0.8h,  #2
-    add         w8,  w8,  w9
-
-    ldrb        w10, [x0, # 8 * FDEC_STRIDE - 1]
-    ldrb        w11, [x0, # 9 * FDEC_STRIDE - 1]
-    add         w6,  w6,  w8
-    ldrb        w12, [x0, #10 * FDEC_STRIDE - 1]
-    ldrb        w13, [x0, #11 * FDEC_STRIDE - 1]
-    dup         v1.8h,  w6
-    add         w10,  w10,  w11
-    rshrn       v1.8b,  v1.8h,  #2
-    add         w12,  w12,  w13
-
-    ldrb        w2,  [x0, #12 * FDEC_STRIDE - 1]
-    ldrb        w3,  [x0, #13 * FDEC_STRIDE - 1]
-    add         w10,  w10,  w12
-    ldrb        w4,  [x0, #14 * FDEC_STRIDE - 1]
-    ldrb        w5,  [x0, #15 * FDEC_STRIDE - 1]
-    dup         v2.8h,  w10
-    add         w2,  w2,  w3
-    rshrn       v2.8b,  v2.8h,  #2
-    add         w4,  w4,  w5
-    st1        {v0.8b}, [x0], x1
-    st1        {v0.8b}, [x0], x1
-    add         w2,  w2,  w4
-    st1        {v0.8b}, [x0], x1
-    dup         v3.8h,  w2
-    st1        {v0.8b}, [x0], x1
-    rshrn       v3.8b,  v3.8h,  #2
-
-.irp  idx, 1, 2, 3
-.rept 4
-    st1        {v\idx\().8b}, [x0], x1
-.endr
-.endr
-    ret
-endfunc
-
-function x264_predict_8x16c_dc_top_neon, export=1
-    sub         x2,  x0,  #FDEC_STRIDE
-    mov         x1,  #FDEC_STRIDE
-    ld1        {v0.8b}, [x2]
-    uaddlp      v0.4h,  v0.8b
-    addp        v0.4h,  v0.4h,  v0.4h
-    rshrn       v4.8b,  v0.8h,  #2
-    dup         v0.8b,  v4.b[0]
-    dup         v1.8b,  v4.b[1]
-    ext         v0.8b,  v0.8b,  v1.8b,  #4
-.rept 16
-    st1        {v0.8b}, [x0], x1
-.endr
-    ret
-endfunc
-
-
-function x264_predict_16x16_dc_top_neon, export=1
-    sub         x2,  x0,  #FDEC_STRIDE
-    mov         x1,  #FDEC_STRIDE
-    ld1        {v0.16b}, [x2]
-    uaddlv      h0,     v0.16b
-    rshrn       v0.8b,  v0.8h,  #4
-    dup         v0.16b, v0.b[0]
-    b           pred16x16_dc_end
-endfunc
-
-function x264_predict_16x16_dc_left_neon, export=1
-    sub         x2,  x0,  #1
-    mov         x1,  #FDEC_STRIDE
-    ldcol.16    v0,  x2,  x1
-    uaddlv      h0,     v0.16b
-    rshrn       v0.8b,  v0.8h,  #4
-    dup         v0.16b, v0.b[0]
-    b           pred16x16_dc_end
-endfunc
-
-function x264_predict_16x16_dc_neon, export=1
-    sub         x3,  x0,  #FDEC_STRIDE
-    sub         x2,  x0,  #1
-    mov         x1,  #FDEC_STRIDE
-    ld1        {v0.16b}, [x3]
-    ldcol.16    v1,  x2,  x1
-    uaddlv      h0,     v0.16b
-    uaddlv      h1,     v1.16b
-    add         v0.4h,  v0.4h,  v1.4h
-    rshrn       v0.8b,  v0.8h,  #5
-    dup         v0.16b, v0.b[0]
-pred16x16_dc_end:
-.rept 16
-    st1        {v0.16b}, [x0], x1
-.endr
-    ret
-endfunc
-
-function x264_predict_16x16_h_neon, export=1
-    sub         x1,  x0,  #1
-    mov         x7, #FDEC_STRIDE
-.rept 8
-    ld1r       {v0.16b}, [x1], x7
-    ld1r       {v1.16b}, [x1], x7
-    st1        {v0.16b}, [x0], x7
-    st1        {v1.16b}, [x0], x7
-.endr
-    ret
-endfunc
-
-function x264_predict_16x16_v_neon, export=1
-    sub         x0,  x0,  #FDEC_STRIDE
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.16b}, [x0], x7
-.rept 16
-    st1        {v0.16b}, [x0], x7
-.endr
-    ret
-endfunc
-
-function x264_predict_16x16_p_neon, export=1
-    sub         x3,  x0,  #FDEC_STRIDE
-    mov         x1,  #FDEC_STRIDE
-    add         x2,  x3,  #8
-    sub         x3,  x3,  #1
-    ld1        {v0.8b}, [x3]
-    ld1        {v2.8b}, [x2], x1
-    ldcol.8     v1,  x3,  x1
-    add         x3,  x3,  x1
-    ldcol.8     v3,  x3,  x1
-    rev64       v0.8b,  v0.8b
-    rev64       v1.8b,  v1.8b
-    movrel      x4,  p16weight
-    uaddl       v4.8h,  v2.8b,  v3.8b
-    ld1        {v7.8h}, [x4]
-    usubl       v2.8h,  v2.8b,  v0.8b
-    usubl       v3.8h,  v3.8b,  v1.8b
-    mul         v2.8h,  v2.8h,  v7.8h
-    mul         v3.8h,  v3.8h,  v7.8h
-    saddlp      v2.4s,  v2.8h
-    saddlp      v3.4s,  v3.8h
-    addp        v2.4s,  v2.4s,  v3.4s
-    addp        v2.4s,  v2.4s,  v2.4s
-    shl         v3.2s,  v2.2s,  #2
-    add         v2.2s,  v2.2s,  v3.2s
-    rshrn       v5.4h,  v2.4s,  #6    // b, c, x, x
-    addp        v2.4h,  v5.4h,  v5.4h
-    shl         v3.4h,  v2.4h,  #3
-    sub         v3.4h,  v3.4h,  v2.4h // 7 * (b + c)
-    ext         v4.16b, v4.16b, v4.16b, #14
-    add         v4.4h,  v4.4h,  v7.4h
-    shl         v2.4h,  v4.4h,  #4              // a
-    sub         v2.4h,  v2.4h,  v3.4h           // a - 7 * (b + c) + 16
-    ext         v7.16b, v7.16b, v7.16b, #14
-    mov         v7.h[0],  wzr
-    dup         v3.8h,  v5.h[0]
-    mul         v0.8h,  v7.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
-    dup         v1.8h,  v2.h[0]                 // pix
-    dup         v2.8h,  v5.h[1]                 // c
-    shl         v3.8h,  v3.8h,  #3
-    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
-    add         v3.8h,  v3.8h,  v1.8h           // pix + x{8-15}*b
-    mov         x3,  #16
-1:
-    subs        x3,  x3,  #1
-    sqshrun     v0.8b,  v1.8h,  #5
-    add         v1.8h,  v1.8h,  v2.8h
-    sqshrun2    v0.16b, v3.8h,  #5
-    add         v3.8h,  v3.8h,  v2.8h
-    st1        {v0.16b}, [x0], x1
-    b.ne        1b
-    ret
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/predict-c.c b/android/src/main/libenc/jni/libx264/common/aarch64/predict-c.c
deleted file mode 100755
index c06e7d1..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/predict-c.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*****************************************************************************
- * predict.c: aarch64 intra prediction
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "predict.h"
-#include "pixel.h"
-
-void x264_predict_4x4_dc_top_neon( uint8_t *src );
-void x264_predict_4x4_ddr_neon( uint8_t *src );
-void x264_predict_4x4_ddl_neon( uint8_t *src );
-
-void x264_predict_8x8c_dc_top_neon( uint8_t *src );
-void x264_predict_8x8c_dc_left_neon( uint8_t *src );
-void x264_predict_8x8c_p_neon( uint8_t *src );
-
-void x264_predict_8x16c_dc_left_neon( uint8_t *src );
-void x264_predict_8x16c_dc_top_neon( uint8_t *src );
-void x264_predict_8x16c_p_neon( uint8_t *src );
-
-void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
-
-void x264_predict_16x16_dc_top_neon( uint8_t *src );
-void x264_predict_16x16_dc_left_neon( uint8_t *src );
-void x264_predict_16x16_p_neon( uint8_t *src );
-
-void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )
-{
-#if !HIGH_BIT_DEPTH
-    if (cpu&X264_CPU_ARMV8)
-    {
-        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_aarch64;
-        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_aarch64;
-    }
-
-    if (cpu&X264_CPU_NEON)
-    {
-        pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
-        pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
-        pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
-        pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
-    }
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
-{
-#if !HIGH_BIT_DEPTH
-    if (cpu&X264_CPU_ARMV8) {
-        pf[I_PRED_CHROMA_V]   = x264_predict_8x8c_v_aarch64;
-    }
-
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
-    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-
-void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_CHROMA_V ]     = x264_predict_8x16c_v_neon;
-    pf[I_PRED_CHROMA_H ]     = x264_predict_8x16c_h_neon;
-    pf[I_PRED_CHROMA_DC]     = x264_predict_8x16c_dc_neon;
-    pf[I_PRED_CHROMA_P ]     = x264_predict_8x16c_p_neon;
-    pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_neon;
-    pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
-    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
-    pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
-    pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
-    pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
-    pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
-    pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
-    pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
-    pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
-    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
-    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
-    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
-    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
-    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/predict.h b/android/src/main/libenc/jni/libx264/common/aarch64/predict.h
deleted file mode 100755
index f0589a5..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/predict.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*****************************************************************************
- * predict.h: aarch64 intra prediction
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_AARCH64_PREDICT_H
-#define X264_AARCH64_PREDICT_H
-
-void x264_predict_4x4_h_aarch64( uint8_t *src );
-void x264_predict_4x4_v_aarch64( uint8_t *src );
-void x264_predict_8x8c_v_aarch64( uint8_t *src );
-
-// for the merged 4x4 intra sad/satd which expects unified suffix
-#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
-#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
-#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
-
-void x264_predict_4x4_dc_neon( uint8_t *src );
-void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8c_dc_neon( uint8_t *src );
-void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x16c_v_neon( uint8_t *src );
-void x264_predict_8x16c_h_neon( uint8_t *src );
-void x264_predict_8x16c_dc_neon( uint8_t *src );
-void x264_predict_16x16_v_neon( uint8_t *src );
-void x264_predict_16x16_h_neon( uint8_t *src );
-void x264_predict_16x16_dc_neon( uint8_t *src );
-
-void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );
-void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
-void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] );
-void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] );
-
-#endif /* X264_AARCH64_PREDICT_H */
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/quant-a.S b/android/src/main/libenc/jni/libx264/common/aarch64/quant-a.S
deleted file mode 100755
index 46b971e..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/quant-a.S
+++ /dev/null
@@ -1,592 +0,0 @@
-/****************************************************************************
- * quant.S: arm quantization and level-run
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *          Martin Storsjo <martin@martin.st>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
-    add         v18.8h, v18.8h, \bias0
-    add         v19.8h, v19.8h, \bias1
-    umull       v20.4s, v18.4h, \mf0_1\().4h
-    umull2      v21.4s, v18.8h, \mf0_1\().8h
-    umull       v22.4s, v19.4h, \mf2_3\().4h
-    umull2      v23.4s, v19.8h, \mf2_3\().8h
-    sshr        v16.8h, v16.8h, #15
-    sshr        v17.8h, v17.8h, #15
-    shrn        v18.4h, v20.4s, #16
-    shrn2       v18.8h, v21.4s, #16
-    shrn        v19.4h, v22.4s, #16
-    shrn2       v19.8h, v23.4s, #16
-    eor         v18.16b, v18.16b, v16.16b
-    eor         v19.16b, v19.16b, v17.16b
-    sub         v18.8h, v18.8h, v16.8h
-    sub         v19.8h, v19.8h, v17.8h
-    orr         \mask,  v18.16b, v19.16b
-    st1        {v18.8h,v19.8h}, [x0], #32
-.endm
-
-.macro QUANT_END d
-    fmov        x2,  \d
-    mov         w0,  #0
-    tst         x2,  x2
-    cinc        w0,  w0,  ne
-    ret
-.endm
-
-// quant_2x2_dc( int16_t dct[4], int mf, int bias )
-function x264_quant_2x2_dc_neon, export=1
-    ld1        {v0.4h}, [x0]
-    dup         v2.4h,  w2
-    dup         v1.4h,  w1
-    abs         v3.4h,  v0.4h
-    add         v3.4h,  v3.4h,  v2.4h
-    umull       v3.4s,  v3.4h,  v1.4h
-    sshr        v0.4h,  v0.4h,  #15
-    shrn        v3.4h,  v3.4s,  #16
-    eor         v3.8b,  v3.8b,  v0.8b
-    sub         v3.4h,  v3.4h,  v0.4h
-    st1        {v3.4h}, [x0]
-    QUANT_END   d3
-endfunc
-
-// quant_4x4_dc( int16_t dct[16], int mf, int bias )
-function x264_quant_4x4_dc_neon, export=1
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h,  v16.8h
-    abs         v19.8h,  v17.8h
-    dup         v0.8h,  w2
-    dup         v2.8h,  w1
-    QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
-    uqxtn       v0.8b,  v0.8h
-    QUANT_END   d0
-endfunc
-
-// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4_neon, export=1
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h,  v16.8h
-    abs         v19.8h,  v17.8h
-    ld1        {v0.8h,v1.8h}, [x2]
-    ld1        {v2.8h,v3.8h}, [x1]
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
-    uqxtn       v0.8b,  v0.8h
-    QUANT_END   d0
-endfunc
-
-// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4x4_neon, export=1
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h, v16.8h
-    abs         v19.8h, v17.8h
-    ld1        {v0.8h,v1.8h}, [x2]
-    ld1        {v2.8h,v3.8h}, [x1]
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h, v16.8h
-    abs         v19.8h, v17.8h
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h, v16.8h
-    abs         v19.8h, v17.8h
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h, v16.8h
-    abs         v19.8h, v17.8h
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b
-    uqxtn       v4.8b,  v4.8h
-    uqxtn       v7.8b,  v7.8h
-    uqxtn       v6.8b,  v6.8h
-    uqxtn       v5.8b,  v5.8h
-    fmov        x7,  d7
-    fmov        x6,  d6
-    fmov        x5,  d5
-    fmov        x4,  d4
-    mov         w0,  #0
-    tst         x7,  x7
-    cinc        w0,  w0,  ne
-    lsl         w0,  w0,  #1
-    tst         x6,  x6
-    cinc        w0,  w0,  ne
-    lsl         w0,  w0,  #1
-    tst         x5,  x5
-    cinc        w0,  w0,  ne
-    lsl         w0,  w0,  #1
-    tst         x4,  x4
-    cinc        w0,  w0,  ne
-    ret
-endfunc
-
-// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-function x264_quant_8x8_neon, export=1
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h, v16.8h
-    abs         v19.8h, v17.8h
-    ld1        {v0.8h,v1.8h}, [x2], #32
-    ld1        {v2.8h,v3.8h}, [x1], #32
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
-.rept 3
-    ld1        {v16.8h,v17.8h}, [x0]
-    abs         v18.8h, v16.8h
-    abs         v19.8h, v17.8h
-    ld1        {v0.8h,v1.8h}, [x2], #32
-    ld1        {v2.8h,v3.8h}, [x1], #32
-    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
-    orr         v4.16b, v4.16b, v5.16b
-.endr
-    uqxtn       v0.8b,  v4.8h
-    QUANT_END   d0
-endfunc
-
-.macro DEQUANT_START mf_size offset dc=no
-    mov         w3,  #0x2b
-    mul         w3,  w3,  w2
-    lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
-    add         w5,  w3,  w3,  lsl #1
-    sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
-    lsl         w2,  w2,  #\mf_size
-.ifc \dc,no
-    add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
-.else
-    ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
-.endif
-    subs        w3,  w3,  #\offset      // 6 for 8x8
-.endm
-
-// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-.macro DEQUANT size bits
-function x264_dequant_\size\()_neon, export=1
-    DEQUANT_START \bits+2, \bits
-.ifc \size, 8x8
-    mov         w2,  #4
-.endif
-    b.lt        dequant_\size\()_rshift
-
-    dup         v31.8h, w3
-dequant_\size\()_lshift_loop:
-.ifc \size, 8x8
-    subs        w2,  w2,  #1
-.endif
-    ld1        {v16.4s}, [x1], #16
-    ld1        {v17.4s}, [x1], #16
-    sqxtn       v2.4h,  v16.4s
-    ld1        {v18.4s}, [x1], #16
-    sqxtn2      v2.8h,  v17.4s
-    ld1        {v19.4s}, [x1], #16
-    sqxtn       v3.4h,  v18.4s
-    ld1        {v0.8h,v1.8h}, [x0]
-    sqxtn2      v3.8h,  v19.4s
-    mul         v0.8h,  v0.8h,  v2.8h
-    mul         v1.8h,  v1.8h,  v3.8h
-    sshl        v0.8h,  v0.8h,  v31.8h
-    sshl        v1.8h,  v1.8h,  v31.8h
-    st1        {v0.8h,v1.8h}, [x0], #32
-.ifc \size, 8x8
-    b.gt        dequant_\size\()_lshift_loop
-.endif
-    ret
-
-dequant_\size\()_rshift:
-    dup         v31.4s, w3
-    neg         w3,  w3
-    mov         w5,  #1
-    sub         w3,  w3,  #1
-    lsl         w5,  w5,  w3
-
-.ifc \size, 8x8
-dequant_\size\()_rshift_loop:
-    subs        w2,  w2,  #1
-.endif
-    ld1        {v16.4s}, [x1], #16
-    ld1        {v17.4s}, [x1], #16
-    sqxtn       v2.4h,  v16.4s
-    ld1        {v18.4s}, [x1], #16
-    dup         v16.4s, w5
-    sqxtn2      v2.8h,  v17.4s
-    ld1        {v19.4s}, [x1], #16
-    dup         v17.4s, w5
-    sqxtn       v3.4h,  v18.4s
-    ld1        {v0.8h,v1.8h}, [x0]
-    dup         v18.4s, w5
-    sqxtn2      v3.8h,  v19.4s
-    dup         v19.4s, w5
-
-    smlal       v16.4s, v0.4h,  v2.4h
-    smlal2      v17.4s, v0.8h,  v2.8h
-    smlal       v18.4s, v1.4h,  v3.4h
-    smlal2      v19.4s, v1.8h,  v3.8h
-    sshl        v16.4s, v16.4s, v31.4s
-    sshl        v17.4s, v17.4s, v31.4s
-    sshl        v18.4s, v18.4s, v31.4s
-    sshl        v19.4s, v19.4s, v31.4s
-
-    sqxtn       v0.4h,  v16.4s
-    sqxtn2      v0.8h,  v17.4s
-    sqxtn       v1.4h,  v18.4s
-    sqxtn2      v1.8h,  v19.4s
-    st1        {v0.8h,v1.8h}, [x0], #32
-.ifc \size, 8x8
-    b.gt        dequant_\size\()_rshift_loop
-.endif
-    ret
-endfunc
-.endm
-
-DEQUANT 4x4, 4
-DEQUANT 8x8, 6
-
-// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-function x264_dequant_4x4_dc_neon, export=1
-    DEQUANT_START 6, 6, yes
-    b.lt        dequant_4x4_dc_rshift
-
-    lsl         w1,  w1,  w3
-    dup         v2.8h,  w1
-    ld1        {v0.8h,v1.8h},   [x0]
-
-    mul         v0.8h,  v0.8h,  v2.8h
-    mul         v1.8h,  v1.8h,  v2.8h
-    st1        {v0.8h,v1.8h},   [x0]
-    ret
-
-dequant_4x4_dc_rshift:
-    dup         v4.8h,  w1
-    dup         v3.4s, w3
-    neg         w3,  w3
-    mov         w5,  #1
-    sub         w3,  w3,  #1
-    lsl         w5,  w5,  w3
-
-    dup         v16.4s, w5
-    dup         v17.4s, w5
-    ld1        {v0.8h,v1.8h}, [x0]
-    dup         v18.4s, w5
-    dup         v19.4s, w5
-
-    smlal       v16.4s, v0.4h,  v4.4h
-    smlal2      v17.4s, v0.8h,  v4.8h
-    smlal       v18.4s, v1.4h,  v4.4h
-    smlal2      v19.4s, v1.8h,  v4.8h
-    sshl        v16.4s, v16.4s, v3.4s
-    sshl        v17.4s, v17.4s, v3.4s
-    sshl        v18.4s, v18.4s, v3.4s
-    sshl        v19.4s, v19.4s, v3.4s
-
-    sqxtn       v0.4h,  v16.4s
-    sqxtn2      v0.8h,  v17.4s
-    sqxtn       v1.4h,  v18.4s
-    sqxtn2      v1.8h,  v19.4s
-    st1        {v0.8h,v1.8h}, [x0]
-    ret
-endfunc
-
-.macro decimate_score_1x size
-function x264_decimate_score\size\()_neon, export=1
-    ld1        {v0.8h,v1.8h}, [x0]
-    movrel      x5,  X(x264_decimate_table4)
-    movi        v3.16b, #0x01
-    sqxtn       v0.8b,  v0.8h
-    sqxtn2      v0.16b, v1.8h
-    abs         v2.16b, v0.16b
-    cmeq        v1.16b, v0.16b, #0
-    cmhi        v2.16b, v2.16b, v3.16b
-    shrn        v1.8b,  v1.8h,  #4
-    shrn        v2.8b,  v2.8h,  #4
-    fmov        x2,  d2
-    fmov        x1,  d1
-    cbnz        x2,  9f
-    mvn         x1,  x1
-    mov         w0,  #0
-    cbz         x1,  0f
-.ifc \size, 15
-    lsr         x1,  x1,  #1
-.endif
-    rbit        x1,  x1
-1:
-    clz         x3,  x1
-    lsr         x6,  x3,  #2
-    lsl         x1,  x1,  x3
-    ldrb        w7,  [x5, x6]
-    lsl         x1,  x1,  #4
-    add         w0,  w0,  w7
-    cbnz        x1,  1b
-    ret
-9:
-    mov         w0,  #9
-0:
-    ret
-endfunc
-.endm
-
-decimate_score_1x 15
-decimate_score_1x 16
-
-const mask64, align=6
-    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
-    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
-endconst
-
-function x264_decimate_score64_neon, export=1
-    ld1        {v0.8h,v1.8h}, [x0], #32
-    ld1        {v2.8h,v3.8h}, [x0], #32
-    ld1        {v4.8h,v5.8h}, [x0], #32
-    ld1        {v6.8h,v7.8h}, [x0]
-    movrel      x6,  mask64
-    movi        v31.16b, #0x01
-    sqxtn       v16.8b,  v1.8h
-    sqxtn2      v16.16b, v0.8h
-    sqxtn       v17.8b,  v3.8h
-    sqxtn2      v17.16b, v2.8h
-    sqxtn       v18.8b,  v5.8h
-    sqxtn2      v18.16b, v4.8h
-    sqxtn       v19.8b,  v7.8h
-    sqxtn2      v19.16b, v6.8h
-    abs         v4.16b, v16.16b
-    abs         v5.16b, v17.16b
-    abs         v6.16b, v18.16b
-    abs         v7.16b, v19.16b
-    ld1        {v30.16b}, [x6]
-    cmeq        v0.16b, v16.16b, #0
-    cmeq        v1.16b, v17.16b, #0
-    cmeq        v2.16b, v18.16b, #0
-    cmeq        v3.16b, v19.16b, #0
-    umax        v4.16b, v4.16b, v5.16b
-    umax        v6.16b, v6.16b, v7.16b
-    and         v0.16b, v0.16b, v30.16b
-    and         v1.16b, v1.16b, v30.16b
-    and         v2.16b, v2.16b, v30.16b
-    and         v3.16b, v3.16b, v30.16b
-    umax        v4.16b, v4.16b, v6.16b
-    addp        v0.16b, v1.16b, v0.16b
-    addp        v2.16b, v3.16b, v2.16b
-    cmhi        v4.16b, v4.16b, v31.16b
-    addp        v0.16b, v2.16b, v0.16b
-    shrn        v4.8b,  v4.8h,  #4
-    addp        v0.16b, v0.16b, v0.16b
-    fmov        x2,  d4
-    fmov        x1,  d0
-    cbnz        x2,  9f
-    mvn         x1,  x1
-    mov         w0,  #0
-    cbz         x1,  0f
-    movrel      x5,  X(x264_decimate_table8)
-1:
-    clz         x3,  x1
-    lsl         x1,  x1,  x3
-    ldrb        w7,  [x5, x3]
-    lsl         x1,  x1,  #1
-    add         w0,  w0,  w7
-    cbnz        x1,  1b
-    ret
-9:
-    mov         w0,  #9
-0:
-    ret
-endfunc
-
-// int coeff_last( int16_t *l )
-function x264_coeff_last4_aarch64, export=1
-    ldr         x2,  [x0]
-    mov         w4,  #3
-    clz         x0,  x2
-    sub         w0,  w4,  w0, lsr #4
-    ret
-endfunc
-
-function x264_coeff_last8_aarch64, export=1
-    ldr         x3,  [x0, #8]
-    mov         w4,  #7
-    clz         x2,  x3
-    cmp         w2,  #64
-    b.ne        1f
-    ldr         x3,  [x0]
-    sub         w4,  w4,  #4
-    clz         x2,  x3
-1:
-    sub         w0,  w4,  w2, lsr #4
-    ret
-endfunc
-
-.macro COEFF_LAST_1x size
-function x264_coeff_last\size\()_neon, export=1
-.if \size == 15
-    sub         x0,  x0,  #2
-.endif
-    ld1        {v0.8h,v1.8h}, [x0]
-    uqxtn       v0.8b,  v0.8h
-    uqxtn2      v0.16b, v1.8h
-    cmtst       v0.16b, v0.16b, v0.16b
-    shrn        v0.8b,  v0.8h,  #4
-    fmov        x1,  d0
-    mov         w3,  #\size - 1
-    clz         x2,  x1
-    sub         w0,  w3,  w2, lsr #2
-    ret
-endfunc
-.endm
-
-COEFF_LAST_1x 15
-COEFF_LAST_1x 16
-
-function x264_coeff_last64_neon, export=1
-    ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
-    movi        v31.8h,  #8
-    movi        v30.8h,  #1
-    uqxtn       v0.8b,  v0.8h
-    uqxtn2      v0.16b, v1.8h
-    ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
-    uqxtn       v1.8b,  v2.8h
-    uqxtn2      v1.16b, v3.8h
-    uqxtn       v2.8b,  v4.8h
-    uqxtn2      v2.16b, v5.8h
-    uqxtn       v3.8b,  v6.8h
-    uqxtn2      v3.16b, v7.8h
-
-    cmtst       v0.16b, v0.16b, v0.16b
-    cmtst       v1.16b, v1.16b, v1.16b
-    cmtst       v2.16b, v2.16b, v2.16b
-    cmtst       v3.16b, v3.16b, v3.16b
-
-    shrn        v0.8b,  v0.8h,  #4
-    shrn2       v0.16b, v1.8h,  #4
-    shrn        v1.8b,  v2.8h,  #4
-    shrn2       v1.16b, v3.8h,  #4
-
-    clz         v0.4s,  v0.4s
-    clz         v1.4s,  v1.4s
-
-    shrn        v0.4h,  v0.4s,  #2
-    shrn2       v0.8h,  v1.4s,  #2
-
-    sub         v0.8h,  v31.8h,  v0.8h
-    sshl        v0.8h,  v30.8h,  v0.8h
-    shrn        v0.8b,  v0.8h,  #1
-
-    fmov        x2,  d0
-    mov         w3,  #63
-    clz         x2,  x2
-    sub         w0,  w3,  w2
-    ret
-endfunc
-
-.macro coeff_level_run_start size
-    add         x6,  x1,  #23            // runlevel->mask
-    mov         w7,  #0
-    mov         w8,  #0
-    mov         w9,  #1
-    and         x6,  x6,  #~15
-    mov         w4,  #\size - 1
-.endm
-
-.macro coeff_level_run shift
-    clz         x3,  x2
-    subs        w4,  w4,  w3, lsr #\shift
-    str         w4,  [x1], #4
-1:
-    ldrh        w5,  [x0, x4, lsl #1]
-    strh        w5,  [x6], #2
-    add         w7,  w7,  #1
-    lsl         w10, w9, w4
-    orr         w8,  w8,  w10
-    b.le        2f
-    add         w3,  w3,  #1 << \shift
-    sub         w4,  w4,  #1
-    and         x3,  x3,  #~((1 << \shift) - 1)
-    lsl         x2,  x2,  x3
-    clz         x3,  x2
-    subs        w4,  w4,  w3, lsr #\shift
-    b.ge        1b
-2:
-    str         w8,  [x1]
-    mov         w0,  w7
-.endm
-
-function x264_coeff_level_run4_aarch64, export=1
-    ldr         x2,  [x0]
-
-    coeff_level_run_start 4
-
-    coeff_level_run 4
-
-    ret
-endfunc
-
-.macro X264_COEFF_LEVEL_RUN size
-function x264_coeff_level_run\size\()_neon, export=1
-.if \size == 15
-    sub         x0,  x0,  #2
-.endif
-.if         \size < 15
-    ld1         {v0.8h}, [x0]
-    uqxtn       v0.8b,  v0.8h
-    cmtst       v0.8b,  v0.8b,  v0.8b
-.else
-    ld1         {v0.8h,v1.8h}, [x0]
-    uqxtn       v0.8b,  v0.8h
-    uqxtn2      v0.16b, v1.8h
-    cmtst       v0.16b, v0.16b, v0.16b
-    shrn        v0.8b,  v0.8h,  #4
-.endif
-    fmov        x2,  d0
-.if \size == 15
-    add         x0,  x0,  #2
-.endif
-
-    coeff_level_run_start \size
-
-    coeff_level_run (4 - (\size + 1) / 8)
-
-    ret
-endfunc
-.endm
-
-X264_COEFF_LEVEL_RUN 8
-X264_COEFF_LEVEL_RUN 15
-X264_COEFF_LEVEL_RUN 16
-
-function x264_denoise_dct_neon, export=1
-1:  subs        w3,  w3,  #16
-    ld1         {v0.8h,v1.8h}, [x0]
-    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
-    abs         v16.8h,  v0.8h
-    abs         v17.8h,  v1.8h
-    ld1         {v2.8h,v3.8h}, [x2], #32
-    cmlt        v18.8h,  v0.8h,   #0
-    cmlt        v19.8h,  v1.8h,   #0
-    uaddw       v4.4s,   v4.4s,   v16.4h
-    uaddw2      v5.4s,   v5.4s,   v16.8h
-    uqsub       v20.8h,  v16.8h,  v2.8h
-    uqsub       v21.8h,  v17.8h,  v3.8h
-    uaddw       v6.4s,   v6.4s,   v17.4h
-    uaddw2      v7.4s,   v7.4s,   v17.8h
-    neg         v22.8h,  v20.8h
-    neg         v23.8h,  v21.8h
-    bsl         v18.16b, v22.16b, v20.16b
-    bsl         v19.16b, v23.16b, v21.16b
-    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
-    st1         {v18.8h,v19.8h}, [x0], #32
-    b.gt        1b
-    ret
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/aarch64/quant.h b/android/src/main/libenc/jni/libx264/common/aarch64/quant.h
deleted file mode 100755
index b38b686..0000000
--- a/android/src/main/libenc/jni/libx264/common/aarch64/quant.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*****************************************************************************
- * quant.h: arm quantization and level-run
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_AARCH64_QUANT_H
-#define X264_AARCH64_QUANT_H
-
-int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
-
-int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
-int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
-int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
-
-void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-
-int x264_decimate_score15_neon( int16_t * );
-int x264_decimate_score16_neon( int16_t * );
-int x264_decimate_score64_neon( int16_t * );
-
-int x264_coeff_last4_aarch64( int16_t * );
-int x264_coeff_last8_aarch64( int16_t * );
-int x264_coeff_last15_neon( int16_t * );
-int x264_coeff_last16_neon( int16_t * );
-int x264_coeff_last64_neon( int16_t * );
-
-int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
-int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
-int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
-int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
-
-void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/arm/asm.S b/android/src/main/libenc/jni/libx264/common/arm/asm.S
deleted file mode 100755
index 3287e18..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/asm.S
+++ /dev/null
@@ -1,243 +0,0 @@
-/*****************************************************************************
- * asm.S: arm utility macros
- *****************************************************************************
- * Copyright (C) 2008-2016 x264 project
- *
- * Authors: Mans Rullgard <mans@mansr.com>
- *          David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "config.h"
-
-.syntax unified
-
-#if   HAVE_NEON
-        .arch           armv7-a
-#elif HAVE_ARMV6T2
-        .arch           armv6t2
-#elif HAVE_ARMV6
-        .arch           armv6
-#endif
-
-.fpu neon
-
-#ifdef PREFIX
-#   define EXTERN_ASM _
-#else
-#   define EXTERN_ASM
-#endif
-
-#ifdef __ELF__
-#   define ELF
-#else
-#   define ELF @
-#endif
-
-#if HAVE_AS_FUNC
-#   define FUNC
-#else
-#   define FUNC @
-#endif
-
-#if SYS_LINUX
-#define HAVE_SECTION_DATA_REL_RO 1
-#else
-#define HAVE_SECTION_DATA_REL_RO 0
-#endif
-
-.macro require8, val=1
-ELF     .eabi_attribute 24, \val
-.endm
-
-.macro preserve8, val=1
-ELF     .eabi_attribute 25, \val
-.endm
-
-.macro function name, export=1
-    .macro endfunc
-ELF     .size   \name, . - \name
-FUNC    .endfunc
-        .purgem endfunc
-    .endm
-        .align  2
-.if \export == 1
-        .global EXTERN_ASM\name
-ELF     .hidden EXTERN_ASM\name
-ELF     .type   EXTERN_ASM\name, %function
-FUNC    .func   EXTERN_ASM\name
-EXTERN_ASM\name:
-.else
-ELF     .hidden \name
-ELF     .type   \name, %function
-FUNC    .func   \name
-\name:
-.endif
-.endm
-
-.macro const name, align=2, relocate=0
-    .macro endconst
-ELF     .size   \name, . - \name
-        .purgem endconst
-    .endm
-.if HAVE_SECTION_DATA_REL_RO && \relocate
-        .section        .data.rel.ro
-.else
-        .section        .rodata
-.endif
-        .align          \align
-\name:
-.endm
-
-.macro movrel rd, val
-#if defined(PIC)
-        ldr             \rd,  1f
-        b               2f
-1:
-@ FIXME: thumb
-        .word           \val - (2f + 8)
-2:
-        add             \rd,  \rd,  pc
-#elif HAVE_ARMV6T2
-        movw            \rd, #:lower16:\val
-        movt            \rd, #:upper16:\val
-#else
-        ldr             \rd, =\val
-#endif
-.endm
-
-.macro movrelx rd, val, got
-#if defined(PIC) && defined(__ELF__)
-        ldr             \got, 2f
-        ldr             \rd,  1f
-        b               3f
-1:
-@ FIXME: thumb
-        .word \val(GOT)
-2:
-        .word _GLOBAL_OFFSET_TABLE_ - (3f + 8)
-3:
-        add             \got, \got, pc
-        ldr             \rd, [\got, \rd]
-#elif defined(PIC) && defined(__APPLE__)
-        ldr             \rd,  1f
-        b               2f
-1:
-@ FIXME: thumb
-        .word           3f - (2f + 8)
-2:
-        ldr             \rd, [pc, \rd]
-        .non_lazy_symbol_pointer
-3:
-        .indirect_symbol \val
-        .word           0
-        .text
-#else
-        movrel          \rd, \val
-#endif
-.endm
-
-.macro movconst rd, val
-#if HAVE_ARMV6T2
-    movw        \rd, #:lower16:\val
-.if \val >> 16
-    movt        \rd, #:upper16:\val
-.endif
-#else
-    ldr         \rd, =\val
-#endif
-.endm
-
-#define GLUE(a, b) a ## b
-#define JOIN(a, b) GLUE(a, b)
-#define X(s) JOIN(EXTERN_ASM, s)
-
-#define FENC_STRIDE 16
-#define FDEC_STRIDE 32
-
-.macro HORIZ_ADD dest, a, b
-.ifnb \b
-    vadd.u16    \a, \a, \b
-.endif
-    vpaddl.u16  \a, \a
-    vpaddl.u32  \dest, \a
-.endm
-
-.macro SUMSUB_AB sum, diff, a, b
-    vadd.s16    \sum,  \a, \b
-    vsub.s16    \diff, \a, \b
-.endm
-
-.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
-    SUMSUB_AB   \s1, \d1, \a, \b
-    SUMSUB_AB   \s2, \d2, \c, \d
-.endm
-
-.macro ABS2 a b
-    vabs.s16 \a, \a
-    vabs.s16 \b, \b
-.endm
-
-// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
-// op = sumsub/amax (sum and diff / maximum of absolutes)
-// d1/2 = destination registers
-// s1/2 = source registers
-.macro HADAMARD dist, op, d1, d2, s1, s2
-.if \dist == 1
-    vtrn.16     \s1, \s2
-.else
-    vtrn.32     \s1, \s2
-.endif
-.ifc \op, sumsub
-    SUMSUB_AB   \d1, \d2, \s1, \s2
-.else
-    vabs.s16    \s1, \s1
-    vabs.s16    \s2, \s2
-    vmax.s16    \d1, \s1, \s2
-.endif
-.endm
-
-.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
-    vtrn.32         \r0, \r4
-    vtrn.32         \r1, \r5
-    vtrn.32         \r2, \r6
-    vtrn.32         \r3, \r7
-    vtrn.16         \r0, \r2
-    vtrn.16         \r1, \r3
-    vtrn.16         \r4, \r6
-    vtrn.16         \r5, \r7
-    vtrn.8          \r0, \r1
-    vtrn.8          \r2, \r3
-    vtrn.8          \r4, \r5
-    vtrn.8          \r6, \r7
-.endm
-
-.macro TRANSPOSE4x4 r0 r1 r2 r3
-    vtrn.16         \r0, \r2
-    vtrn.16         \r1, \r3
-    vtrn.8          \r0, \r1
-    vtrn.8          \r2, \r3
-.endm
-
-.macro TRANSPOSE4x4_16  d0 d1 d2 d3
-    vtrn.32     \d0, \d2
-    vtrn.32     \d1, \d3
-    vtrn.16     \d0, \d1
-    vtrn.16     \d2, \d3
-.endm
diff --git a/android/src/main/libenc/jni/libx264/common/arm/bitstream-a.S b/android/src/main/libenc/jni/libx264/common/arm/bitstream-a.S
deleted file mode 100755
index 676d531..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/bitstream-a.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/*****************************************************************************
- * bitstream-a.S: arm bitstream functions
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-function x264_nal_escape_neon
-    push        {r4-r5,lr}
-    vmov.u8     q0,  #0xff
-    vmov.u8     q8,  #4
-    mov         r3,  #3
-    subs        lr,  r1,  r2
-    beq         99f
-0:
-    cmn         lr,  #15
-    blt         16f
-    mov         r1,  r2
-    b           100f
-16:
-    vld1.8      {q1}, [r1]!
-    vext.8      q2,  q0,  q1, #14
-    vext.8      q3,  q0,  q1, #15
-    vcgt.u8     q11, q8,  q1
-    vceq.u8     q9,  q2,  #0
-    vceq.u8     q10, q3,  #0
-    vand        q9,  q9,  q11
-    vand        q9,  q9,  q10
-    vshrn.u16   d22, q9,  #4
-    vmov        ip,  lr,  d22
-    orrs        ip,  ip,  lr
-    beq         16f
-    mov         lr,  #-16
-100:
-    vmov.u8     r5,  d1[6]
-    vmov.u8     r4,  d1[7]
-    orr         r5,  r4,  r5, lsl #8
-101:
-    ldrb        r4,  [r1, lr]
-    orr         ip,  r4,  r5, lsl #16
-    cmp         ip,  #3
-    bhi         102f
-    strb        r3,  [r0], #1
-    orr         r5,  r3,  r5, lsl #8
-102:
-    adds        lr,  lr,  #1
-    strb        r4,  [r0], #1
-    orr         r5,  r4,  r5, lsl #8
-    blt         101b
-    subs        lr,  r1,  r2
-    lsr         ip,  r5,  #8
-    vmov.u8     d1[6],  ip
-    vmov.u8     d1[7],  r5
-    blt         0b
-
-    pop         {r4-r5,pc}
-16:
-    subs        lr,  r1,  r2
-    vst1.8      {q1}, [r0]!
-    vmov        q0, q1
-    blt         0b
-99:
-    pop         {r4-r5,pc}
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/cpu-a.S b/android/src/main/libenc/jni/libx264/common/arm/cpu-a.S
deleted file mode 100755
index 075b0a9..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/cpu-a.S
+++ /dev/null
@@ -1,108 +0,0 @@
-/*****************************************************************************
- * cpu-a.S: arm cpu detection
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.align 2
-
-// done in gas because .fpu neon overrides the refusal to assemble
-// instructions the selected -march/-mcpu doesn't support
-function x264_cpu_neon_test
-    vadd.i16    q0, q0, q0
-    bx          lr
-endfunc
-
-// return: 0 on success
-//         1 if counters were already enabled
-//         9 if lo-res counters were already enabled
-function x264_cpu_enable_armv7_counter, export=0
-    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
-    ands        r0, r2, #1
-    andne       r0, r2, #9
-
-    orr         r2, r2, #1                  // enable counters
-    bic         r2, r2, #8                  // full resolution
-    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC
-    mov         r2, #1 << 31                // enable cycle counter
-    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
-    bx          lr
-endfunc
-
-function x264_cpu_disable_armv7_counter, export=0
-    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
-    bic         r0, r0, #1                  // disable counters
-    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
-    bx          lr
-endfunc
-
-
-.macro READ_TIME r
-    mrc         p15, 0, \r, c9, c13, 0
-.endm
-
-// return: 0 if transfers neon -> arm transfers take more than 10 cycles
-//         nonzero otherwise
-function x264_cpu_fast_neon_mrc_test
-    // check for user access to performance counters
-    mrc         p15, 0, r0, c9, c14, 0
-    cmp         r0, #0
-    bxeq        lr
-
-    push        {r4-r6,lr}
-    bl          x264_cpu_enable_armv7_counter
-    ands        r1, r0, #8
-    mov         r3, #0
-    mov         ip, #4
-    mov         r6, #4
-    moveq       r5, #1
-    movne       r5, #64
-
-average_loop:
-    mov         r4, r5
-    READ_TIME   r1
-1:  subs        r4, r4, #1
-.rept 8
-    vmov.u32    lr, d0[0]
-    add         lr, lr, lr
-.endr
-    bgt         1b
-    READ_TIME   r2
-
-    subs        r6, r6, #1
-    sub         r2, r2, r1
-    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
-    addle       r3, r3, r2
-    subsle      ip, ip, #1
-    bgt         average_loop
-
-    // disable counters if we enabled them
-    ands        r0, r0, #1
-    bleq        x264_cpu_disable_armv7_counter
-
-    lsr         r0, r3, #5
-    cmp         r0, #10
-    movgt       r0, #0
-    pop         {r4-r6,pc}
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/dct-a.S b/android/src/main/libenc/jni/libx264/common/arm/dct-a.S
deleted file mode 100755
index a1984a5..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/dct-a.S
+++ /dev/null
@@ -1,766 +0,0 @@
-/****************************************************************************
- * dct-a.S: arm transform and zigzag
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Martin Storsjo <martin@martin.st>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-.align 4
-
-scan4x4_frame:
-.byte    0,1,   8,9,   2,3,   4,5
-.byte    2,3,   8,9,  16,17, 10,11
-.byte   12,13,  6,7,  14,15, 20,21
-.byte   10,11, 12,13,  6,7,  14,15
-
-.text
-
-// sum = a + (b>>shift)   sub = (a>>shift) - b
-.macro SUMSUB_SHR shift sum sub a b t0 t1
-    vshr.s16    \t0,  \b, #\shift
-    vshr.s16    \t1,  \a, #\shift
-    vadd.s16    \sum, \a, \t0
-    vsub.s16    \sub, \t1, \b
-.endm
-
-// sum = (a>>shift) + b   sub = a - (b>>shift)
-.macro SUMSUB_SHR2 shift sum sub a b t0 t1
-    vshr.s16    \t0,  \a, #\shift
-    vshr.s16    \t1,  \b, #\shift
-    vadd.s16    \sum, \t0, \b
-    vsub.s16    \sub, \a, \t1
-.endm
-
-// a += 1.5*ma   b -= 1.5*mb
-.macro SUMSUB_15 a b ma mb t0 t1
-    vshr.s16    \t0, \ma, #1
-    vshr.s16    \t1, \mb, #1
-    vadd.s16    \t0, \t0, \ma
-    vadd.s16    \t1, \t1, \mb
-    vadd.s16    \a,  \a,  \t0
-    vsub.s16    \b,  \b,  \t1
-.endm
-
-
-function x264_dct4x4dc_neon
-    vld1.64         {d0-d3}, [r0,:128]
-    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
-    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7
-
-    vmov.s16        d31, #1
-    HADAMARD        1, sumsub, q2, q3, q0, q1
-    vtrn.32         d4,  d5
-    vadd.s16        d16, d4,  d31
-    vtrn.32         d6,  d7
-    vadd.s16        d17, d6,  d31
-    vrhadd.s16      d0,  d4,  d5
-    vhsub.s16       d1,  d16, d5
-    vhsub.s16       d2,  d17, d7
-    vrhadd.s16      d3,  d6,  d7
-    vst1.64         {d0-d3}, [r0,:128]
-    bx              lr
-endfunc
-
-function x264_idct4x4dc_neon
-    vld1.64         {d0-d3}, [r0,:128]
-    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
-    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7
-
-    HADAMARD        1, sumsub, q2, q3, q0, q1
-    HADAMARD        2, sumsub, d0, d1, d4, d5
-    HADAMARD        2, sumsub, d3, d2, d6, d7
-    vst1.64         {d0-d3}, [r0,:128]
-    bx              lr
-endfunc
-
-
-.macro DCT_1D d0 d1 d2 d3  d4 d5 d6 d7
-    SUMSUB_AB       \d1, \d6, \d5, \d6
-    SUMSUB_AB       \d3, \d7, \d4, \d7
-    vadd.s16        \d0, \d3, \d1
-    vadd.s16        \d4, \d7, \d7
-    vadd.s16        \d5, \d6, \d6
-    vsub.s16        \d2, \d3, \d1
-    vadd.s16        \d1, \d4, \d6
-    vsub.s16        \d3, \d7, \d5
-.endm
-
-function x264_sub4x4_dct_neon
-    mov             r3, #FENC_STRIDE
-    mov             ip, #FDEC_STRIDE
-    vld1.32         {d0[]}, [r1,:32], r3
-    vld1.32         {d1[]}, [r2,:32], ip
-    vld1.32         {d2[]}, [r1,:32], r3
-    vsubl.u8        q8,  d0,  d1
-    vld1.32         {d3[]}, [r2,:32], ip
-    vld1.32         {d4[]}, [r1,:32], r3
-    vsubl.u8        q9,  d2,  d3
-    vld1.32         {d5[]}, [r2,:32], ip
-    vld1.32         {d6[]}, [r1,:32], r3
-    vsubl.u8        q10, d4,  d5
-    vld1.32         {d7[]}, [r2,:32], ip
-    vsubl.u8        q11, d6,  d7
-
-    DCT_1D          d0, d1, d2, d3, d16, d18, d20, d22
-    TRANSPOSE4x4_16 d0, d1, d2, d3
-    DCT_1D          d4, d5, d6, d7, d0, d1, d2, d3
-    vst1.64         {d4-d7}, [r0,:128]
-    bx              lr
-endfunc
-
-function x264_sub8x4_dct_neon, export=0
-    vld1.64         {d0}, [r1,:64], r3
-    vld1.64         {d1}, [r2,:64], ip
-    vsubl.u8        q8,  d0,  d1
-    vld1.64         {d2}, [r1,:64], r3
-    vld1.64         {d3}, [r2,:64], ip
-    vsubl.u8        q9,  d2,  d3
-    vld1.64         {d4}, [r1,:64], r3
-    vld1.64         {d5}, [r2,:64], ip
-    vsubl.u8        q10, d4,  d5
-    vld1.64         {d6}, [r1,:64], r3
-    vld1.64         {d7}, [r2,:64], ip
-    vsubl.u8        q11, d6,  d7
-
-    DCT_1D          q0, q1, q2, q3,  q8, q9, q10, q11
-    TRANSPOSE4x4_16 q0, q1, q2, q3
-
-    SUMSUB_AB       q8,  q12, q0,  q3
-    SUMSUB_AB       q9,  q10, q1,  q2
-    vadd.i16        q13, q12, q12
-    vadd.i16        q11, q10, q10
-    vadd.i16        d0,  d16, d18
-    vadd.i16        d1,  d26, d20
-    vsub.i16        d2,  d16, d18
-    vsub.i16        d3,  d24, d22
-    vst1.64         {d0-d1}, [r0,:128]!
-    vadd.i16        d4,  d17, d19
-    vadd.i16        d5,  d27, d21
-    vst1.64         {d2-d3}, [r0,:128]!
-    vsub.i16        d6,  d17, d19
-    vsub.i16        d7,  d25, d23
-    vst1.64         {d4-d5}, [r0,:128]!
-    vst1.64         {d6-d7}, [r0,:128]!
-    bx              lr
-endfunc
-
-function x264_sub8x8_dct_neon
-    push            {lr}
-    mov             r3, #FENC_STRIDE
-    mov             ip, #FDEC_STRIDE
-    bl              x264_sub8x4_dct_neon
-    pop             {lr}
-    b               x264_sub8x4_dct_neon
-endfunc
-
-function x264_sub16x16_dct_neon
-    push            {lr}
-    mov             r3, #FENC_STRIDE
-    mov             ip, #FDEC_STRIDE
-    bl              x264_sub8x4_dct_neon
-    bl              x264_sub8x4_dct_neon
-    sub             r1, r1, #8*FENC_STRIDE-8
-    sub             r2, r2, #8*FDEC_STRIDE-8
-    bl              x264_sub8x4_dct_neon
-    bl              x264_sub8x4_dct_neon
-    sub             r1, r1, #8
-    sub             r2, r2, #8
-    bl              x264_sub8x4_dct_neon
-    bl              x264_sub8x4_dct_neon
-    sub             r1, r1, #8*FENC_STRIDE-8
-    sub             r2, r2, #8*FDEC_STRIDE-8
-    bl              x264_sub8x4_dct_neon
-    pop             {lr}
-    b               x264_sub8x4_dct_neon
-endfunc
-
-
-.macro DCT8_1D type
-    SUMSUB_AB       q2,  q1,  q11, q12  // s34/d34
-    SUMSUB_AB       q3,  q11, q10, q13  // s25/d25
-    SUMSUB_AB       q13, q10, q9,  q14  // s16/d16
-    SUMSUB_AB       q14, q8,  q8,  q15  // s07/d07
-
-    SUMSUB_AB       q9,  q2,  q14, q2   // a0/a2
-    SUMSUB_AB       q12, q14, q13, q3   // a1/a3
-
-    SUMSUB_AB       q3,  q13, q8,  q1   // a6/a5
-    vshr.s16        q0,  q10, #1
-    vshr.s16        q15, q11, #1
-    vadd.s16        q0,  q0,  q10
-    vadd.s16        q15, q15, q11
-    vsub.s16        q3,  q3,  q0
-    vsub.s16        q13, q13, q15
-
-    SUMSUB_AB       q0,  q15, q10, q11  // a4/a7
-    vshr.s16        q10, q8,  #1
-    vshr.s16        q11, q1,  #1
-    vadd.s16        q10, q10, q8
-    vadd.s16        q11, q11, q1
-    vadd.s16        q10, q0,  q10
-    vadd.s16        q15, q15, q11
-
-    SUMSUB_AB       q8,  q12, q9,  q12
-    SUMSUB_SHR   2, q9,  q15, q10, q15,  q0, q1
-    SUMSUB_SHR   1, q10, q14, q2,  q14,  q0, q1
-    SUMSUB_SHR2  2, q11, q13, q3,  q13,  q0, q1
-.endm
-
-function x264_sub8x8_dct8_neon
-    mov             r3, #FENC_STRIDE
-    mov             ip, #FDEC_STRIDE
-    vld1.64         {d16}, [r1,:64], r3
-    vld1.64         {d17}, [r2,:64], ip
-    vsubl.u8        q8,  d16, d17
-    vld1.64         {d18}, [r1,:64], r3
-    vld1.64         {d19}, [r2,:64], ip
-    vsubl.u8        q9,  d18, d19
-    vld1.64         {d20}, [r1,:64], r3
-    vld1.64         {d21}, [r2,:64], ip
-    vsubl.u8        q10, d20, d21
-    vld1.64         {d22}, [r1,:64], r3
-    vld1.64         {d23}, [r2,:64], ip
-    vsubl.u8        q11, d22, d23
-    vld1.64         {d24}, [r1,:64], r3
-    vld1.64         {d25}, [r2,:64], ip
-    vsubl.u8        q12, d24, d25
-    vld1.64         {d26}, [r1,:64], r3
-    vld1.64         {d27}, [r2,:64], ip
-    vsubl.u8        q13, d26, d27
-    vld1.64         {d28}, [r1,:64], r3
-    vld1.64         {d29}, [r2,:64], ip
-    vsubl.u8        q14, d28, d29
-    vld1.64         {d30}, [r1,:64], r3
-    vld1.64         {d31}, [r2,:64], ip
-    vsubl.u8        q15, d30, d31
-
-    DCT8_1D row
-    vswp            d17, d24    // 8, 12
-    vswp            d21, d28    // 10,14
-    vtrn.32         q8,  q10
-    vtrn.32         q12, q14
-
-    vswp            d19, d26    // 9, 13
-    vswp            d23, d30    // 11,15
-    vtrn.32         q9,  q11
-    vtrn.32         q13, q15
-
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q8,  q9
-    vtrn.16         q14, q15
-    DCT8_1D col
-
-    vst1.64         {d16-d19}, [r0,:128]!
-    vst1.64         {d20-d23}, [r0,:128]!
-    vst1.64         {d24-d27}, [r0,:128]!
-    vst1.64         {d28-d31}, [r0,:128]!
-    bx              lr
-endfunc
-
-function x264_sub16x16_dct8_neon
-    push            {lr}
-    bl              X(x264_sub8x8_dct8_neon)
-    sub             r1,  r1,  #FENC_STRIDE*8 - 8
-    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
-    bl              X(x264_sub8x8_dct8_neon)
-    sub             r1,  r1,  #8
-    sub             r2,  r2,  #8
-    bl              X(x264_sub8x8_dct8_neon)
-    pop             {lr}
-    sub             r1,  r1,  #FENC_STRIDE*8 - 8
-    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
-    b               X(x264_sub8x8_dct8_neon)
-endfunc
-
-
-// First part of IDCT (minus final SUMSUB_BA)
-.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
-    SUMSUB_AB       \d4, \d5, \d0, \d2
-    vshr.s16        \d7, \d1, #1
-    vshr.s16        \d6, \d3, #1
-    vsub.s16        \d7, \d7, \d3
-    vadd.s16        \d6, \d6, \d1
-.endm
-
-function x264_add4x4_idct_neon
-    mov             r2, #FDEC_STRIDE
-    vld1.64         {d0-d3}, [r1,:128]
-
-    IDCT_1D         d4, d5, d6, d7, d0, d1, d2, d3
-    vld1.32         {d30[0]}, [r0,:32], r2
-    SUMSUB_AB       q0, q1, q2, q3
-
-    TRANSPOSE4x4_16 d0, d1, d3, d2
-
-    IDCT_1D         d4, d5, d6, d7, d0, d1, d3, d2
-    vld1.32         {d30[1]}, [r0,:32], r2
-    SUMSUB_AB       q0, q1, q2, q3
-
-    vrshr.s16       q0, q0, #6
-    vld1.32         {d31[1]}, [r0,:32], r2
-    vrshr.s16       q1, q1, #6
-    vld1.32         {d31[0]}, [r0,:32], r2
-
-    sub             r0, r0, r2, lsl #2
-    vaddw.u8        q0, q0, d30
-    vaddw.u8        q1, q1, d31
-    vqmovun.s16     d0, q0
-    vqmovun.s16     d2, q1
-
-    vst1.32         {d0[0]}, [r0,:32], r2
-    vst1.32         {d0[1]}, [r0,:32], r2
-    vst1.32         {d2[1]}, [r0,:32], r2
-    vst1.32         {d2[0]}, [r0,:32], r2
-    bx              lr
-endfunc
-
-function x264_add8x4_idct_neon, export=0
-    vld1.64         {d0-d3}, [r1,:128]!
-    IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3
-    vld1.64         {d4-d7}, [r1,:128]!
-    IDCT_1D         d17, d19, d21, d23, d4, d5, d6, d7
-    SUMSUB_AB       q0,  q3,  q8,  q10
-    SUMSUB_AB       q1,  q2,  q9,  q11
-
-    TRANSPOSE4x4_16 q0,  q1,  q2,  q3
-
-    IDCT_1D         q8,  q9,  q10, q11, q0, q1, q2, q3
-    SUMSUB_AB       q0,  q3,  q8,  q10
-    SUMSUB_AB       q1,  q2,  q9,  q11
-
-    vrshr.s16       q0,  q0,  #6
-    vld1.32         {d28}, [r0,:64], r2
-    vrshr.s16       q1,  q1,  #6
-    vld1.32         {d29}, [r0,:64], r2
-    vrshr.s16       q2,  q2,  #6
-    vld1.32         {d30}, [r0,:64], r2
-    vrshr.s16       q3,  q3,  #6
-    vld1.32         {d31}, [r0,:64], r2
-
-    sub             r0,  r0,  r2,  lsl #2
-    vaddw.u8        q0,  q0,  d28
-    vaddw.u8        q1,  q1,  d29
-    vaddw.u8        q2,  q2,  d30
-    vaddw.u8        q3,  q3,  d31
-
-    vqmovun.s16     d0,  q0
-    vqmovun.s16     d1,  q1
-    vst1.32         {d0}, [r0,:64], r2
-    vqmovun.s16     d2,  q2
-    vst1.32         {d1}, [r0,:64], r2
-    vqmovun.s16     d3,  q3
-    vst1.32         {d2}, [r0,:64], r2
-    vst1.32         {d3}, [r0,:64], r2
-    bx              lr
-endfunc
-
-function x264_add8x8_idct_neon
-    mov             r2, #FDEC_STRIDE
-    mov             ip, lr
-    bl              x264_add8x4_idct_neon
-    mov             lr, ip
-    b               x264_add8x4_idct_neon
-endfunc
-
-function x264_add16x16_idct_neon
-    mov             r2, #FDEC_STRIDE
-    mov             ip, lr
-    bl              x264_add8x4_idct_neon
-    bl              x264_add8x4_idct_neon
-    sub             r0, r0, #8*FDEC_STRIDE-8
-    bl              x264_add8x4_idct_neon
-    bl              x264_add8x4_idct_neon
-    sub             r0, r0, #8
-    bl              x264_add8x4_idct_neon
-    bl              x264_add8x4_idct_neon
-    sub             r0, r0, #8*FDEC_STRIDE-8
-    bl              x264_add8x4_idct_neon
-    mov             lr, ip
-    b               x264_add8x4_idct_neon
-endfunc
-
-
-.macro IDCT8_1D type
-.ifc \type, col
-    vswp            d21, d28
-.endif
-    SUMSUB_AB       q0,  q1,  q8,  q12              // a0/a2
-.ifc \type, row
-    vld1.64         {d28-d31}, [r1,:128]!
-.else
-    vswp            d19, d26
-.endif
-    SUMSUB_SHR   1, q2,  q3,  q10, q14,  q8, q12    // a6/a4
-.ifc \type, col
-    vswp            d23, d30
-.endif
-    SUMSUB_AB       q8,  q10, q13, q11
-    SUMSUB_15       q8,  q10, q9,  q15,  q12, q14   // a7/a1
-    SUMSUB_AB       q14, q15, q15, q9
-    SUMSUB_15       q15, q14, q13, q11,  q12, q9    // a5/a3
-
-    SUMSUB_SHR   2, q13, q14, q14, q15,  q11, q9    // b3/b5
-    SUMSUB_SHR2  2, q12, q15, q8,  q10,  q11, q9    // b1/b7
-
-    SUMSUB_AB       q10, q2,  q0,  q2               // b0/b6
-    SUMSUB_AB       q11, q3,  q1,  q3               // b2/b4
-
-    SUMSUB_AB       q8,  q15, q10, q15
-    SUMSUB_AB       q9,  q14, q11, q14
-    SUMSUB_AB       q10, q13, q3,  q13
-.ifc \type, row
-    vtrn.16         q8,  q9
-.endif
-    SUMSUB_AB       q11, q12, q2,  q12
-.endm
-
-function x264_add8x8_idct8_neon
-    mov             r2,  #FDEC_STRIDE
-    vld1.64         {d16-d19}, [r1,:128]!
-    vld1.64         {d20-d23}, [r1,:128]!
-    vld1.64         {d24-d27}, [r1,:128]!
-
-    IDCT8_1D row
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vswp            d17, d24
-    IDCT8_1D col
-
-    vld1.64         {d0}, [r0,:64], r2
-    vrshr.s16       q8,  q8,  #6
-    vld1.64         {d1}, [r0,:64], r2
-    vrshr.s16       q9,  q9,  #6
-    vld1.64         {d2}, [r0,:64], r2
-    vrshr.s16       q10, q10, #6
-    vld1.64         {d3}, [r0,:64], r2
-    vrshr.s16       q11, q11, #6
-    vld1.64         {d4}, [r0,:64], r2
-    vrshr.s16       q12, q12, #6
-    vld1.64         {d5}, [r0,:64], r2
-    vrshr.s16       q13, q13, #6
-    vld1.64         {d6}, [r0,:64], r2
-    vrshr.s16       q14, q14, #6
-    vld1.64         {d7}, [r0,:64], r2
-    vrshr.s16       q15, q15, #6
-    sub             r0,  r0,  r2,  lsl #3
-
-    vaddw.u8        q8,  q8,  d0
-    vaddw.u8        q9,  q9,  d1
-    vaddw.u8        q10, q10, d2
-    vqmovun.s16     d0,  q8
-    vqmovun.s16     d1,  q9
-    vqmovun.s16     d2,  q10
-    vaddw.u8        q11, q11, d3
-    vst1.64         {d0}, [r0,:64], r2
-    vaddw.u8        q12, q12, d4
-    vst1.64         {d1}, [r0,:64], r2
-    vaddw.u8        q13, q13, d5
-    vst1.64         {d2}, [r0,:64], r2
-    vqmovun.s16     d3,  q11
-    vqmovun.s16     d4,  q12
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-    vst1.64         {d3}, [r0,:64], r2
-    vqmovun.s16     d5,  q13
-    vst1.64         {d4}, [r0,:64], r2
-    vqmovun.s16     d6,  q14
-    vqmovun.s16     d7,  q15
-    vst1.64         {d5}, [r0,:64], r2
-    vst1.64         {d6}, [r0,:64], r2
-    vst1.64         {d7}, [r0,:64], r2
-    bx              lr
-endfunc
-
-function x264_add16x16_idct8_neon
-    mov             ip,  lr
-    bl              X(x264_add8x8_idct8_neon)
-    sub             r0,  r0,  #8*FDEC_STRIDE-8
-    bl              X(x264_add8x8_idct8_neon)
-    sub             r0,  r0,  #8
-    bl              X(x264_add8x8_idct8_neon)
-    sub             r0,  r0,  #8*FDEC_STRIDE-8
-    mov             lr,  ip
-    b               X(x264_add8x8_idct8_neon)
-endfunc
-
-
-function x264_add8x8_idct_dc_neon
-    mov             r2,  #FDEC_STRIDE
-    vld1.64         {d16}, [r1,:64]
-    vrshr.s16       d16, d16, #6
-    vld1.64         {d0}, [r0,:64], r2
-    vmov.i16        q15, #0
-    vld1.64         {d1}, [r0,:64], r2
-    vld1.64         {d2}, [r0,:64], r2
-    vdup.16         d20, d16[0]
-    vld1.64         {d3}, [r0,:64], r2
-    vdup.16         d21, d16[1]
-    vld1.64         {d4}, [r0,:64], r2
-    vdup.16         d22, d16[2]
-    vld1.64         {d5}, [r0,:64], r2
-    vdup.16         d23, d16[3]
-    vld1.64         {d6}, [r0,:64], r2
-    vsub.s16        q12, q15, q10
-    vld1.64         {d7}, [r0,:64], r2
-    vsub.s16        q13, q15, q11
-
-    sub             r0,  r0,  #8*FDEC_STRIDE
-
-    vqmovun.s16     d20, q10
-    vqmovun.s16     d22, q11
-    vqmovun.s16     d24, q12
-    vqmovun.s16     d26, q13
-
-    vmov            d21, d20
-    vqadd.u8        q0,  q0,  q10
-    vmov            d23, d22
-    vqadd.u8        q1,  q1,  q10
-    vmov            d25, d24
-    vqadd.u8        q2,  q2,  q11
-    vmov            d27, d26
-    vqadd.u8        q3,  q3,  q11
-    vqsub.u8        q0,  q0,  q12
-    vqsub.u8        q1,  q1,  q12
-    vqsub.u8        q2,  q2,  q13
-
-    vst1.64         {d0}, [r0,:64], r2
-    vqsub.u8        q3,  q3,  q13
-    vst1.64         {d1}, [r0,:64], r2
-    vst1.64         {d2}, [r0,:64], r2
-    vst1.64         {d3}, [r0,:64], r2
-    vst1.64         {d4}, [r0,:64], r2
-    vst1.64         {d5}, [r0,:64], r2
-    vst1.64         {d6}, [r0,:64], r2
-    vst1.64         {d7}, [r0,:64], r2
-    bx              lr
-endfunc
-
-.macro ADD16x4_IDCT_DC dc
-    vld1.64         {d16-d17}, [r0,:128], r3
-    vld1.64         {d18-d19}, [r0,:128], r3
-    vdup.16         d4,  \dc[0]
-    vdup.16         d5,  \dc[1]
-    vld1.64         {d20-d21}, [r0,:128], r3
-    vdup.16         d6,  \dc[2]
-    vdup.16         d7,  \dc[3]
-    vld1.64         {d22-d23}, [r0,:128], r3
-    vsub.s16        q12, q15, q2
-    vsub.s16        q13, q15, q3
-
-    vqmovun.s16     d4,  q2
-    vqmovun.s16     d5,  q3
-    vqmovun.s16     d6,  q12
-    vqmovun.s16     d7,  q13
-
-    vqadd.u8        q8,  q8,  q2
-    vqadd.u8        q9,  q9,  q2
-    vqadd.u8        q10, q10, q2
-    vqadd.u8        q11, q11, q2
-
-    vqsub.u8        q8,  q8,  q3
-    vqsub.u8        q9,  q9,  q3
-    vqsub.u8        q10, q10, q3
-    vst1.64         {d16-d17}, [r2,:128], r3
-    vqsub.u8        q11, q11, q3
-    vst1.64         {d18-d19}, [r2,:128], r3
-    vst1.64         {d20-d21}, [r2,:128], r3
-    vst1.64         {d22-d23}, [r2,:128], r3
-.endm
-
-function x264_add16x16_idct_dc_neon
-    mov             r2,  r0
-    mov             r3,  #FDEC_STRIDE
-    vmov.i16        q15, #0
-
-    vld1.64         {d0-d3}, [r1,:64]
-    vrshr.s16       q0, #6
-    vrshr.s16       q1, #6
-
-    ADD16x4_IDCT_DC d0
-    ADD16x4_IDCT_DC d1
-    ADD16x4_IDCT_DC d2
-    ADD16x4_IDCT_DC d3
-    bx              lr
-endfunc
-
-function x264_sub8x8_dct_dc_neon
-    mov             r3,  #FENC_STRIDE
-    mov             ip,  #FDEC_STRIDE
-    vld1.64         {d16}, [r1,:64], r3
-    vld1.64         {d17}, [r2,:64], ip
-    vsubl.u8        q8,  d16, d17
-    vld1.64         {d18}, [r1,:64], r3
-    vld1.64         {d19}, [r2,:64], ip
-    vsubl.u8        q9,  d18, d19
-    vld1.64         {d20}, [r1,:64], r3
-    vld1.64         {d21}, [r2,:64], ip
-    vsubl.u8        q10, d20, d21
-    vld1.64         {d22}, [r1,:64], r3
-    vadd.s16        q0,  q8,  q9
-    vld1.64         {d23}, [r2,:64], ip
-    vsubl.u8        q11, d22, d23
-    vld1.64         {d24}, [r1,:64], r3
-    vadd.s16        q0,  q0,  q10
-    vld1.64         {d25}, [r2,:64], ip
-    vsubl.u8        q12, d24, d25
-    vld1.64         {d26}, [r1,:64], r3
-    vadd.s16        q0,  q0,  q11
-    vld1.64         {d27}, [r2,:64], ip
-    vsubl.u8        q13, d26, d27
-    vld1.64         {d28}, [r1,:64], r3
-    vld1.64         {d29}, [r2,:64], ip
-    vsubl.u8        q14, d28, d29
-    vld1.64         {d30}, [r1,:64], r3
-    vadd.s16        q1,  q12, q13
-    vld1.64         {d31}, [r2,:64], ip
-    vsubl.u8        q15, d30, d31
-    vadd.s16        q1,  q1,  q14
-
-    vadd.s16        d4,  d0,  d1
-    vadd.s16        q1,  q1,  q15
-    vsub.s16        d5,  d0,  d1
-    vadd.s16        d6,  d2,  d3
-    vsub.s16        d7,  d2,  d3
-    vadd.s16        q0,  q2,  q3
-    vsub.s16        q1,  q2,  q3
-
-    vpadd.s16       d0,  d0,  d2
-    vpadd.s16       d1,  d1,  d3
-    vpadd.s16       d0,  d0,  d1
-    vst1.64         {d0}, [r0,:64]
-    bx              lr
-endfunc
-
-function x264_sub8x16_dct_dc_neon
-    mov             r3,  #FENC_STRIDE
-    mov             ip,  #FDEC_STRIDE
-    vld1.64         {d16}, [r1,:64], r3
-    vld1.64         {d17}, [r2,:64], ip
-    vsubl.u8        q8,  d16, d17
-    vld1.64         {d18}, [r1,:64], r3
-    vld1.64         {d19}, [r2,:64], ip
-    vsubl.u8        q9,  d18, d19
-    vld1.64         {d20}, [r1,:64], r3
-    vld1.64         {d21}, [r2,:64], ip
-    vsubl.u8        q10, d20, d21
-    vld1.64         {d22}, [r1,:64], r3
-    vadd.s16        q0,  q8,  q9
-    vld1.64         {d23}, [r2,:64], ip
-    vsubl.u8        q11, d22, d23
-    vld1.64         {d24}, [r1,:64], r3
-    vadd.s16        q0,  q0,  q10
-    vld1.64         {d25}, [r2,:64], ip
-    vsubl.u8        q12, d24, d25
-    vld1.64         {d26}, [r1,:64], r3
-    vadd.s16        q0,  q0,  q11
-    vld1.64         {d27}, [r2,:64], ip
-    vsubl.u8        q13, d26, d27
-    vld1.64         {d28}, [r1,:64], r3
-    vld1.64         {d29}, [r2,:64], ip
-    vsubl.u8        q14, d28, d29
-    vld1.64         {d30}, [r1,:64], r3
-    vadd.s16        q1,  q12, q13
-    vld1.64         {d31}, [r2,:64], ip
-    vsubl.u8        q15, d30, d31
-
-    vld1.64         {d16}, [r1,:64], r3
-    vadd.s16        q1,  q1,  q14
-    vld1.64         {d17}, [r2,:64], ip
-    vadd.s16        q1,  q1,  q15
-    vld1.64         {d18}, [r1,:64], r3
-    vsubl.u8        q8,  d16, d17
-    vld1.64         {d19}, [r2,:64], ip
-    vsubl.u8        q9,  d18, d19
-    vld1.64         {d20}, [r1,:64], r3
-    vld1.64         {d21}, [r2,:64], ip
-    vsubl.u8        q10, d20, d21
-    vld1.64         {d22}, [r1,:64], r3
-    vadd.s16        q2,  q8,  q9
-    vld1.64         {d23}, [r2,:64], ip
-    vsubl.u8        q11, d22, d23
-    vld1.64         {d24}, [r1,:64], r3
-    vadd.s16        q2,  q2,  q10
-    vld1.64         {d25}, [r2,:64], ip
-    vsubl.u8        q12, d24, d25
-    vld1.64         {d26}, [r1,:64], r3
-    vadd.s16        q2,  q2,  q11
-    vld1.64         {d27}, [r2,:64], ip
-    vsubl.u8        q13, d26, d27
-    vld1.64         {d28}, [r1,:64], r3
-    vld1.64         {d29}, [r2,:64], ip
-    vsubl.u8        q14, d28, d29
-    vld1.64         {d30}, [r1,:64], r3
-    vadd.s16        q3,  q12, q13
-    vld1.64         {d31}, [r2,:64], ip
-    vsubl.u8        q15, d30, d31
-    vadd.s16        q3,  q3,  q14
-
-    vadd.s16        d16, d0,  d1  @ b0
-    vadd.s16        q3,  q3,  q15
-    vsub.s16        d17, d0,  d1  @ b4
-    vadd.s16        d18, d2,  d3  @ b1
-    vsub.s16        d19, d2,  d3  @ b5
-    vadd.s16        d20, d4,  d5  @ b2
-    vsub.s16        d21, d4,  d5  @ b6
-    vadd.s16        d22, d6,  d7  @ b3
-    vsub.s16        d23, d6,  d7  @ b7
-    vadd.s16        q0,  q8,  q9  @ b0 + b1, b4 + b5; a0, a2
-    vsub.s16        q1,  q8,  q9  @ b0 - b1, b4 - b5; a4, a6
-    vadd.s16        q2,  q10, q11 @ b2 + b3, b6 + b7; a1, a3
-    vsub.s16        q3,  q10, q11 @ b2 - b3, b6 - b7; a5, a7
-
-    vadd.s16        q8,  q0,  q2  @ a0 + a1, a2 + a3
-    vsub.s16        q9,  q0,  q2  @ a0 - a1, a2 - a3
-    vsub.s16        q10, q1,  q3  @ a4 - a5, a6 - a7
-    vadd.s16        q11, q1,  q3  @ a4 + a5, a6 + a7
-
-    vpadd.s16       d0,  d16, d17
-    vpadd.s16       d1,  d18, d19
-    vpadd.s16       d2,  d20, d21
-    vpadd.s16       d3,  d22, d23
-    vpadd.s16       d0,  d0,  d1
-    vpadd.s16       d1,  d2,  d3
-    vst1.64         {q0}, [r0,:64]
-    bx              lr
-endfunc
-
-
-function x264_zigzag_scan_4x4_frame_neon
-    movrel      r2, scan4x4_frame
-    vld1.64     {d0-d3},   [r1,:128]
-    vld1.64     {d16-d19}, [r2,:128]
-    vtbl.8      d4, {d0-d1}, d16
-    vtbl.8      d5, {d1-d3}, d17
-    vtbl.8      d6, {d0-d2}, d18
-    vtbl.8      d7, {d2-d3}, d19
-    vst1.64     {d4-d7},   [r0,:128]
-    bx          lr
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/dct.h b/android/src/main/libenc/jni/libx264/common/arm/dct.h
deleted file mode 100755
index 95f1018..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/dct.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*****************************************************************************
- * dct.h: arm transform and zigzag
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ARM_DCT_H
-#define X264_ARM_DCT_H
-
-void x264_dct4x4dc_neon( int16_t d[16] );
-void x264_idct4x4dc_neon( int16_t d[16] );
-
-void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
-void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
-
-void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
-void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
-void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
-
-void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/arm/deblock-a.S b/android/src/main/libenc/jni/libx264/common/arm/deblock-a.S
deleted file mode 100755
index d781828..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/deblock-a.S
+++ /dev/null
@@ -1,793 +0,0 @@
-/*****************************************************************************
- * deblock.S: arm deblocking
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Mans Rullgard <mans@mansr.com>
- *          Martin Storsjo <martin@martin.st>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.macro h264_loop_filter_start
-    ldr             ip,  [sp]
-    ldr             ip,  [ip]
-    vdup.32         d24, ip
-    and             ip,  ip,  ip, lsl #16
-    ands            ip,  ip,  ip, lsl #8
-    bxlt            lr
-.endm
-
-.macro align_push_regs
-    and             ip,  sp,  #15
-    add             ip,  ip,  #32
-    sub             sp,  sp,  ip
-    vst1.64         {d12-d15}, [sp,:128]
-    sub             sp,  sp,  #32
-    vst1.64         {d8-d11},  [sp,:128]
-.endm
-
-.macro align_pop_regs
-    vld1.64         {d8-d11},  [sp,:128]!
-    vld1.64         {d12-d15}, [sp,:128], ip
-.endm
-
-.macro h264_loop_filter_luma
-    vdup.8          q11, r2         @ alpha
-    vmovl.u8        q12, d24
-    vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
-    vmovl.u16       q12, d24
-    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
-    vsli.16         q12, q12, #8
-    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
-    vsli.32         q12, q12, #16
-    vclt.u8         q6,  q6,  q11   @ < alpha
-    vdup.8          q11, r3         @ beta
-    vclt.s8         q7,  q12, #0
-    vclt.u8         q14, q14, q11   @ < beta
-    vclt.u8         q15, q15, q11   @ < beta
-    vbic            q6,  q6,  q7
-    vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
-    vand            q6,  q6,  q14
-    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
-    vclt.u8         q4,  q4,  q11   @ < beta
-    vand            q6,  q6,  q15
-    vclt.u8         q5,  q5,  q11   @ < beta
-    vand            q4,  q4,  q6
-    vand            q5,  q5,  q6
-    vand            q12, q12, q6
-    vrhadd.u8       q14, q8,  q0
-    vsub.i8         q6,  q12, q4
-    vqadd.u8        q7,  q9,  q12
-    vhadd.u8        q10, q10, q14
-    vsub.i8         q6,  q6,  q5
-    vhadd.u8        q14, q2,  q14
-    vmin.u8         q7,  q7,  q10
-    vqsub.u8        q11, q9,  q12
-    vqadd.u8        q2,  q1,  q12
-    vmax.u8         q7,  q7,  q11
-    vqsub.u8        q11, q1,  q12
-    vmin.u8         q14, q2,  q14
-    vmovl.u8        q2,  d0
-    vmax.u8         q14, q14, q11
-    vmovl.u8        q10, d1
-    vsubw.u8        q2,  q2,  d16
-    vsubw.u8        q10, q10, d17
-    vshl.i16        q2,  q2,  #2
-    vshl.i16        q10, q10, #2
-    vaddw.u8        q2,  q2,  d18
-    vaddw.u8        q10, q10, d19
-    vsubw.u8        q2,  q2,  d2
-    vsubw.u8        q10, q10, d3
-    vrshrn.i16      d4,  q2,  #3
-    vrshrn.i16      d5,  q10, #3
-    vbsl            q4,  q7,  q9
-    vbsl            q5,  q14, q1
-    vneg.s8         q7,  q6
-    vmovl.u8        q14, d16
-    vmin.s8         q2,  q2,  q6
-    vmovl.u8        q6,  d17
-    vmax.s8         q2,  q2,  q7
-    vmovl.u8        q11, d0
-    vmovl.u8        q12, d1
-    vaddw.s8        q14, q14, d4
-    vaddw.s8        q6,  q6,  d5
-    vsubw.s8        q11, q11, d4
-    vsubw.s8        q12, q12, d5
-    vqmovun.s16     d16, q14
-    vqmovun.s16     d17, q6
-    vqmovun.s16     d0,  q11
-    vqmovun.s16     d1,  q12
-.endm
-
-function x264_deblock_v_luma_neon
-    h264_loop_filter_start
-
-    vld1.64         {d0, d1},  [r0,:128], r1
-    vld1.64         {d2, d3},  [r0,:128], r1
-    vld1.64         {d4, d5},  [r0,:128], r1
-    sub             r0,  r0,  r1, lsl #2
-    sub             r0,  r0,  r1, lsl #1
-    vld1.64         {d20,d21}, [r0,:128], r1
-    vld1.64         {d18,d19}, [r0,:128], r1
-    vld1.64         {d16,d17}, [r0,:128], r1
-
-    align_push_regs
-
-    h264_loop_filter_luma
-
-    sub             r0,  r0,  r1, lsl #1
-    vst1.64         {d8, d9},  [r0,:128], r1
-    vst1.64         {d16,d17}, [r0,:128], r1
-    vst1.64         {d0, d1},  [r0,:128], r1
-    vst1.64         {d10,d11}, [r0,:128]
-
-    align_pop_regs
-    bx              lr
-endfunc
-
-function x264_deblock_h_luma_neon
-    h264_loop_filter_start
-
-    sub             r0,  r0,  #4
-    vld1.64         {d6},  [r0], r1
-    vld1.64         {d20}, [r0], r1
-    vld1.64         {d18}, [r0], r1
-    vld1.64         {d16}, [r0], r1
-    vld1.64         {d0},  [r0], r1
-    vld1.64         {d2},  [r0], r1
-    vld1.64         {d4},  [r0], r1
-    vld1.64         {d26}, [r0], r1
-    vld1.64         {d7},  [r0], r1
-    vld1.64         {d21}, [r0], r1
-    vld1.64         {d19}, [r0], r1
-    vld1.64         {d17}, [r0], r1
-    vld1.64         {d1},  [r0], r1
-    vld1.64         {d3},  [r0], r1
-    vld1.64         {d5},  [r0], r1
-    vld1.64         {d27}, [r0], r1
-
-    TRANSPOSE8x8    q3, q10, q9, q8, q0, q1, q2, q13
-
-    align_push_regs
-
-    h264_loop_filter_luma
-
-    TRANSPOSE4x4    q4, q8, q0, q5
-
-    sub             r0,  r0,  r1, lsl #4
-    add             r0,  r0,  #2
-    vst1.32         {d8[0]},  [r0], r1
-    vst1.32         {d16[0]}, [r0], r1
-    vst1.32         {d0[0]},  [r0], r1
-    vst1.32         {d10[0]}, [r0], r1
-    vst1.32         {d8[1]},  [r0], r1
-    vst1.32         {d16[1]}, [r0], r1
-    vst1.32         {d0[1]},  [r0], r1
-    vst1.32         {d10[1]}, [r0], r1
-    vst1.32         {d9[0]},  [r0], r1
-    vst1.32         {d17[0]}, [r0], r1
-    vst1.32         {d1[0]},  [r0], r1
-    vst1.32         {d11[0]}, [r0], r1
-    vst1.32         {d9[1]},  [r0], r1
-    vst1.32         {d17[1]}, [r0], r1
-    vst1.32         {d1[1]},  [r0], r1
-    vst1.32         {d11[1]}, [r0], r1
-
-    align_pop_regs
-    bx              lr
-endfunc
-
-.macro h264_loop_filter_luma_intra
-    vdup.8          q14, r2         @ alpha
-    vabd.u8         q4,  q8,  q0    @ abs(p0 - q0)
-    vabd.u8         q5,  q9,  q8    @ abs(p1 - p0)
-    vabd.u8         q6,  q1,  q0    @ abs(q1 - q0)
-    vdup.8          q15, r3         @ beta
-    vmov.u8         q13, #2
-    vclt.u8         q7,  q4,  q14   @ < alpha
-    vshr.u8         q14, q14, #2    @ alpha >> 2
-    vclt.u8         q5,  q5,  q15   @ < beta
-    vadd.u8         q14, q14, q13   @ (alpha >> 2) + 2
-    vand            q7,  q7,  q5
-    vclt.u8         q6,  q6,  q15   @ < beta
-    vclt.u8         q13, q4,  q14   @ < (alpha >> 2) + 2 if_2
-    vand            q12, q7,  q6    @ if_1
-    vshrn.u16       d28, q12,  #4
-    vcmp.f64        d28, #0
-    vmrs            APSR_nzcv, FPSCR
-    beq             9f
-
-    sub             sp,  sp,  #32
-    vst1.8         {q12-q13}, [sp,:128]
-
-    vshll.u8        q4,  d18, #1    @ 2*p1
-    vshll.u8        q5,  d19, #1
-    vaddw.u8        q4,  q4,  d16   @ 2*p1 + p0
-    vaddw.u8        q5,  q5,  d17
-    vaddw.u8        q4,  q4,  d2    @ 2*p1 + p0 + q1
-    vaddw.u8        q5,  q5,  d3
-    vrshrn.u16      d24, q4,  #2
-    vrshrn.u16      d25, q5,  #2
-
-    vaddl.u8        q6,  d20, d16   @ p2 + p0
-    vaddl.u8        q7,  d21, d17
-    vaddw.u8        q6,  q6,  d0    @ p2 + p0 + q0
-    vaddw.u8        q7,  q7,  d1
-    vadd.u16        q4,  q4,  q6    @ p2 + 2*p1 + 2*p0 + q0 + q1
-    vadd.u16        q5,  q5,  q7
-    vaddw.u8        q4,  q4,  d0    @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
-    vaddw.u8        q5,  q5,  d1
-    vrshrn.u16      d26, q4,  #3    @ p0'_2
-    vrshrn.u16      d27, q5,  #3
-    vaddw.u8        q6,  q6,  d18   @ p2 + p1 + p0 + q0
-    vaddw.u8        q7,  q7,  d19
-    vrshrn.u16      d28, q6,  #2    @ p1'_2
-    vrshrn.u16      d29, q7,  #2
-    vaddl.u8        q4,  d22, d20   @ p3 + p2
-    vaddl.u8        q5,  d23, d21
-    vshl.u16        q4,  q4,  #1    @ 2*p3 + 2*p2
-    vshl.u16        q5,  q5,  #1
-    vadd.u16        q4,  q4,  q6    @ 2*p3 + 3*p2 + p1 + p0 + q0
-    vadd.u16        q5,  q5,  q7
-    vrshrn.u16      d30, q4,  #3    @ p2'_2
-    vrshrn.u16      d31, q5,  #3
-
-    vdup.8          q4,  r3         @ beta
-    vabd.u8         q5,  q10, q8    @ abs(p2 - p0)
-    vld1.8         {q6-q7}, [sp,:128]   @ if_1, if_2
-    vclt.u8         q5,  q5,  q4    @ < beta if_3
-
-    vand            q7,  q7,  q5    @ if_2 && if_3
-    vmvn            q4,  q7
-    vand            q7,  q7,  q6    @ if_1 && if_2 && if_3
-    vand            q6,  q4,  q6    @ if_1 && !(if_2 && if_3)
-
-    @ copy p0 to q15 so it can be clobbered
-    vbit            q10, q15, q7
-    vmov            q15, q8
-    vbit            q8,  q12, q6
-
-    @ wait for q9 to clobber
-    vshll.u8        q4,  d2,  #1    @ 2*q1
-    vshll.u8        q5,  d3,  #1
-
-    vbit            q8,  q12, q6
-
-    vaddw.u8        q4,  q4,  d0    @ 2*q1 + q0
-    vaddw.u8        q5,  q5,  d1
-
-    vbit            q8,  q13, q7
-
-    vaddw.u8        q4,  q4,  d18   @ 2*q1 + q0 + p1
-    vaddw.u8        q5,  q5,  d19
-
-    vbit            q9,  q14, q7
-
-    vrshrn.u16      d24, q4,  #2
-    vrshrn.u16      d25, q5,  #2
-
-    vaddl.u8        q6,  d4,  d0    @ q2 + q0
-    vaddl.u8        q7,  d5,  d1
-    vaddw.u8        q6,  q6,  d30   @ q2 + q0 + p0
-    vaddw.u8        q7,  q7,  d31
-    vadd.u16        q4,  q4,  q6    @ q2 + 2*q1 + 2*q0 + p0 + p1
-    vadd.u16        q5,  q5,  q7
-    vaddw.u8        q4,  q4,  d30   @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
-    vaddw.u8        q5,  q5,  d31
-    vrshrn.u16      d26, q4,  #3    @ q0'_2
-    vrshrn.u16      d27, q5,  #3
-    vaddw.u8        q6,  q6,  d2    @ q2 + q1 + q0 + p0
-    vaddw.u8        q7,  q7,  d3
-    vrshrn.u16      d28, q6,  #2    @ q1'_2
-    vrshrn.u16      d29, q7,  #2
-    vaddl.u8        q4,  d6,  d4    @ q3 + q2
-    vaddl.u8        q5,  d7,  d5
-    vshl.u16        q4,  q4,  #1    @ 2*q3 + 2*q2
-    vshl.u16        q5,  q5,  #1
-    vadd.u16        q4,  q4,  q6    @ 2*q3 + 3*q2 + q1 + q0 + p0
-    vadd.u16        q5,  q5,  q7
-    vrshrn.u16      d30, q4,  #3    @ q2'_2
-    vrshrn.u16      d31, q5,  #3
-
-    vdup.8          q4,  r3         @ beta
-    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
-    vld1.8         {q6-q7}, [sp,:128]!   @ if_1, if_2
-    vclt.u8         q5,  q5,  q4    @ < beta if_4
-
-    vand            q7,  q7,  q5    @ if_2 && if_4
-    vmvn            q4,  q7
-    vand            q7,  q6,  q7    @ if_1 && if_2 && if_4
-    vand            q6,  q6,  q4    @ if_1 && !(if_2 && if_4)
-
-    vbit            q0,  q12, q6
-    vbit            q1,  q14, q7
-    vbit            q0,  q13, q7
-    vbit            q2,  q15, q7
-
-.endm
-
-function x264_deblock_v_luma_intra_neon
-    vld1.64         {d0, d1},  [r0,:128], r1
-    vld1.64         {d2, d3},  [r0,:128], r1
-    vld1.64         {d4, d5},  [r0,:128], r1
-    vld1.64         {d6, d7},  [r0,:128], r1
-    sub             r0,  r0,  r1, lsl #3
-    vld1.64         {d22,d23}, [r0,:128], r1
-    vld1.64         {d20,d21}, [r0,:128], r1
-    vld1.64         {d18,d19}, [r0,:128], r1
-    vld1.64         {d16,d17}, [r0,:128]
-
-    align_push_regs
-
-    h264_loop_filter_luma_intra
-
-    sub             r0,  r0,  r1, lsl #1
-    vst1.64         {d20,d21}, [r0,:128], r1
-    vst1.64         {d18,d19}, [r0,:128], r1
-    vst1.64         {d16,d17}, [r0,:128], r1
-    vst1.64         {d0, d1},  [r0,:128], r1
-    vst1.64         {d2, d3},  [r0,:128], r1
-    vst1.64         {d4, d5},  [r0,:128]
-9:
-    align_pop_regs
-    bx              lr
-endfunc
-
-function x264_deblock_h_luma_intra_neon
-    sub             r0,  r0,  #4
-    vld1.64         {d22}, [r0], r1
-    vld1.64         {d20}, [r0], r1
-    vld1.64         {d18}, [r0], r1
-    vld1.64         {d16}, [r0], r1
-    vld1.64         {d0},  [r0], r1
-    vld1.64         {d2},  [r0], r1
-    vld1.64         {d4},  [r0], r1
-    vld1.64         {d6},  [r0], r1
-    vld1.64         {d23}, [r0], r1
-    vld1.64         {d21}, [r0], r1
-    vld1.64         {d19}, [r0], r1
-    vld1.64         {d17}, [r0], r1
-    vld1.64         {d1},  [r0], r1
-    vld1.64         {d3},  [r0], r1
-    vld1.64         {d5},  [r0], r1
-    vld1.64         {d7},  [r0], r1
-
-    TRANSPOSE8x8    q11, q10, q9, q8, q0, q1, q2, q3
-
-    align_push_regs
-
-    h264_loop_filter_luma_intra
-
-    TRANSPOSE8x8    q11, q10, q9, q8, q0, q1, q2, q3
-
-    sub             r0,  r0,  r1, lsl #4
-    vst1.64         {d22}, [r0], r1
-    vst1.64         {d20}, [r0], r1
-    vst1.64         {d18}, [r0], r1
-    vst1.64         {d16}, [r0], r1
-    vst1.64         {d0},  [r0], r1
-    vst1.64         {d2},  [r0], r1
-    vst1.64         {d4},  [r0], r1
-    vst1.64         {d6},  [r0], r1
-    vst1.64         {d23}, [r0], r1
-    vst1.64         {d21}, [r0], r1
-    vst1.64         {d19}, [r0], r1
-    vst1.64         {d17}, [r0], r1
-    vst1.64         {d1},  [r0], r1
-    vst1.64         {d3},  [r0], r1
-    vst1.64         {d5},  [r0], r1
-    vst1.64         {d7},  [r0], r1
-9:
-    align_pop_regs
-    bx              lr
-endfunc
-
-.macro h264_loop_filter_chroma
-    vdup.8          q11, r2         // alpha
-    vmovl.u8        q12, d24
-    vabd.u8         q13, q8,  q0    // abs(p0 - q0)
-    vabd.u8         q14, q9,  q8    // abs(p1 - p0)
-    vsubl.u8        q2,  d0,  d16
-    vsubl.u8        q3,  d1,  d17
-    vsli.16         q12, q12, #8
-    vshl.i16        q2,  q2,  #2
-    vshl.i16        q3,  q3,  #2
-    vabd.u8         q15, q1,  q0    // abs(q1 - q0)
-    vmovl.u8        q12, d24
-    vaddw.u8        q2,  q2,  d18
-    vaddw.u8        q3,  q3,  d19
-    vclt.u8         q13, q13, q11   // < alpha
-    vsubw.u8        q2,  q2,  d2
-    vsubw.u8        q3,  q3,  d3
-    vsli.16         q12, q12, #8
-    vdup.8          q11, r3         // beta
-    vclt.s8         q10, q12, #0
-    vrshrn.i16      d4,  q2,  #3
-    vrshrn.i16      d5,  q3,  #3
-    vclt.u8         q14, q14, q11   // < beta
-    vbic            q13, q13, q10
-    vclt.u8         q15, q15, q11   // < beta
-    vand            q13, q13, q14
-    vneg.s8         q10, q12
-    vand            q13, q13, q15
-    vmin.s8         q2,  q2,  q12
-    vmovl.u8        q14, d16
-    vand            q2,  q2,  q13
-    vmovl.u8        q15, d17
-    vmax.s8         q2,  q2,  q10
-    vmovl.u8        q11, d0
-    vmovl.u8        q12, d1
-    vaddw.s8        q14, q14, d4
-    vaddw.s8        q15, q15, d5
-    vsubw.s8        q11, q11, d4
-    vsubw.s8        q12, q12, d5
-    vqmovun.s16     d16, q14
-    vqmovun.s16     d17, q15
-    vqmovun.s16     d0,  q11
-    vqmovun.s16     d1,  q12
-.endm
-
-function x264_deblock_v_chroma_neon
-    h264_loop_filter_start
-
-    sub             r0,  r0,  r1, lsl #1
-    vld1.8          {d18,d19}, [r0,:128], r1
-    vld1.8          {d16,d17}, [r0,:128], r1
-    vld1.8          {d0, d1},  [r0,:128], r1
-    vld1.8          {d2, d3},  [r0,:128]
-
-    h264_loop_filter_chroma
-
-    sub             r0,  r0,  r1, lsl #1
-    vst1.8          {d16,d17}, [r0,:128], r1
-    vst1.8          {d0, d1},  [r0,:128], r1
-
-    bx              lr
-endfunc
-
-function x264_deblock_h_chroma_neon
-    h264_loop_filter_start
-
-    sub             r0,  r0,  #4
-deblock_h_chroma:
-    vld1.8          {d18}, [r0], r1
-    vld1.8          {d16}, [r0], r1
-    vld1.8          {d0},  [r0], r1
-    vld1.8          {d2},  [r0], r1
-    vld1.8          {d19}, [r0], r1
-    vld1.8          {d17}, [r0], r1
-    vld1.8          {d1},  [r0], r1
-    vld1.8          {d3},  [r0], r1
-
-    TRANSPOSE4x4_16 q9, q8, q0, q1
-
-    h264_loop_filter_chroma
-
-    vtrn.16         q8,  q0
-
-    sub             r0,  r0,  r1, lsl #3
-    add             r0,  r0,  #2
-    vst1.32         {d16[0]}, [r0], r1
-    vst1.32         {d0[0]},  [r0], r1
-    vst1.32         {d16[1]}, [r0], r1
-    vst1.32         {d0[1]},  [r0], r1
-    vst1.32         {d17[0]}, [r0], r1
-    vst1.32         {d1[0]},  [r0], r1
-    vst1.32         {d17[1]}, [r0], r1
-    vst1.32         {d1[1]},  [r0], r1
-
-    bx              lr
-endfunc
-
-function x264_deblock_h_chroma_422_neon
-    h264_loop_filter_start
-    push            {lr}
-    sub             r0,  r0,  #4
-    add             r1,  r1,  r1
-    bl              deblock_h_chroma
-    ldr             ip,  [sp, #4]
-    ldr             ip,  [ip]
-    vdup.32         d24, ip
-    sub             r0,  r0,  r1, lsl #3
-    add             r0,  r0,  r1, lsr #1
-    sub             r0,  r0,  #2
-    pop             {lr}
-    b               deblock_h_chroma
-endfunc
-
-.macro h264_loop_filter_chroma8
-    vdup.8          d22, r2         @ alpha
-    vmovl.u8        q12, d24
-    vabd.u8         d26, d16, d0    @ abs(p0 - q0)
-    vabd.u8         d28, d18, d16   @ abs(p1 - p0)
-    vsubl.u8        q2,  d0,  d16
-    vsli.16         d24, d24, #8
-    vshl.i16        q2,  q2,  #2
-    vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
-    vaddw.u8        q2,  q2,  d18
-    vclt.u8         d26, d26, d22   @ < alpha
-    vsubw.u8        q2,  q2,  d2
-    vdup.8          d22, r3         @ beta
-    vclt.s8         d20, d24, #0
-    vrshrn.i16      d4,  q2,  #3
-    vclt.u8         d28, d28, d22   @ < beta
-    vbic            d26, d26, d20
-    vclt.u8         d30, d30, d22   @ < beta
-    vand            d26, d26, d28
-    vneg.s8         d20, d24
-    vand            d26, d26, d30
-    vmin.s8         d4,  d4,  d24
-    vmovl.u8        q14, d16
-    vand            d4,  d4,  d26
-    vmax.s8         d4,  d4,  d20
-    vmovl.u8        q11, d0
-    vaddw.s8        q14, q14, d4
-    vsubw.s8        q11, q11, d4
-    vqmovun.s16     d16, q14
-    vqmovun.s16     d0,  q11
-.endm
-
-function x264_deblock_h_chroma_mbaff_neon
-    h264_loop_filter_start
-
-    sub             r0,  r0,  #4
-    vld1.8          {d18}, [r0], r1
-    vld1.8          {d16}, [r0], r1
-    vld1.8          {d0},  [r0], r1
-    vld1.8          {d2},  [r0], r1
-
-    TRANSPOSE4x4_16 d18, d16, d0, d2
-
-    h264_loop_filter_chroma8
-
-    vtrn.16         d16, d0
-
-    sub             r0,  r0,  r1, lsl #2
-    add             r0,  r0,  #2
-    vst1.32         {d16[0]}, [r0], r1
-    vst1.32         {d0[0]},  [r0], r1
-    vst1.32         {d16[1]}, [r0], r1
-    vst1.32         {d0[1]},  [r0]
-
-    bx              lr
-endfunc
-
-.macro h264_loop_filter_chroma_intra, width=16
-    vdup.8          q11, r2         @ alpha
-    vabd.u8         q13, q8,  q0    @ abs(p0 - q0)
-    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
-    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
-    vclt.u8         q13, q13, q11   @ < alpha
-    vdup.8          q11, r3         @ beta
-    vclt.u8         q14, q14, q11   @ < beta
-    vclt.u8         q15, q15, q11   @ < beta
-    vand            q13, q13, q14
-    vand            q13, q13, q15
-
-    vshll.u8        q14, d18, #1
-    vshll.u8        q2,  d2,  #1
-.ifc \width, 16
-    vshll.u8        q15, d19, #1
-    vshll.u8        q3,  d3,  #1
-    vaddl.u8        q12, d17, d3
-    vaddl.u8        q10, d1,  d19
-.endif
-    vaddl.u8        q11, d16, d2
-    vaddl.u8        q1,  d18, d0    @ or vaddw q2, to not clobber q1
-    vadd.u16        q14, q14, q11
-    vadd.u16        q2,  q2,  q1
-.ifc \width, 16
-    vadd.u16        q15, q15, q12
-    vadd.u16        q3,  q3,  q10
-.endif
-    vqrshrn.u16     d28, q14, #2
-    vqrshrn.u16     d4,  q2, #2
-.ifc \width, 16
-    vqrshrn.u16     d29, q15, #2
-    vqrshrn.u16     d5,  q3, #2
-.endif
-    vbit            q8,  q14, q13
-    vbit            q0,  q2,  q13
-.endm
-
-function x264_deblock_v_chroma_intra_neon
-    sub             r0,  r0,  r1, lsl #1
-    vld2.8          {d18,d19}, [r0,:128], r1
-    vld2.8          {d16,d17}, [r0,:128], r1
-    vld2.8          {d0, d1},  [r0,:128], r1
-    vld2.8          {d2, d3},  [r0,:128]
-
-    h264_loop_filter_chroma_intra
-
-    sub             r0,  r0,  r1, lsl #1
-    vst2.8          {d16,d17}, [r0,:128], r1
-    vst2.8          {d0, d1},  [r0,:128], r1
-
-    bx              lr
-endfunc
-
-function x264_deblock_h_chroma_intra_neon
-    sub             r0,  r0,  #4
-    vld1.8          {d18}, [r0], r1
-    vld1.8          {d16}, [r0], r1
-    vld1.8          {d0},  [r0], r1
-    vld1.8          {d2},  [r0], r1
-    vld1.8          {d19}, [r0], r1
-    vld1.8          {d17}, [r0], r1
-    vld1.8          {d1},  [r0], r1
-    vld1.8          {d3},  [r0], r1
-
-    TRANSPOSE4x4_16 q9, q8, q0, q1
-
-    h264_loop_filter_chroma_intra
-
-    vtrn.16         q8,  q0
-
-    sub             r0,  r0,  r1, lsl #3
-    add             r0,  r0,  #2
-    vst1.32         {d16[0]}, [r0], r1
-    vst1.32         {d0[0]},  [r0], r1
-    vst1.32         {d16[1]}, [r0], r1
-    vst1.32         {d0[1]},  [r0], r1
-    vst1.32         {d17[0]}, [r0], r1
-    vst1.32         {d1[0]},  [r0], r1
-    vst1.32         {d17[1]}, [r0], r1
-    vst1.32         {d1[1]},  [r0], r1
-
-    bx              lr
-endfunc
-
-function x264_deblock_h_chroma_422_intra_neon
-    push            {lr}
-    bl              X(x264_deblock_h_chroma_intra_neon)
-    add             r0, r0,  #2
-    pop             {lr}
-    b               X(x264_deblock_h_chroma_intra_neon)
-endfunc
-
-function x264_deblock_h_chroma_intra_mbaff_neon
-    sub             r0,  r0,  #4
-    vld1.8          {d18}, [r0], r1
-    vld1.8          {d16}, [r0], r1
-    vld1.8          {d0},  [r0], r1
-    vld1.8          {d2},  [r0], r1
-
-    TRANSPOSE4x4_16 d18, d16, d0, d2
-
-    h264_loop_filter_chroma_intra width=8
-
-    vtrn.16         d16, d0
-
-    sub             r0,  r0,  r1, lsl #2
-    add             r0,  r0,  #2
-    vst1.32         {d16[0]}, [r0], r1
-    vst1.32         {d0[0]},  [r0], r1
-    vst1.32         {d16[1]}, [r0], r1
-    vst1.32         {d0[1]},  [r0]
-
-    bx              lr
-endfunc
-
-function x264_deblock_strength_neon
-    ldr             ip,  [sp]
-    vmov.i8         q8,  #0
-    lsl             ip,  ip,  #8
-    add             r3,  r3,  #32
-    sub             ip,  ip,  #(1<<8)-3
-    vmov.i8         q9,  #0
-    vdup.16         q10, ip
-    ldr             ip,  [sp, #4]
-
-lists:
-    @ load bytes ref
-    vld1.8          {d31}, [r1]!
-    add             r2,  r2,  #16
-    vld1.8          {q1},  [r1]!
-    vmov.i8         q0,  #0
-    vld1.8          {q2},  [r1]!
-    vext.8          q3,  q0,  q1,  #15
-    vext.8          q0,  q0,  q2,  #15
-    vuzp.32         q1,  q2
-    vuzp.32         q3,  q0
-    vext.8          q1,  q15, q2,  #12
-
-    veor            q0,  q0,  q2
-    veor            q1,  q1,  q2
-    vorr            q8,  q8,  q0
-    vorr            q9,  q9,  q1
-
-    vld1.16         {q11}, [r2,:128]!   @ mv + 0x10
-    vld1.16         {q3},  [r2,:128]!   @ mv + 0x20
-    vld1.16         {q12}, [r2,:128]!   @ mv + 0x30
-    vld1.16         {q2},  [r2,:128]!   @ mv + 0x40
-    vld1.16         {q13}, [r2,:128]!   @ mv + 0x50
-    vext.8          q3,  q3,  q12, #12
-    vext.8          q2,  q2,  q13, #12
-    vabd.s16        q0,  q12, q3
-    vld1.16         {q3},  [r2,:128]!   @ mv + 0x60
-    vabd.s16        q1,  q13, q2
-    vld1.16         {q14}, [r2,:128]!   @ mv + 0x70
-    vqmovn.u16      d0,  q0
-    vld1.16         {q2},  [r2,:128]!   @ mv + 0x80
-    vld1.16         {q15}, [r2,:128]!   @ mv + 0x90
-    vqmovn.u16      d1,  q1
-    vext.8          q3,  q3,  q14, #12
-    vext.8          q2,  q2,  q15, #12
-    vabd.s16        q3,  q14, q3
-    vabd.s16        q2,  q15, q2
-    vqmovn.u16      d2,  q3
-    vqmovn.u16      d3,  q2
-
-    vqsub.u8        q0,  q0,  q10
-    vqsub.u8        q1,  q1,  q10
-    vqmovn.u16      d0,  q0
-    vqmovn.u16      d1,  q1
-
-    vabd.s16        q1,  q12, q13
-    vorr            q8,  q8,  q0
-
-    vabd.s16        q0,  q11, q12
-    vabd.s16        q2,  q13, q14
-    vabd.s16        q3,  q14, q15
-    vqmovn.u16      d0,  q0
-    vqmovn.u16      d1,  q1
-    vqmovn.u16      d2,  q2
-    vqmovn.u16      d3,  q3
-
-    vqsub.u8        q0,  q0,  q10
-    vqsub.u8        q1,  q1,  q10
-    vqmovn.u16      d0,  q0
-    vqmovn.u16      d1,  q1
-    subs            ip,  ip,  #1
-    vorr            q9,  q9,  q0
-    beq             lists
-
-    mov             ip,  #-32
-    @ load bytes nnz
-    vld1.8          {d31}, [r0]!
-    vld1.8          {q1},  [r0]!
-    vmov.i8         q0,  #0
-    vld1.8          {q2},  [r0]
-    vext.8          q3,  q0,  q1,  #15
-    vext.8          q0,  q0,  q2,  #15
-    vuzp.32         q1,  q2
-    vuzp.32         q3,  q0
-    vext.8          q1,  q15, q2,  #12
-
-    vorr            q0,  q0,  q2
-    vorr            q1,  q1,  q2
-    vmov.u8         q10, #1
-    vmin.u8         q0,  q0,  q10
-    vmin.u8         q1,  q1,  q10
-    vmin.u8         q8,  q8,  q10       @ mv ? 1 : 0
-    vmin.u8         q9,  q9,  q10
-    vadd.u8         q0,  q0,  q0        @ nnz ? 2 : 0
-    vadd.u8         q1,  q1,  q1
-    vmax.u8         q8,  q8,  q0
-    vmax.u8         q9,  q9,  q1
-    vzip.16         d16, d17
-    vst1.8          {q9}, [r3,:128], ip @ bs[1]
-    vtrn.8          d16, d17
-    vtrn.32         d16, d17
-
-    vst1.8          {q8}, [r3,:128]     @ bs[0]
-    bx              lr
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/mc-a.S b/android/src/main/libenc/jni/libx264/common/arm/mc-a.S
deleted file mode 100755
index 165c1fa..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/mc-a.S
+++ /dev/null
@@ -1,1939 +0,0 @@
-/*****************************************************************************
- * mc.S: arm motion compensation
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Mans Rullgard <mans@mansr.com>
- *          Stefan Groenroos <stefan.gronroos@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-.align 4
-pw_0to15:
-.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-
-.text
-
-// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
-// They also use nothing above armv5te, but we don't care about pre-armv6
-
-// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
-function x264_prefetch_ref_arm
-    sub         r2, r2, #1
-    add         r0, r0, #64
-    and         r2, r2, r1
-    add         r0, r0, r2, lsl #3
-    add         r2, r1, r1, lsl #1
-    pld         [r0]
-    pld         [r0, r1]
-    pld         [r0, r1, lsl #1]
-    add         r3, r0, r1, lsl #2
-    pld         [r0, r2]
-    pld         [r3]
-    pld         [r3, r1]
-    pld         [r3, r1, lsl #1]
-    pld         [r3, r2]
-    bx          lr
-endfunc
-
-// void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
-//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
-function x264_prefetch_fenc_arm
-    ldr         ip, [sp]
-    push        {lr}
-    and         lr, ip, #3
-    smulbb      lr, lr, r1      // note: this assumes stride_y is <= 16 bits signed
-    and         ip, ip, #6
-    smulbb      ip, ip, r3
-    add         r0, r0, #64
-    add         r2, r2, #64
-    add         r0, r0, lr, lsl #2
-    pld         [r0]
-    add         lr, r0, r1, lsl #1
-    pld         [r0, r1]
-    pld         [lr]
-    add         r2, r2, ip, lsl #2
-    pld         [lr, r1]
-    pld         [r2]
-    add         ip, r2, r3, lsl #1
-    pld         [r2, r3]
-    pld         [ip]
-    pld         [ip, r3]
-    pop         {pc}
-endfunc
-
-
-// void *x264_memcpy_aligned( void *dst, const void *src, size_t n )
-function x264_memcpy_aligned_neon
-    orr         r3,  r0,  r1,  lsr #1
-    movrel      ip,  memcpy_table
-    and         r3,  r3,  #0xc
-    ldr         pc,  [ip, r3]
-endfunc
-
-.macro MEMCPY_ALIGNED srcalign dstalign
-function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0
-    mov         r3, r0
-.if \srcalign == 8 && \dstalign == 8
-    sub         r2, #16
-    vld1.64     {d0}, [r1,:64]!
-    vst1.64     {d0}, [r3,:64]!
-    .set r1align, 128
-    .set r3align, 128
-.else
-    .set r1align, \srcalign * 8
-    .set r3align, \dstalign * 8
-.endif
-    tst         r2, #16
-    beq         32f
-    sub         r2, #16
-    vld1.64     {d0-d1}, [r1,:r1align]!
-    vst1.64     {d0-d1}, [r3,:r3align]!
-32: // n is a multiple of 32
-    tst         r2, #32
-    beq         640f
-    sub         r2, #32
-    vld1.64     {d0-d3}, [r1,:r1align]!
-    vst1.64     {d0-d3}, [r3,:r3align]!
-640: // n is a multiple of 64
-    cmp         r2, #0
-    beq         1f
-64:
-    subs        r2, #64
-    vld1.64     {d0-d3}, [r1,:r1align]!
-    vld1.64     {d4-d7}, [r1,:r1align]!
-    vst1.64     {d0-d3}, [r3,:r3align]!
-    vst1.64     {d4-d7}, [r3,:r3align]!
-    bgt         64b
-1:   // end
-.if \srcalign == 8 && \dstalign == 8
-    vld1.64     {d0}, [r1,:64]!
-    vst1.64     {d0}, [r3,:64]!
-.endif
-    bx          lr
-endfunc
-.endm
-
-MEMCPY_ALIGNED 16, 16
-MEMCPY_ALIGNED 16, 8
-MEMCPY_ALIGNED  8, 16
-MEMCPY_ALIGNED  8, 8
-
-const memcpy_table align=2, relocate=1
-.word memcpy_aligned_16_16_neon
-.word memcpy_aligned_16_8_neon
-.word memcpy_aligned_8_16_neon
-.word memcpy_aligned_8_8_neon
-endconst
-
-.text
-
-.ltorg
-
-// void x264_memzero_aligned( void *dst, size_t n )
-function x264_memzero_aligned_neon
-    vmov.i8     q0, #0
-    vmov.i8     q1, #0
-memzero_loop:
-    subs        r1, #128
-.rept 4
-    vst1.64     {d0-d3}, [r0,:128]!
-.endr
-    bgt         memzero_loop
-    bx          lr
-endfunc
-
-
-// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
-//                 uint8_t *src1, intptr_t src1_stride,
-//                 uint8_t *src2, intptr_t src2_stride, int weight );
-.macro AVGH w h
-function x264_pixel_avg_\w\()x\h\()_neon
-    ldr         ip, [sp, #8]
-    push        {r4-r6,lr}
-    cmp         ip, #32
-    ldrd        r4, r5, [sp, #16]
-    mov         lr, #\h
-    beq         x264_pixel_avg_w\w\()_neon
-    rsbs        r6,  ip,  #64
-    blt         x264_pixel_avg_weight_w\w\()_add_sub_neon     // weight > 64
-    cmp         ip,  #0
-    bge         x264_pixel_avg_weight_w\w\()_add_add_neon
-    b           x264_pixel_avg_weight_w\w\()_sub_add_neon     // weight < 0
-endfunc
-.endm
-
-AVGH  4, 2
-AVGH  4, 4
-AVGH  4, 8
-AVGH  4, 16
-AVGH  8, 4
-AVGH  8, 8
-AVGH  8, 16
-AVGH 16, 8
-AVGH 16, 16
-
-// 0 < weight < 64
-.macro load_weights_add_add
-    vdup.8      d30, ip
-    vdup.8      d31, r6
-.endm
-
-.macro load_add_add d1 d2
-    vld1.32     {\d1}, [r2], r3
-    vld1.32     {\d2}, [r4], r5
-.endm
-
-.macro weight_add_add dst s1 s2
-    vmull.u8    \dst, \s1, d30
-    vmlal.u8    \dst, \s2, d31
-.endm
-
-// weight > 64
-.macro load_weights_add_sub
-    rsb         r6,  #0
-    vdup.8      d30, ip
-    vdup.8      d31, r6
-.endm
-
-.macro load_add_sub d1 d2
-    vld1.32     {\d1}, [r2], r3
-    vld1.32     {\d2}, [r4], r5
-.endm
-
-.macro weight_add_sub dst s1 s2
-    vmull.u8    \dst, \s1, d30
-    vmlsl.u8    \dst, \s2, d31
-.endm
-
-// weight < 0
-.macro load_weights_sub_add
-    rsb         ip,  #0
-    vdup.8      d31, r6
-    vdup.8      d30, ip
-.endm
-
-.macro load_sub_add d1 d2
-    vld1.32     {\d2}, [r4], r5
-    vld1.32     {\d1}, [r2], r3
-.endm
-
-.macro weight_sub_add dst s1 s2
-    vmull.u8    \dst, \s2, d31
-    vmlsl.u8    \dst, \s1, d30
-.endm
-
-.macro AVG_WEIGHT ext
-function x264_pixel_avg_weight_w4_\ext\()_neon, export=0
-    load_weights_\ext
-1:  // height loop
-    subs            lr,  lr,  #2
-    load_\ext       d0[], d1[]
-    weight_\ext     q8,  d0,  d1
-    load_\ext       d2[], d3[]
-    vqrshrun.s16    d0,  q8,  #6
-    weight_\ext     q9,  d2,  d3
-    vst1.32         {d0[0]}, [r0,:32], r1
-    vqrshrun.s16    d1,  q9,  #6
-    vst1.32         {d1[0]}, [r0,:32], r1
-    bgt             1b
-    pop             {r4-r6,pc}
-endfunc
-
-function x264_pixel_avg_weight_w8_\ext\()_neon, export=0
-    load_weights_\ext
-1:  // height loop
-    subs            lr,  lr,  #4
-    load_\ext       d0,  d1
-    weight_\ext     q8,  d0,  d1
-    load_\ext       d2,  d3
-    weight_\ext     q9,  d2,  d3
-    load_\ext       d4,  d5
-    weight_\ext     q10, d4,  d5
-    load_\ext       d6,  d7
-    weight_\ext     q11, d6,  d7
-    vqrshrun.s16    d0,  q8,  #6
-    vqrshrun.s16    d1,  q9,  #6
-    vqrshrun.s16    d2,  q10, #6
-    vqrshrun.s16    d3,  q11, #6
-    vst1.64         {d0}, [r0,:64], r1
-    vst1.64         {d1}, [r0,:64], r1
-    vst1.64         {d2}, [r0,:64], r1
-    vst1.64         {d3}, [r0,:64], r1
-    bgt             1b
-    pop             {r4-r6,pc}
-endfunc
-
-function x264_pixel_avg_weight_w16_\ext\()_neon, export=0
-    load_weights_\ext
-1:  // height loop
-    subs            lr,  lr,  #2
-    load_\ext       d0-d1, d2-d3
-    weight_\ext     q8,  d0,  d2
-    weight_\ext     q9,  d1,  d3
-    load_\ext       d4-d5, d6-d7
-    weight_\ext     q10, d4,  d6
-    weight_\ext     q11, d5,  d7
-    vqrshrun.s16    d0,  q8,  #6
-    vqrshrun.s16    d1,  q9,  #6
-    vqrshrun.s16    d2,  q10, #6
-    vqrshrun.s16    d3,  q11, #6
-    vst1.64         {d0-d1}, [r0,:128], r1
-    vst1.64         {d2-d3}, [r0,:128], r1
-    bgt             1b
-    pop             {r4-r6,pc}
-endfunc
-.endm
-
-AVG_WEIGHT add_add
-AVG_WEIGHT add_sub
-AVG_WEIGHT sub_add
-
-function x264_pixel_avg_w4_neon, export=0
-    subs        lr,  lr,  #2
-    vld1.32     {d0[]}, [r2], r3
-    vld1.32     {d2[]}, [r4], r5
-    vrhadd.u8   d0,  d0,  d2
-    vld1.32     {d1[]}, [r2], r3
-    vld1.32     {d3[]}, [r4], r5
-    vrhadd.u8   d1,  d1,  d3
-    vst1.32     {d0[0]}, [r0,:32], r1
-    vst1.32     {d1[0]}, [r0,:32], r1
-    bgt         x264_pixel_avg_w4_neon
-    pop         {r4-r6,pc}
-endfunc
-
-function x264_pixel_avg_w8_neon, export=0
-    subs        lr,  lr,  #4
-    vld1.64     {d0}, [r2], r3
-    vld1.64     {d2}, [r4], r5
-    vrhadd.u8   d0,  d0,  d2
-    vld1.64     {d1}, [r2], r3
-    vld1.64     {d3}, [r4], r5
-    vrhadd.u8   d1,  d1,  d3
-    vst1.64     {d0}, [r0,:64], r1
-    vld1.64     {d2}, [r2], r3
-    vld1.64     {d4}, [r4], r5
-    vrhadd.u8   d2,  d2,  d4
-    vst1.64     {d1}, [r0,:64], r1
-    vld1.64     {d3}, [r2], r3
-    vld1.64     {d5}, [r4], r5
-    vrhadd.u8   d3,  d3,  d5
-    vst1.64     {d2}, [r0,:64], r1
-    vst1.64     {d3}, [r0,:64], r1
-    bgt         x264_pixel_avg_w8_neon
-    pop         {r4-r6,pc}
-endfunc
-
-function x264_pixel_avg_w16_neon, export=0
-    subs        lr,  lr,  #4
-    vld1.64     {d0-d1}, [r2], r3
-    vld1.64     {d2-d3}, [r4], r5
-    vrhadd.u8   q0,  q0,  q1
-    vld1.64     {d2-d3}, [r2], r3
-    vld1.64     {d4-d5}, [r4], r5
-    vrhadd.u8   q1,  q1,  q2
-    vst1.64     {d0-d1}, [r0,:128], r1
-    vld1.64     {d4-d5}, [r2], r3
-    vld1.64     {d6-d7}, [r4], r5
-    vrhadd.u8   q2,  q2,  q3
-    vst1.64     {d2-d3}, [r0,:128], r1
-    vld1.64     {d6-d7}, [r2], r3
-    vld1.64     {d0-d1}, [r4], r5
-    vrhadd.u8   q3,  q3,  q0
-    vst1.64     {d4-d5}, [r0,:128], r1
-    vst1.64     {d6-d7}, [r0,:128], r1
-    bgt         x264_pixel_avg_w16_neon
-    pop         {r4-r6,pc}
-endfunc
-
-
-function x264_pixel_avg2_w4_neon
-    ldr         ip,  [sp, #4]
-    push        {lr}
-    ldr         lr,  [sp, #4]
-avg2_w4_loop:
-    subs        ip,  ip,  #2
-    vld1.32     {d0[]},  [r2], r3
-    vld1.32     {d2[]},  [lr], r3
-    vrhadd.u8   d0,  d0,  d2
-    vld1.32     {d1[]},  [r2], r3
-    vld1.32     {d3[]},  [lr], r3
-    vrhadd.u8   d1,  d1,  d3
-    vst1.32     {d0[0]}, [r0,:32], r1
-    vst1.32     {d1[0]}, [r0,:32], r1
-    bgt         avg2_w4_loop
-    pop         {pc}
-endfunc
-
-function x264_pixel_avg2_w8_neon
-    ldr         ip,  [sp, #4]
-    push        {lr}
-    ldr         lr,  [sp, #4]
-avg2_w8_loop:
-    subs        ip,  ip,  #2
-    vld1.64     {d0}, [r2], r3
-    vld1.64     {d2}, [lr], r3
-    vrhadd.u8   d0,  d0,  d2
-    vld1.64     {d1}, [r2], r3
-    vld1.64     {d3}, [lr], r3
-    vrhadd.u8   d1,  d1,  d3
-    vst1.64     {d0}, [r0,:64], r1
-    vst1.64     {d1}, [r0,:64], r1
-    bgt         avg2_w8_loop
-    pop         {pc}
-endfunc
-
-function x264_pixel_avg2_w16_neon
-    ldr         ip,  [sp, #4]
-    push        {lr}
-    ldr         lr,  [sp, #4]
-avg2_w16_loop:
-    subs        ip,  ip,  #2
-    vld1.64     {d0-d1}, [r2], r3
-    vld1.64     {d2-d3}, [lr], r3
-    vrhadd.u8   q0,  q0,  q1
-    vld1.64     {d4-d5}, [r2], r3
-    vld1.64     {d6-d7}, [lr], r3
-    vrhadd.u8   q2,  q2,  q3
-    vst1.64     {d0-d1}, [r0,:128], r1
-    vst1.64     {d4-d5}, [r0,:128], r1
-    bgt         avg2_w16_loop
-    pop         {pc}
-endfunc
-
-function x264_pixel_avg2_w20_neon
-    ldr         ip,  [sp, #4]
-    push        {lr}
-    sub         r1,  r1,  #16
-    ldr         lr,  [sp, #4]
-avg2_w20_loop:
-    subs        ip,  ip,  #2
-    vld1.64     {d0-d2},  [r2], r3
-    vld1.64     {d4-d6},  [lr], r3
-    vrhadd.u8   q0,  q0,  q2
-    vrhadd.u8   d2,  d2,  d6
-    vld1.64     {d4-d6},  [r2], r3
-    vld1.64     {d16-d18},[lr], r3
-    vrhadd.u8   q2,  q2,  q8
-    vst1.64     {d0-d1},  [r0,:128]!
-    vrhadd.u8   d6,  d6,  d18
-    vst1.32     {d2[0]},  [r0,:32], r1
-    vst1.64     {d4-d5},  [r0,:128]!
-    vst1.32     {d6[0]},  [r0,:32], r1
-    bgt         avg2_w20_loop
-    pop         {pc}
-endfunc
-
-
-.macro weight_prologue type
-    push        {r4-r5,lr}
-    ldr         r4,  [sp, #4*3]     // weight_t
-    ldr         ip,  [sp, #4*3+4]   // h
-.ifc \type, full
-    ldr         lr,  [r4, #32]      // denom
-.endif
-    ldrd        r4,  r5,  [r4, #32+4]    // scale, offset
-    vdup.8      d0,  r4
-    vdup.16     q1,  r5
-.ifc \type, full
-    rsb         lr,  lr,  #0
-    vdup.16     q2,  lr
-.endif
-.endm
-
-// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride,
-//                 const x264_weight_t *weight, int height )
-function x264_mc_weight_w20_neon
-    weight_prologue full
-    sub         r1, #16
-weight20_loop:
-    subs        ip,  #2
-    vld1.8      {d17-d19}, [r2], r3
-    vmull.u8    q10, d17, d0
-    vmull.u8    q11, d18, d0
-    vld1.8      {d16-d18}, [r2], r3
-    vmull.u8    q12, d16, d0
-    vmull.u8    q13, d17, d0
-    vtrn.32     d19, d18
-    vmull.u8    q14, d19, d0
-    vrshl.s16   q10, q10, q2
-    vrshl.s16   q11, q11, q2
-    vrshl.s16   q12, q12, q2
-    vrshl.s16   q13, q13, q2
-    vrshl.s16   q14, q14, q2
-    vadd.s16    q10, q10, q1
-    vadd.s16    q11, q11, q1
-    vadd.s16    q12, q12, q1
-    vadd.s16    q13, q13, q1
-    vadd.s16    q14, q14, q1
-    vqmovun.s16 d16, q10
-    vqmovun.s16 d17, q11
-    vqmovun.s16 d18, q12
-    vqmovun.s16 d19, q13
-    vqmovun.s16 d20, q14
-    vst1.8      {d16-d17}, [r0,:128]!
-    vst1.32     {d20[0]},  [r0,:32], r1
-    vst1.8      {d18-d19}, [r0,:128]!
-    vst1.32     {d20[1]},  [r0,:32], r1
-    bgt         weight20_loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w16_neon
-    weight_prologue full
-weight16_loop:
-    subs        ip,  #2
-    vld1.8      {d16-d17}, [r2], r3
-    vld1.8      {d18-d19}, [r2], r3
-    vmull.u8    q10, d16, d0
-    vmull.u8    q11, d17, d0
-    vmull.u8    q12, d18, d0
-    vmull.u8    q13, d19, d0
-    vrshl.s16   q10, q10, q2
-    vrshl.s16   q11, q11, q2
-    vrshl.s16   q12, q12, q2
-    vrshl.s16   q13, q13, q2
-    vadd.s16    q10, q10, q1
-    vadd.s16    q11, q11, q1
-    vadd.s16    q12, q12, q1
-    vadd.s16    q13, q13, q1
-    vqmovun.s16 d16, q10
-    vqmovun.s16 d17, q11
-    vqmovun.s16 d18, q12
-    vqmovun.s16 d19, q13
-    vst1.8      {d16-d17}, [r0,:128], r1
-    vst1.8      {d18-d19}, [r0,:128], r1
-    bgt         weight16_loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w8_neon
-    weight_prologue full
-weight8_loop:
-    subs        ip,  #2
-    vld1.8      {d16}, [r2], r3
-    vld1.8      {d18}, [r2], r3
-    vmull.u8    q8,  d16, d0
-    vmull.u8    q9,  d18, d0
-    vrshl.s16   q8,  q8,  q2
-    vrshl.s16   q9,  q9,  q2
-    vadd.s16    q8,  q8,  q1
-    vadd.s16    q9,  q9,  q1
-    vqmovun.s16 d16, q8
-    vqmovun.s16 d18, q9
-    vst1.8      {d16}, [r0,:64], r1
-    vst1.8      {d18}, [r0,:64], r1
-    bgt         weight8_loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w4_neon
-    weight_prologue full
-weight4_loop:
-    subs        ip,  #2
-    vld1.32     {d16[0]}, [r2], r3
-    vld1.32     {d16[1]}, [r2], r3
-    vmull.u8    q8,  d16, d0
-    vrshl.s16   q8,  q8,  q2
-    vadd.s16    q8,  q8,  q1
-    vqmovun.s16 d16, q8
-    vst1.32     {d16[0]}, [r0], r1
-    vst1.32     {d16[1]}, [r0], r1
-    bgt         weight4_loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w20_nodenom_neon
-    weight_prologue nodenom
-    sub         r1, #16
-weight20_nodenom_loop:
-    subs        ip,  #2
-    vld1.8      {d26-d28}, [r2], r3
-    vmov        q8,  q1
-    vmov        q9,  q1
-    vld1.8      {d29-d31}, [r2], r3
-    vmov        q10, q1
-    vmov        q11, q1
-    vmov        q12, q1
-    vtrn.32     d28, d31
-    vmlal.u8    q8,  d26, d0
-    vmlal.u8    q9,  d27, d0
-    vmlal.u8    q10, d29, d0
-    vmlal.u8    q11, d30, d0
-    vmlal.u8    q12, d28, d0
-    vqmovun.s16 d16, q8
-    vqmovun.s16 d17, q9
-    vqmovun.s16 d18, q10
-    vqmovun.s16 d19, q11
-    vqmovun.s16 d20, q12
-    vst1.8      {d16-d17}, [r0,:128]!
-    vst1.32     {d20[0]},  [r0,:32], r1
-    vst1.8      {d18-d19}, [r0,:128]!
-    vst1.32     {d20[1]},  [r0,:32], r1
-    bgt         weight20_nodenom_loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w16_nodenom_neon  // 16-pixel-wide variant without the shift step: out = sat_u8(q1 + pixel * d0)
-    weight_prologue nodenom  // macro defined outside this chunk; presumably sets up d0, q1, ip and pushes r4-r5,lr (TODO confirm)
-weight16_nodenom_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.8      {d16-d17}, [r2], r3  // row 0
-    vld1.8      {d18-d19}, [r2], r3  // row 1
-    vmov        q12, q1  // each accumulator starts from q1
-    vmov        q13, q1
-    vmov        q14, q1
-    vmov        q15, q1
-    vmlal.u8    q12, d16, d0  // accumulate pixel * d0
-    vmlal.u8    q13, d17, d0
-    vmlal.u8    q14, d18, d0
-    vmlal.u8    q15, d19, d0
-    vqmovun.s16 d16, q12  // saturate to unsigned 8 bit
-    vqmovun.s16 d17, q13
-    vqmovun.s16 d18, q14
-    vqmovun.s16 d19, q15
-    vst1.8      {d16-d17}, [r0,:128], r1
-    vst1.8      {d18-d19}, [r0,:128], r1
-    bgt         weight16_nodenom_loop  // flags from the subs at the top of the loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w8_nodenom_neon  // 8-pixel-wide variant without the shift step: out = sat_u8(q1 + pixel * d0)
-    weight_prologue nodenom  // macro defined outside this chunk; presumably sets up d0, q1, ip and pushes r4-r5,lr (TODO confirm)
-weight8_nodenom_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.8      {d16}, [r2], r3  // row 0
-    vld1.8      {d18}, [r2], r3  // row 1
-    vmov        q10, q1  // each accumulator starts from q1
-    vmov        q11, q1
-    vmlal.u8    q10, d16, d0  // accumulate pixel * d0
-    vmlal.u8    q11, d18, d0
-    vqmovun.s16 d16, q10  // saturate to unsigned 8 bit
-    vqmovun.s16 d17, q11
-    vst1.8      {d16}, [r0,:64], r1
-    vst1.8      {d17}, [r0,:64], r1
-    bgt         weight8_nodenom_loop  // flags from the subs at the top of the loop
-    pop         {r4-r5,pc}
-endfunc
-
-function x264_mc_weight_w4_nodenom_neon  // 4-pixel-wide variant without the shift step: out = sat_u8(q1 + pixel * d0)
-    weight_prologue nodenom  // macro defined outside this chunk; presumably sets up d0, q1, ip and pushes r4-r5,lr (TODO confirm)
-weight4_nodenom_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.32     {d16[0]}, [r2], r3  // row 0 in the low half of d16
-    vld1.32     {d16[1]}, [r2], r3  // row 1 in the high half
-    vmov        q10, q1  // accumulator starts from q1
-    vmlal.u8    q10, d16, d0  // accumulate pixel * d0 for both rows
-    vqmovun.s16 d16, q10  // saturate to unsigned 8 bit
-    vst1.32     {d16[0]}, [r0], r1
-    vst1.32     {d16[1]}, [r0], r1
-    bgt         weight4_nodenom_loop  // flags from the subs at the top of the loop
-    pop         {r4-r5,pc}
-endfunc
-
-.macro weight_simple_prologue  // shared entry code for the weight_simple functions
-    push        {lr}  // lr is used as scratch; matched by pop {pc} at each function exit
-    ldr         lr,  [sp, #4]       // weight_t (first stack argument; sp+0 now holds the saved lr)
-    ldr         ip,  [sp, #8]       // h (second stack argument), used as the row counter
-    ldr         lr,  [lr]           // offset (first word at the weight_t pointer)
-    vdup.8      q1,  lr  // broadcast the low byte of that word to all 16 lanes of q1
-.endm
-
-.macro weight_simple name op  // emits w20/w16/w8/w4 functions that apply op (pixel, q1) to every source byte
-function x264_mc_weight_w20_\name\()_neon
-    weight_simple_prologue  // q1 = byte operand in every lane, ip = row count
-weight20_\name\()_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.8      {d16-d18}, [r2], r3  // row 0: 24 bytes
-    vld1.8      {d19-d21}, [r2], r3  // row 1: 24 bytes
-    \op         q8,  q8,  q1  // q8-q10 together cover d16-d21, i.e. both rows
-    \op         q9,  q9,  q1
-    \op         q10, q10, q1
-    vst1.8      {d16-d18}, [r0,:64], r1  // 24 bytes are written per row
-    vst1.8      {d19-d21}, [r0,:64], r1
-    bgt         weight20_\name\()_loop
-    pop         {pc}
-endfunc
-
-function x264_mc_weight_w16_\name\()_neon
-    weight_simple_prologue
-weight16_\name\()_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.8      {d16-d17}, [r2], r3
-    vld1.8      {d18-d19}, [r2], r3
-    \op         q8,  q8,  q1
-    \op         q9,  q9,  q1
-    vst1.8      {d16-d17}, [r0,:128], r1
-    vst1.8      {d18-d19}, [r0,:128], r1
-    bgt         weight16_\name\()_loop
-    pop         {pc}
-endfunc
-
-function x264_mc_weight_w8_\name\()_neon
-    weight_simple_prologue
-weight8_\name\()_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.8      {d16}, [r2], r3  // row 0
-    vld1.8      {d17}, [r2], r3  // row 1
-    \op         q8,  q8,  q1  // one operation covers both rows (q8 = d16:d17)
-    vst1.8      {d16}, [r0,:64], r1
-    vst1.8      {d17}, [r0,:64], r1
-    bgt         weight8_\name\()_loop
-    pop         {pc}
-endfunc
-
-function x264_mc_weight_w4_\name\()_neon
-    weight_simple_prologue
-weight4_\name\()_loop:
-    subs        ip,  #2  // two rows per iteration
-    vld1.32     {d16[]}, [r2], r3  // load 4 bytes and replicate them to both halves of d16
-    vld1.32     {d17[]}, [r2], r3
-    \op         q8,  q8,  q1
-    vst1.32     {d16[0]}, [r0], r1  // only lane 0 is written back
-    vst1.32     {d17[0]}, [r0], r1
-    bgt         weight4_\name\()_loop
-    pop         {pc}
-endfunc
-.endm
-
-weight_simple offsetadd, vqadd.u8  // saturating unsigned add of the byte in q1
-weight_simple offsetsub, vqsub.u8  // saturating unsigned subtract of the byte in q1
-
-
-// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
-function x264_mc_copy_w4_neon  // copy rows of 4 bytes from [r2] (stride r3) to [r0] (stride r1)
-    ldr         ip,  [sp]  // height, the only stack argument
-copy_w4_loop:
-    subs        ip,  ip,  #4  // four rows per iteration
-    vld1.32     {d0[]},  [r2], r3  // load one 32-bit word, replicated to both lanes
-    vld1.32     {d1[]},  [r2], r3
-    vld1.32     {d2[]},  [r2], r3
-    vld1.32     {d3[]},  [r2], r3
-    vst1.32     {d0[0]}, [r0,:32], r1  // write lane 0; :32 is a 4-byte alignment hint on dst
-    vst1.32     {d1[0]}, [r0,:32], r1
-    vst1.32     {d2[0]}, [r0,:32], r1
-    vst1.32     {d3[0]}, [r0,:32], r1
-    bgt         copy_w4_loop  // flags from the subs at the top of the loop
-    bx          lr
-endfunc
-
-function x264_mc_copy_w8_neon  // copy rows of 8 bytes from [r2] (stride r3) to [r0] (stride r1)
-    ldr         ip,  [sp]  // height, the only stack argument
-copy_w8_loop:
-    subs        ip,  ip,  #4  // four rows per iteration
-    vld1.32     {d0}, [r2], r3
-    vld1.32     {d1}, [r2], r3
-    vld1.32     {d2}, [r2], r3
-    vld1.32     {d3}, [r2], r3
-    vst1.32     {d0}, [r0,:64], r1  // :64 is an 8-byte alignment hint on dst
-    vst1.32     {d1}, [r0,:64], r1
-    vst1.32     {d2}, [r0,:64], r1
-    vst1.32     {d3}, [r0,:64], r1
-    bgt         copy_w8_loop  // flags from the subs at the top of the loop
-    bx          lr
-endfunc
-
-function x264_mc_copy_w16_neon  // copy rows of 16 bytes from [r2] (stride r3) to [r0] (stride r1)
-    ldr         ip,  [sp]  // height, the only stack argument
-copy_w16_loop:
-    subs        ip,  ip,  #4  // four rows per iteration
-    vld1.32     {d0-d1}, [r2], r3  // no alignment hint on the source
-    vld1.32     {d2-d3}, [r2], r3
-    vld1.32     {d4-d5}, [r2], r3
-    vld1.32     {d6-d7}, [r2], r3
-    vst1.32     {d0-d1}, [r0,:128], r1  // :128 is a 16-byte alignment hint on dst
-    vst1.32     {d2-d3}, [r0,:128], r1
-    vst1.32     {d4-d5}, [r0,:128], r1
-    vst1.32     {d6-d7}, [r0,:128], r1
-    bgt         copy_w16_loop  // flags from the subs at the top of the loop
-    bx          lr
-endfunc
-
-function x264_mc_copy_w16_aligned_neon  // as x264_mc_copy_w16_neon, but the source loads also carry a 16-byte alignment hint
-    ldr         ip,  [sp]  // height, the only stack argument
-copy_w16_aligned_loop:
-    subs        ip,  ip,  #4  // four rows per iteration
-    vld1.32     {d0-d1}, [r2,:128], r3  // :128 alignment hint on src
-    vld1.32     {d2-d3}, [r2,:128], r3
-    vld1.32     {d4-d5}, [r2,:128], r3
-    vld1.32     {d6-d7}, [r2,:128], r3
-    vst1.32     {d0-d1}, [r0,:128], r1  // :128 alignment hint on dst
-    vst1.32     {d2-d3}, [r0,:128], r1
-    vst1.32     {d4-d5}, [r0,:128], r1
-    vst1.32     {d6-d7}, [r0,:128], r1
-    bgt         copy_w16_aligned_loop  // flags from the subs at the top of the loop
-    bx          lr
-endfunc
-
-
-// void x264_mc_chroma_neon( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
-//                           uint8_t *src, intptr_t i_src_stride,
-//                           int dx, int dy, int i_width, int i_height );
-
-function x264_mc_chroma_neon  // bilinear chroma interpolation from interleaved source [r3] into two planes [r0] and [r1] (store stride r2)
-    push            {r4-r8, lr}  // 24 bytes
-    vpush           {d8-d11}  // 32 more bytes, so the stack arguments start at sp+56
-    ldrd            r4, r5, [sp, #56]  // r4 = source stride, r5 = horizontal displacement
-    ldrd            r6, r7, [sp, #64]  // r6 = vertical displacement, r7 = width
-
-    asr             lr, r6, #3  // whole rows of vertical displacement
-    mul             lr, r4, lr  // ... converted to a byte offset
-    add             r3, r3, r5, asr #2  // horizontal byte offset (bit 0 is cleared below)
-    cmp             r7, #4  // select the width variant; flags are consumed by the bgt/beq below
-
-    and             r5, r5, #7  // keep only the fractional parts (0..7)
-    and             r6, r6, #7
-
-    add             r3, r3, lr
-    bic             r3, r3, #0x1  // force the source pointer to an even byte
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    bgt             mc_chroma_w8  // width > 4
-    beq             mc_chroma_w4  // width == 4; otherwise fall through to mc_chroma_w2, the first code emitted below
-
-.macro CHROMA_MC_START r00, r01, r10, r11  // computes the four bilinear weights and loads the first two rows
-    muls            lr, r5, r6  // lr = fx*fy; Z is set when either fraction is zero
-    rsb             r7, lr, r6, lsl #3  // r7 = 8*fy - fx*fy
-    rsb             ip, lr, r5, lsl #3  // ip = 8*fx - fx*fy
-    sub             r5, lr, r5, lsl #3
-    sub             r5, r5, r6, lsl #3
-    add             r5, r5, #64  // r5 = 64 - 8*fx - 8*fy + fx*fy
-
-    beq             2f  // flags still come from the muls: take the one-dimensional path
-    vld2.8          {\r00-\r01}, [r3], r4  // deinterleaving load: even bytes / odd bytes
-
-    vdup.8          d0,    r5  // broadcast the four weights into d0-d3
-    vdup.8          d1,    ip
-
-    vdup.8          d2,    r7
-    vld2.8          {\r10-\r11}, [r3], r4  // second source row
-    vdup.8          d3,    lr
-    ldr             r5,    [sp, #72]  // fifth stack argument (height), used as the row counter
-.endm
-
-.macro CHROMA_MC width, align  // emits label mc_chroma_w<width>; align is the store element size in bits
-mc_chroma_w\width:
-    CHROMA_MC_START d4, d5,  d8, d9
-    vext.8          d6,  d4,  d6,  #1  // same row shifted by one sample (right-hand neighbours)
-    vext.8          d7,  d5,  d7,  #1
-    vext.8          d10, d8,  d10, #1
-    vext.8          d11, d9,  d11, #1
-// since the element size varies, there's a different index for the 2nd store
-.if \width == 4
-    .set st2, 1
-.else
-    .set st2, 2
-.endif
-
-    vtrn.32         d4, d6  // pack the samples and their right neighbours into one register each
-    vtrn.32         d5, d7
-    vtrn.32         d8, d10
-    vtrn.32         d9, d11
-
-    vtrn.32         d0, d1  // pack the weights the same way so one multiply covers both taps
-    vtrn.32         d2, d3
-
-1:  // height loop, interpolate xy
-
-    vmull.u8        q8,  d4,  d0  // upper row * upper-row weights
-    vmlal.u8        q8,  d8,  d2  // plus lower row * lower-row weights
-    vmull.u8        q9,  d5,  d0
-    vmlal.u8        q9,  d9,  d2
-
-    vld2.8          {d4-d5},  [r3], r4  // next source row
-
-    vext.8          d6,  d4,  d6,  #1
-    vext.8          d7,  d5,  d7,  #1
-
-    vadd.i16        d16, d16, d17  // add the two horizontal taps together
-    vadd.i16        d17, d18, d19
-
-    vtrn.32         d4,  d6
-    vtrn.32         d5,  d7
-
-    vmull.u8        q10, d8,  d0  // second output row
-    vmlal.u8        q10, d4,  d2
-    vmull.u8        q11, d9,  d0
-    vmlal.u8        q11, d5,  d2
-
-    vld2.8          {d8-d9},  [r3], r4
-
-    vrshrn.u16      d16, q8,  #6  // rounded divide by 64 and narrow to 8 bit
-
-    vext.8          d10, d8,  d10,  #1
-    vext.8          d11, d9,  d11,  #1
-
-    vadd.i16        d18, d20, d21
-    vadd.i16        d19, d22, d23
-
-    vtrn.32         d8, d10
-    vtrn.32         d9, d11
-
-    vrshrn.u16      d18, q9,  #6
-
-    subs            r5,  r5,  #2  // two output rows per iteration
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    vst1.\align     {d16[0]},   [r0,:\align], r2  // first plane, row 0
-    vst1.\align     {d16[st2]}, [r1,:\align], r2  // second plane, row 0
-    vst1.\align     {d18[0]},   [r0,:\align], r2  // first plane, row 1
-    vst1.\align     {d18[st2]}, [r1,:\align], r2  // second plane, row 1
-    bgt             1b
-
-    vpop            {d8-d11}
-    pop             {r4-r8, pc}
-
-2:  // dx or dy are 0
-    tst             r7,  r7  // r7 is 8*fy here (fx*fy is zero); Z set means no vertical fraction
-    add             ip,  ip,  r7  // the single non-zero second weight
-    vdup.8          d0,  r5
-    ldr             r5,  [sp, #72]  // height, used as the row counter
-    vdup.8          d1,  ip
-
-    beq             4f  // flags still come from the tst above
-
-    vld1.64          {d4}, [r3], r4  // plain (interleaved) loads on this path
-    vld1.64          {d6}, [r3], r4
-
-3:  // vertical interpolation loop
-
-    vmull.u8        q8,  d4,  d0
-    vmlal.u8        q8,  d6,  d1
-    vmull.u8        q9,  d6,  d0
-    vld1.64         {d4}, [r3], r4
-    vmlal.u8        q9,  d4,  d1
-    vld1.64         {d6}, [r3], r4
-
-    vrshrn.u16      d16, q8,  #6 // uvuvuvuv
-    vrshrn.u16      d17, q9,  #6 // uvuvuvuv
-    subs            r5,  r5,  #2  // two output rows per iteration
-    vuzp.8          d16, d17 // d16=uuuu|uuuu, d17=vvvv|vvvv
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    vst1.\align     {d16[0]},   [r0,:\align], r2  // after the vuzp both rows of one plane sit in the same register
-    vst1.\align     {d16[st2]}, [r0,:\align], r2
-    vst1.\align     {d17[0]},   [r1,:\align], r2
-    vst1.\align     {d17[st2]}, [r1,:\align], r2
-    bgt             3b
-
-    vpop            {d8-d11}
-    pop             {r4-r8, pc}
-
-4:  // dy is 0
-
-    vld1.64         {d4-d5},  [r3], r4
-    vld1.64         {d6-d7},  [r3], r4
-
-    vext.8          d5,  d4,  d5,  #2  // shift by 2 bytes = one interleaved sample pair to the right
-    vext.8          d7,  d6,  d7,  #2
-
-5:  // horizontal interpolation loop
-
-    vmull.u8        q8,  d4,  d0
-    vmlal.u8        q8,  d5,  d1
-    vmull.u8        q9,  d6,  d0
-    vmlal.u8        q9,  d7,  d1
-
-    subs            r5,  r5,  #2  // two output rows per iteration
-    vld1.64         {d4-d5},  [r3], r4
-    vld1.64         {d6-d7},  [r3], r4
-    vext.8          d5,  d4,  d5,  #2
-    vrshrn.u16      d16, q8,  #6
-    vrshrn.u16      d17, q9,  #6
-    vext.8          d7,  d6,  d7,  #2
-    vuzp.8          d16, d17  // split even and odd bytes into d16 and d17
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    vst1.\align     {d16[0]},   [r0,:\align], r2
-    vst1.\align     {d16[st2]}, [r0,:\align], r2
-    vst1.\align     {d17[0]},   [r1,:\align], r2
-    vst1.\align     {d17[st2]}, [r1,:\align], r2
-    bgt             5b
-
-    vpop            {d8-d11}
-    pop             {r4-r8, pc}
-.endm
-
-   CHROMA_MC 2, 16  // emitted first, so it is the fall-through target of the width dispatch above
-   CHROMA_MC 4, 32
-
-mc_chroma_w8:  // hand-written variant: 8 samples per plane and row
-    CHROMA_MC_START d4, d7, d8, d11
-    vext.8          d5,  d4,  d5,  #1  // right-hand neighbours of each sample
-    vext.8          d9,  d8,  d9,  #1
-    vext.8          d7,  d6,  d7,  #1
-    vext.8          d11, d10, d11,  #1
-
-1:  // height loop, interpolate xy
-    vmull.u8        q8,  d4,  d0  // four-tap bilinear sum, weights in d0-d3
-    vmlal.u8        q8,  d5,  d1
-    vmlal.u8        q8,  d8,  d2
-    vmlal.u8        q8,  d9,  d3
-
-    vmull.u8        q9,  d6,  d0
-    vmlal.u8        q9,  d7,  d1
-    vmlal.u8        q9,  d10,  d2
-    vmlal.u8        q9,  d11,  d3
-
-    vld2.8          {d4-d7}, [r3], r4  // next source row
-
-    vext.8          d5,  d4,  d5,  #1
-    vext.8          d7,  d6,  d7,  #1
-
-    vmull.u8        q10, d8,   d0  // second output row
-    vmlal.u8        q10, d9,   d1
-    vmlal.u8        q10, d4,   d2
-    vmlal.u8        q10, d5,   d3
-
-    vmull.u8        q11, d10,  d0
-    vmlal.u8        q11, d11,  d1
-    vmlal.u8        q11, d6,   d2
-    vmlal.u8        q11, d7,   d3
-
-    subs            r5,  r5,   #2  // two output rows per iteration
-    vld2.8          {d8-d11}, [r3], r4
-
-    vrshrn.u16      d16, q8,  #6  // rounded divide by 64 and narrow to 8 bit
-    vrshrn.u16      d17, q9,  #6
-    vrshrn.u16      d18, q10, #6
-    vext.8          d9,  d8,  d9,  #1
-    vrshrn.u16      d19, q11, #6
-    vext.8          d11, d10, d11,  #1
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    vst1.64         {d16}, [r0,:64], r2  // first plane, row 0
-    vst1.64         {d17}, [r1,:64], r2  // second plane, row 0
-    vst1.64         {d18}, [r0,:64], r2  // first plane, row 1
-    vst1.64         {d19}, [r1,:64], r2  // second plane, row 1
-
-    bgt             1b
-
-    vpop            {d8-d11}
-    pop             {r4-r8, pc}
-
-2:  // dx or dy are 0
-    tst             r7,  r7  // Z set means no vertical fraction
-    add             ip,  ip,  r7  // the single non-zero second weight
-    vdup.8          d0,  r5
-    ldr             r5,  [sp, #72]  // height, used as the row counter
-    vdup.8          d1,  ip
-
-    beq             4f  // flags still come from the tst above
-
-    vld2.8          {d4-d5}, [r3], r4
-    vld2.8          {d6-d7}, [r3], r4
-
-3:  // vertical interpolation loop
-    vmull.u8        q8,  d4,  d0 //U
-    vmlal.u8        q8,  d6,  d1
-    vmull.u8        q9,  d5,  d0 //V
-    vmlal.u8        q9,  d7,  d1
-
-    vld2.8          {d4-d5}, [r3], r4
-
-    vmull.u8        q10, d6,  d0
-    vmlal.u8        q10, d4,  d1
-    vmull.u8        q11, d7,  d0
-    vmlal.u8        q11, d5,  d1
-
-    vld2.8          {d6-d7}, [r3], r4
-
-    vrshrn.u16      d16, q8,  #6
-    vrshrn.u16      d17, q9,  #6
-    vrshrn.u16      d18, q10, #6
-    vrshrn.u16      d19, q11, #6
-    subs            r5,  r5,  #2  // two output rows per iteration
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    vst1.64         {d16}, [r0,:64], r2
-    vst1.64         {d17}, [r1,:64], r2
-    vst1.64         {d18}, [r0,:64], r2
-    vst1.64         {d19}, [r1,:64], r2
-
-    bgt             3b
-
-    vpop            {d8-d11}
-    pop             {r4-r8, pc}
-
-4:  // dy is 0
-
-    vld2.8          {d4-d7},  [r3], r4
-    vld2.8          {d8-d11}, [r3], r4
-    vext.8          d5,  d4,  d5,  #1  // right-hand neighbours of each sample
-    vext.8          d7,  d6,  d7,  #1
-    vext.8          d9,  d8,  d9,  #1
-    vext.8          d11, d10, d11, #1
-
-5:  // horizontal interpolation loop
-    subs            r5,  r5,  #2  // two output rows per iteration; flags consumed by the bgt at the bottom
-    vmull.u8        q8,  d4,  d0 //U
-    vmlal.u8        q8,  d5,  d1
-    vmull.u8        q9,  d6,  d0 //V
-    vmlal.u8        q9,  d7,  d1
-
-    vld2.8          {d4-d7}, [r3], r4
-
-    vmull.u8        q10, d8,  d0
-    vmlal.u8        q10, d9,  d1
-    vmull.u8        q11, d10, d0
-    vmlal.u8        q11, d11, d1
-
-    vld2.8          {d8-d11}, [r3], r4
-
-    vext.8          d5,  d4,  d5,  #1
-    vrshrn.u16      d16, q8,  #6
-    vext.8          d7,  d6,  d7,  #1
-    vrshrn.u16      d17, q9,  #6
-    vext.8          d9,  d8,  d9,  #1
-    vrshrn.u16      d18, q10, #6
-    vext.8          d11, d10, d11, #1
-    vrshrn.u16      d19, q11, #6
-
-    pld             [r3]
-    pld             [r3, r4]
-
-    vst1.64         {d16}, [r0,:64], r2
-    vst1.64         {d17}, [r1,:64], r2
-    vst1.64         {d18}, [r0,:64], r2
-    vst1.64         {d19}, [r1,:64], r2
-    bgt             5b
-
-    vpop            {d8-d11}
-    pop             {r4-r8, pc}
-
-endfunc
-
-
-// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width )
-function x264_hpel_filter_v_neon  // vertical 6-tap filter (1,-5,20,20,-5,1); writes 16-bit sums to buf and rounded bytes to dst
-    ldr             ip,  [sp]  // width, the only stack argument
-    sub             r1,  r1,  r3,  lsl #1  // start two rows above src
-    push            {lr}  // lr is used as scratch below
-    add             lr,  r1,  ip  // lr = top-row pointer + width, used to recompute r1 for each column block
-    vmov.u8         d30, #5  // filter weights
-    vmov.u8         d31, #20
-
-filter_v_loop:
-    subs            ip,  ip,  #16  // 16 columns per iteration; flags consumed by the bgt at the bottom
-    vld1.64         {d0-d1},   [r1,:128], r3  // six consecutive rows of 16 bytes
-    vld1.64         {d2-d3},   [r1,:128], r3
-    vld1.64         {d4-d5},   [r1,:128], r3
-    vld1.64         {d6-d7},   [r1,:128], r3
-    vld1.64         {d16-d17}, [r1,:128], r3
-    vld1.64         {d18-d19}, [r1,:128], r3
-    sub             r1,  lr,  ip  // back to the top row, at the next 16-column block
-
-    vaddl.u8        q10, d0,  d18  // low 8 columns: row0 + row5
-    vmlsl.u8        q10, d2,  d30  // - 5*row1
-    vmlal.u8        q10, d4,  d31  // + 20*row2
-    vmlal.u8        q10, d6,  d31  // + 20*row3
-    vmlsl.u8        q10, d16, d30  // - 5*row4
-
-    vaddl.u8        q11, d1,  d19  // high 8 columns, same filter
-    vmlsl.u8        q11, d3,  d30
-    vmlal.u8        q11, d5,  d31
-    vmlal.u8        q11, d7,  d31
-    vmlsl.u8        q11, d17, d30
-
-    vqrshrun.s16    d0,  q10, #5  // rounded divide by 32, saturated to unsigned 8 bit
-    vst1.64         {d20-d21}, [r2,:128]!  // unshifted 16-bit sums go to buf
-    vqrshrun.s16    d1,  q11, #5
-    vst1.64         {d22-d23}, [r2,:128]!
-    vst1.64         {d0-d1},   [r0,:128]!  // filtered pixels go to dst
-    bgt             filter_v_loop
-    pop             {pc}
-endfunc
-
-// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
-function x264_hpel_filter_c_neon  // horizontal 6-tap filter over the 16-bit values in buf (r1); writes bytes to dst (r0), r2 = width
-    sub             r1,  #16  // back up 8 int16 entries so the left neighbours are available
-    vld1.64         {d0-d3}, [r1,:128]!  // q0 = the 8 entries left of the start, q1 = the first 8 entries
-
-    // unrolled 2x: 4% faster
-filter_c_loop:
-    subs            r2,  r2,  #16  // 16 outputs per half of the unrolled loop
-    vld1.64         {d4-d7}, [r1,:128]!
-    vext.16         q8,  q0,  q1,  #6  // entries at offset -2
-    vext.16         q12, q1,  q2,  #3  // entries at offset +3
-    vadd.s16        q8,  q8,  q12  // a = p[-2] + p[3]
-    vext.16         q9,  q0,  q1,  #7  // offset -1
-    vext.16         q11, q1,  q2,  #2  // offset +2
-    vadd.s16        q9,  q9,  q11  // b = p[-1] + p[2]
-    vext.16         q10, q1,  q2,  #1  // offset +1
-    vext.16         q11, q1,  q2,  #6  // offset -2 for the next 8 outputs
-    vadd.s16        q10, q1,  q10  // c = p[0] + p[1]
-    vsub.s16        q8,  q8,  q9    // a-b
-    vext.16         q15, q2,  q3,  #3
-    vsub.s16        q9,  q9,  q10   // b-c
-
-    vext.16         q12, q1,  q2,  #7
-    vshr.s16        q8,  q8,  #2    // (a-b)/4
-    vadd.s16        q11, q11, q15
-    vext.16         q14, q2,  q3,  #2
-    vsub.s16        q8,  q8,  q9    // (a-b)/4-b+c
-    vadd.s16        q12, q12, q14
-    vext.16         q13, q2,  q3,  #1
-
-    vshr.s16        q8,  q8,  #2    // ((a-b)/4-b+c)/4
-    vadd.s16        q13, q2,  q13
-    vadd.s16        q8,  q8,  q10   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vsub.s16        q11, q11, q12   // a-b
-    vsub.s16        q12, q12, q13   // b-c
-    vshr.s16        q11, q11, #2    // (a-b)/4
-    vqrshrun.s16    d30, q8,  #6  // rounded shift by 6, saturated to unsigned 8 bit
-    vsub.s16        q11, q11, q12   // (a-b)/4-b+c
-    vshr.s16        q11, q11, #2    // ((a-b)/4-b+c)/4
-    vld1.64         {d0-d3}, [r1,:128]!  // fetch data for the second half early
-    vadd.s16        q11, q11, q13   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-
-    vext.16         q8,  q2,  q3,  #6  // begin the next 16 outputs before storing
-    vqrshrun.s16    d31, q11,  #6
-    vext.16         q12, q3,  q0,  #3
-    vadd.s16        q8,  q8,  q12
-    vext.16         q9,  q2,  q3,  #7
-    vst1.64         {d30-d31}, [r0,:128]!  // 16 output bytes
-    bxle            lr  // flags from the last subs (NEON ops leave them untouched): return when the width is used up
-    subs            r2,  r2,  #16
-
-    vext.16         q11, q3,  q0,  #2
-    vadd.s16        q9,  q9,  q11
-    vext.16         q10, q3,  q0,  #1
-    vext.16         q11, q3,  q0,  #6
-    vadd.s16        q10, q3,  q10
-    vsub.s16        q8,  q8,  q9    // a-b
-    vext.16         q15, q0,  q1,  #3
-    vsub.s16        q9,  q9,  q10   // b-c
-
-    vext.16         q12, q3,  q0,  #7
-    vshr.s16        q8,  q8,  #2    // (a-b)/4
-    vadd.s16        q11, q11, q15
-    vext.16         q14, q0,  q1,  #2
-    vsub.s16        q8,  q8,  q9    // (a-b)/4-b+c
-    vadd.s16        q12, q12, q14
-    vext.16         q13, q0,  q1,  #1
-
-    vshr.s16        q8,  q8,  #2    // ((a-b)/4-b+c)/4
-    vadd.s16        q13, q0,  q13
-    vadd.s16        q8,  q8,  q10   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vsub.s16        q11, q11, q12   // a-b
-    vsub.s16        q12, q12, q13   // b-c
-    vshr.s16        q11, q11, #2    // (a-b)/4
-    vqrshrun.s16    d30, q8,  #6
-    vsub.s16        q11, q11, q12   // (a-b)/4-b+c
-    vshr.s16        q11, q11, #2    // ((a-b)/4-b+c)/4
-    vadd.s16        q11, q11, q13   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-
-    vqrshrun.s16    d31, q11,  #6
-    vst1.64         {d30-d31}, [r0,:128]!
-    bgt             filter_c_loop  // flags from the second subs
-    bx              lr
-endfunc
-
-// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
-function x264_hpel_filter_h_neon  // horizontal 6-tap filter (1,-5,20,20,-5,1) on bytes: src = r1, dst = r0, width = r2
-    sub             r1,  #16  // back up 16 bytes so the left neighbours are available
-    vmov.u8         d30, #5  // filter weights
-    vld1.64         {d0-d3}, [r1,:128]!  // q0 = the 16 bytes left of the start, q1 = the first 16 bytes
-    vmov.u8         d31, #20
-
-    // unrolled 3x because it's 5% faster, due to mitigating
-    // the high latency of multiplication and vqrshrun
-filter_h_loop:
-    subs            r2,  r2,  #16  // 16 outputs per third of the unrolled loop
-    vld1.64         {d4-d5}, [r1,:128]!  // following 16 bytes, needed for the right neighbours
-    vext.8          q8,  q0,  q1,  #14  // bytes at offset -2
-    vext.8          q12, q1,  q2,  #3  // offset +3
-    vaddl.u8        q13, d16, d24  // p[-2] + p[3], low 8 outputs
-    vext.8          q9,  q0,  q1,  #15  // offset -1
-    vaddl.u8        q14, d17, d25  // p[-2] + p[3], high 8 outputs
-
-    vext.8          q10, q1,  q2,  #1  // offset +1
-    vmlal.u8        q13, d2,  d31  // + 20*p[0]
-    vmlsl.u8        q13, d18, d30  // - 5*p[-1]
-    vext.8          q11, q1,  q2,  #2  // offset +2
-    vmlal.u8        q13, d20, d31  // + 20*p[1]
-    vmlsl.u8        q13, d22, d30  // - 5*p[2]
-
-    vmlsl.u8        q14, d19, d30  // same taps for the high 8 outputs
-    vmlal.u8        q14, d3,  d31
-    vmlal.u8        q14, d21, d31
-    vmlsl.u8        q14, d23, d30
-    vqrshrun.s16    d6,  q13, #5  // rounded divide by 32, saturated to unsigned 8 bit
-
-    vld1.64         {d0-d1}, [r1,:128]!  // q0 is reused for the next block; roles rotate q1 -> q2 -> q0
-    vext.8          q8,  q1,  q2,  #14  // begin the next 16 outputs before storing
-    vext.8          q12, q2,  q0,  #3
-    vaddl.u8        q13, d16, d24
-    vqrshrun.s16    d7,  q14, #5
-    vext.8          q9,  q1,  q2,  #15
-    vaddl.u8        q14, d17, d25
-
-    vst1.64         {d6-d7}, [r0,:128]!  // 16 output bytes
-    bxle            lr  // flags from the last subs: return when the width is used up
-    subs            r2,  r2,  #16
-
-    vext.8          q10, q2,  q0,  #1
-    vmlal.u8        q13, d4,  d31
-    vmlsl.u8        q13, d18, d30
-    vext.8          q11, q2,  q0,  #2
-    vmlal.u8        q13, d20, d31
-    vmlsl.u8        q13, d22, d30
-
-    vmlsl.u8        q14, d19, d30
-    vmlal.u8        q14, d5,  d31
-    vmlal.u8        q14, d21, d31
-    vmlsl.u8        q14, d23, d30
-    vqrshrun.s16    d6,  q13, #5
-
-    vld1.64         {d2-d3}, [r1,:128]!
-    vext.8          q8,  q2,  q0,  #14
-    vext.8          q12, q0,  q1,  #3
-    vaddl.u8        q13, d16, d24
-    vqrshrun.s16    d7,  q14, #5
-    vext.8          q9,  q2,  q0,  #15
-    vaddl.u8        q14, d17, d25
-
-    vst1.64         {d6-d7}, [r0,:128]!
-    bxle            lr  // second early exit
-    subs            r2,  r2,  #16
-
-    vext.8          q10, q0,  q1,  #1
-    vmlal.u8        q13, d0,  d31
-    vmlsl.u8        q13, d18, d30
-    vext.8          q11, q0,  q1,  #2
-    vmlal.u8        q13, d20, d31
-    vmlsl.u8        q13, d22, d30
-
-    vmlsl.u8        q14, d19, d30
-    vmlal.u8        q14, d1,  d31
-    vmlal.u8        q14, d21, d31
-    vmlsl.u8        q14, d23, d30
-
-    vqrshrun.s16    d6, q13, #5
-    vqrshrun.s16    d7, q14, #5
-    vst1.64         {d6-d7}, [r0,:128]!
-    bgt             filter_h_loop  // flags from the third subs
-    bx              lr
-endfunc
-
-
-// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
-//                         uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width,
-//                         int height )
-function x264_frame_init_lowres_core_neon  // builds four half-resolution planes (r1, r2, r3, r4) from src0 (r0) by rounding averages
-    push            {r4-r10,lr}  // 32 bytes
-    vpush           {d8-d15}  // 64 more bytes, so the stack arguments start at sp+96
-    ldrd            r4,  r5,  [sp, #96]  // r4 = dstc, r5 = src_stride
-    ldrd            r6,  r7,  [sp, #104]  // r6 = dst_stride, r7 = width
-    ldr             lr,  [sp, #112]  // height
-    sub             r10, r6,  r7            // dst_stride - width
-    and             r10, r10, #~15  // rounded down to a multiple of 16: added to the dst pointers after each row
-
-lowres_yloop:
-    mov             ip,  r7                 // width
-    mov             r6,  r0                 // src0
-    add             r8,  r0,  r5            // src1 = src0 + src_stride
-    add             r9,  r0,  r5,  lsl #1   // src2 = src1 + src_stride
-
-    vld2.8          {d8, d10}, [r6,:128]!  // deinterleave: even columns into d8, odd columns into d10
-    vld2.8          {d12,d14}, [r8,:128]!
-    vld2.8          {d16,d18}, [r9,:128]!
-
-lowres_xloop:
-    subs            ip,  ip,  #16  // 16 output pixels per half of the unrolled loop
-
-    vld2.8          {d9, d11}, [r6,:128]!  // completes q4 (even columns) and q5 (odd columns) of row 0
-    vld2.8          {d13,d15}, [r8,:128]!
-    vrhadd.u8       q0,  q4,  q6  // even columns: average of rows 0 and 1
-    vld2.8          {d17,d19}, [r9,:128]!
-    vrhadd.u8       q5,  q5,  q7  // odd columns: average of rows 0 and 1
-    vld2.8          {d20,d22}, [r6,:128]!  // look ahead: supplies the column past this block
-    vrhadd.u8       q1,  q6,  q8  // even columns: average of rows 1 and 2
-    vld2.8          {d24,d26}, [r8,:128]!
-    vrhadd.u8       q7,  q7,  q9  // odd columns: average of rows 1 and 2
-    vext.8          q4,  q4,  q10, #1  // even columns shifted by one position
-    vrhadd.u8       q0,  q0,  q5  // value for the plane at r1
-    vext.8          q6,  q6,  q12, #1
-    vrhadd.u8       q1,  q1,  q7  // value for the plane at r3
-    vld2.8          {d28,d30}, [r9,:128]!
-    vrhadd.u8       q4,  q4,  q6
-    vext.8          q8,  q8,  q14, #1
-    vrhadd.u8       q6,  q6,  q8
-    vst1.64         {d0-d1},   [r1,:128]!
-    vrhadd.u8       q2,  q4,  q5  // value for the plane at r2
-    vst1.64         {d2-d3},   [r3,:128]!
-    vrhadd.u8       q3,  q6,  q7  // value for the plane at r4
-    vst1.64         {d4-d5},   [r2,:128]!
-    vst1.64         {d6-d7},   [r4,:128]!
-
-    ble             lowres_xloop_end  // flags from the subs above (NEON ops leave them untouched)
-    subs            ip,  ip,  #16
-
-    vld2.8          {d21,d23}, [r6,:128]!  // second half: same computation with the register roles swapped
-    vld2.8          {d25,d27}, [r8,:128]!
-    vrhadd.u8       q0,  q10, q12
-    vld2.8          {d29,d31}, [r9,:128]!
-    vrhadd.u8       q11, q11, q13
-    vld2.8          {d8, d10}, [r6,:128]!
-    vrhadd.u8       q1,  q12, q14
-    vld2.8          {d12,d14}, [r8,:128]!
-    vrhadd.u8       q13, q13, q15
-    vext.8          q10, q10, q4,  #1
-    vrhadd.u8       q0,  q0,  q11
-    vext.8          q12, q12, q6,  #1
-    vrhadd.u8       q1,  q1,  q13
-    vld2.8          {d16,d18}, [r9,:128]!
-    vrhadd.u8       q10, q10, q12
-    vext.8          q14, q14, q8,  #1
-    vrhadd.u8       q12, q12, q14
-    vst1.64         {d0-d1},   [r1,:128]!
-    vrhadd.u8       q2,  q10, q11
-    vst1.64         {d2-d3},   [r3,:128]!
-    vrhadd.u8       q3,  q12, q13
-    vst1.64         {d4-d5},   [r2,:128]!
-    vst1.64         {d6-d7},   [r4,:128]!
-
-    bgt             lowres_xloop
-
-lowres_xloop_end:
-    subs            lr,  lr,  #1  // one output row done
-    add             r0,  r0,  r5,  lsl #1  // source advances two rows per output row
-    add             r1,  r1,  r10  // skip the rest of each destination row
-    add             r2,  r2,  r10
-    add             r3,  r3,  r10
-    add             r4,  r4,  r10
-    bgt             lowres_yloop  // flags from the subs lr above
-
-    vpop            {d8-d15}
-    pop             {r4-r10,pc}
-endfunc
-
-function x264_load_deinterleave_chroma_fdec_neon  // splits r3 rows of 16 interleaved bytes at [r1] (stride r2) into two 8-byte halves at [r0]
-    mov             ip,  #FDEC_STRIDE/2  // distance between the two halves of a destination row
-1:
-    vld2.8          {d0-d1}, [r1,:128], r2  // d0 = even bytes, d1 = odd bytes
-    subs            r3,  r3,  #1  // row counter; flags consumed by the bgt below
-    pld             [r1]
-    vst1.8          {d0},    [r0,:64], ip  // even bytes at the row start
-    vst1.8          {d1},    [r0,:64], ip  // odd bytes FDEC_STRIDE/2 later; r0 ends one FDEC_STRIDE further on
-    bgt             1b
-
-    bx              lr
-endfunc
-
-function x264_load_deinterleave_chroma_fenc_neon  // splits r3 rows of 16 interleaved bytes at [r1] (stride r2) into two 8-byte halves at [r0]
-    mov             ip,  #FENC_STRIDE/2  // distance between the two halves of a destination row
-1:
-    vld2.8          {d0-d1}, [r1,:128], r2  // d0 = even bytes, d1 = odd bytes
-    subs            r3,  r3,  #1  // row counter; flags consumed by the bgt below
-    pld             [r1]
-    vst1.8          {d0},    [r0,:64], ip  // even bytes at the row start
-    vst1.8          {d1},    [r0,:64], ip  // odd bytes FENC_STRIDE/2 later; r0 ends one FENC_STRIDE further on
-    bgt             1b
-
-    bx              lr
-endfunc
-
-function x264_plane_copy_core_neon  // copies a plane from [r2] (stride r3) to [r0] (stride r1); width and height come from the stack
-    push            {r4,lr}  // 8 bytes, so the stack arguments start at sp+8
-    ldr             r4,  [sp, #8]  // width
-    ldr             lr,  [sp, #12]  // height
-    add             r12, r4,  #15
-    bic             r4,  r12, #15  // width rounded up to a multiple of 16
-    sub             r1,  r1,  r4  // strides become the gap left after the bytes copied per row
-    sub             r3,  r3,  r4
-1:
-    mov             r12, r4  // bytes left in this row
-16:
-    tst             r12, #16  // odd number of 16-byte blocks?
-    beq             32f
-    subs            r12, r12, #16  // copy a single 16-byte block first
-    vld1.8          {q0}, [r2]!
-    vst1.8          {q0}, [r0]!
-    beq             0f  // flags from the subs: that block was the whole row
-32:
-    subs            r12, r12, #32  // then 32 bytes per iteration
-    vld1.8          {q0, q1}, [r2]!
-    vst1.8          {q0, q1}, [r0]!
-    bgt             32b
-0:
-    subs            lr,  lr,  #1  // next row
-    add             r2,  r2,  r3
-    add             r0,  r0,  r1
-    bgt             1b  // flags from the subs lr above
-
-    pop             {r4,pc}
-endfunc
-
-function x264_plane_copy_deinterleave_neon  // splits interleaved bytes at [r4] into planes [r0] (stride r1) and [r2] (stride r3)
-    push            {r4-r7, lr}  // 20 bytes, so the stack arguments start at sp+20
-    ldrd            r6, r7, [sp, #28]  // r6 = width, r7 = height
-    ldrd            r4, r5, [sp, #20]  // r4 = source pointer, r5 = source stride
-    add             lr,  r6,  #15
-    bic             lr,  lr,  #15  // width rounded up to a multiple of 16
-    sub             r1,  r1,  lr  // strides become the gap left after the bytes handled per row
-    sub             r3,  r3,  lr
-    sub             r5,  r5,  lr, lsl #1  // the source row holds two bytes per output byte
-block:
-    vld2.8          {d0-d3}, [r4,:128]!  // 32 source bytes: q0 = even bytes, q1 = odd bytes
-    subs            lr,  lr,  #16
-    vst1.8          {q0},    [r0]!
-    vst1.8          {q1},    [r2]!
-    bgt             block  // flags from the subs lr above
-
-    add             r4,  r4,  r5  // advance all three pointers to the next row
-    subs            r7,  r7,  #1
-    add             r0,  r0,  r1
-    add             r2,  r2,  r3
-    mov             lr,  r6  // unrounded width; the loop still runs until lr <= 0 in steps of 16
-    bgt             block  // flags from the subs r7 above
-
-    pop             {r4-r7, pc}
-endfunc
-
-function x264_plane_copy_deinterleave_rgb_neon  // splits packed 3- or 4-byte pixels at [r6] into three planes [r0], [r2], [r4]
-    push            {r4-r8, r10, r11, lr}  // 32 bytes, so the stack arguments start at sp+32
-    ldrd            r4,  r5,  [sp, #32]  // r4 = third destination pointer, r5 = its stride
-    ldrd            r6,  r7,  [sp, #40]  // r6 = source pointer, r7 = source stride
-    ldr             r8,  [sp, #48]  // bytes per source pixel
-    ldrd            r10, r11, [sp, #52]  // r10 = width, r11 = height
-    add             lr,  r10, #7
-    subs            r8,  r8,  #3  // Z set when pixels are 3 bytes wide; the flags survive until the bne below
-    bic             lr,  lr,  #7  // width rounded up to a multiple of 8
-    sub             r7,  r7,  lr, lsl #1  // source stride minus 2*width so far
-    sub             r1,  r1,  lr  // destination strides become end-of-row gaps
-    sub             r3,  r3,  lr
-    sub             r5,  r5,  lr
-    subne           r7,  r7,  lr, lsl #1  // 4-byte pixels: 4*width subtracted in total
-    subeq           r7,  r7,  lr  // 3-byte pixels: 3*width subtracted in total
-    bne             block4
-block3:
-    vld3.8          {d0,d1,d2}, [r6]!  // 8 pixels, split into three byte planes
-    subs            lr,  lr,  #8
-    vst1.8          {d0},    [r0]!
-    vst1.8          {d1},    [r2]!
-    vst1.8          {d2},    [r4]!
-    bgt             block3  // flags from the subs lr above
-
-    subs            r11, r11, #1  // next row
-    add             r0,  r0,  r1
-    add             r2,  r2,  r3
-    add             r4,  r4,  r5
-    add             r6,  r6,  r7
-    mov             lr,  r10  // unrounded width; the loop still runs until lr <= 0 in steps of 8
-    bgt             block3  // flags from the subs r11 above
-
-    pop             {r4-r8, r10, r11, pc}
-block4:
-    vld4.8          {d0,d1,d2,d3}, [r6]!  // 8 pixels of 4 bytes; the fourth byte (d3) is discarded
-    subs            lr,  lr,  #8
-    vst1.8          {d0},    [r0]!
-    vst1.8          {d1},    [r2]!
-    vst1.8          {d2},    [r4]!
-    bgt             block4  // flags from the subs lr above
-
-    subs            r11, r11, #1  // next row
-    add             r0,  r0,  r1
-    add             r2,  r2,  r3
-    add             r4,  r4,  r5
-    add             r6,  r6,  r7
-    mov             lr,  r10
-    bgt             block4  // flags from the subs r11 above
-
-    pop             {r4-r8, r10, r11, pc}
-endfunc
-
-function x264_plane_copy_interleave_core_neon  // interleaves the bytes of planes [r2] and [r4] into [r0]
-    push            {r4-r7, lr}  // 20 bytes, so the stack arguments start at sp+20
-    ldrd            r6, r7, [sp, #28]  // r6 = width, r7 = height
-    ldrd            r4, r5, [sp, #20]  // r4 = second source pointer, r5 = its stride
-    add             lr,  r6,  #15
-    bic             lr,  lr,  #15  // width rounded up to a multiple of 16
-    sub             r1,  r1,  lr, lsl #1  // the destination row holds two bytes per source byte
-    sub             r3,  r3,  lr  // source strides become end-of-row gaps
-    sub             r5,  r5,  lr
-blocki:
-    vld1.8          {q0}, [r2]!  // 16 bytes from the first source
-    vld1.8          {q1}, [r4]!  // 16 bytes from the second source
-    subs            lr,  lr,  #16
-    vst2.8          {d0,d2}, [r0]!  // interleaved store of the low halves
-    vst2.8          {d1,d3}, [r0]!  // interleaved store of the high halves
-    bgt             blocki  // flags from the subs lr above
-
-    subs            r7,  r7,  #1  // next row
-    add             r0,  r0,  r1
-    add             r2,  r2,  r3
-    add             r4,  r4,  r5
-    mov             lr,  r6  // unrounded width; the loop still runs until lr <= 0 in steps of 16
-    bgt             blocki  // flags from the subs r7 above
-
-    pop             {r4-r7, pc}
-endfunc
-
-function x264_plane_copy_swap_core_neon  // copies [r2] (stride r3) to [r0] (stride r1), swapping the two bytes of every 16-bit pair
-    push            {r4-r5, lr}  // 12 bytes, so the stack arguments start at sp+12
-    ldrd            r4, r5, [sp, #12]  // r4 = width counted in byte pairs, r5 = height
-    add             lr,  r4,  #15
-    bic             lr,  lr,  #15  // width rounded up to a multiple of 16 pairs
-    sub             r1,  r1,  lr, lsl #1  // two bytes per pair: strides become end-of-row gaps
-    sub             r3,  r3,  lr, lsl #1
-1:
-    vld1.8          {q0, q1}, [r2]!  // 32 bytes = 16 pairs
-    subs            lr,  lr,  #16
-    vrev16.8        q0,  q0  // reverse the bytes within each halfword
-    vrev16.8        q1,  q1
-    vst1.8          {q0, q1}, [r0]!
-    bgt             1b  // flags from the subs lr above
-
-    subs            r5,  r5,  #1  // next row
-    add             r0,  r0,  r1
-    add             r2,  r2,  r3
-    mov             lr,  r4  // unrounded width; the loop still runs until lr <= 0 in steps of 16
-    bgt             1b  // flags from the subs r5 above
-
-    pop             {r4-r5, pc}
-endfunc
-
-function x264_store_interleave_chroma_neon  // interleaves 8-byte rows from [r2] and [r3] into 16-byte rows at [r0] (stride r1)
-    push            {lr}  // lr is used as the row counter
-    ldr             lr,  [sp, #4]  // first stack argument: number of rows
-    mov             ip,  #FDEC_STRIDE  // both sources advance by FDEC_STRIDE per row
-1:
-    vld1.8          {d0}, [r2], ip
-    vld1.8          {d1}, [r3], ip
-    subs            lr,  lr,  #1
-    vst2.8          {d0,d1}, [r0,:128], r1  // bytes alternate d0[i], d1[i]
-    bgt             1b  // flags from the subs above
-
-    pop             {pc}
-endfunc
-
-.macro integral4h p1, p2  // q0 = q2 + sum of 4 consecutive bytes starting at each of the 8 positions in p1 (p2 = following bytes); clobbers q1
-    vext.8          d1,  \p1, \p2,  #1  // bytes at offset +1
-    vext.8          d2,  \p1, \p2,  #2  // offset +2
-    vext.8          d3,  \p1, \p2,  #3  // offset +3
-    vaddl.u8        q0,  \p1, d1  // widen to 16 bit while adding
-    vaddl.u8        q1,  d2,  d3
-    vadd.u16        q0,  q0,  q1  // 4-byte horizontal sum
-    vadd.u16        q0,  q0,  q2  // plus the caller-supplied 16-bit values in q2
-.endm
-
-function integral_init4h_neon  // out[x] (16 bit, at r0) = in[x] (16 bit, at r0 - 2*r2 bytes) + sum of the 4 bytes at r1[x..x+3]
-    sub             r3,  r0,  r2, lsl #1  // r3 = r0 minus r2 16-bit entries; presumably the previous row of sums (TODO confirm)
-    vld1.8          {d6, d7}, [r1, :128]!  // first 16 pixel bytes
-1:
-    subs            r2,  r2,  #16  // 16 outputs per iteration
-    vld1.16         {q2},  [r3, :128]!  // addend for outputs 0-7
-    integral4h      d6, d7
-    vld1.8          {d6},  [r1, :64]!  // next 8 pixels; d6 now follows d7 in memory
-    vld1.16         {q2},  [r3, :128]!  // addend for outputs 8-15
-    vst1.16         {q0},  [r0, :128]!
-    integral4h      d7, d6
-    vld1.8          {d7},  [r1, :64]!  // restore the d6,d7 order for the next iteration
-    vst1.16         {q0},  [r0, :128]!
-    bgt             1b  // flags from the subs at the top of the loop
-    bx              lr
-endfunc
-
-.macro integral8h p1, p2, s  // q0 = s + sum of 8 consecutive bytes starting at each of the 8 positions in p1; clobbers q1-q3
-    vext.8          d1,  \p1,  \p2,  #1  // bytes at offsets +1 .. +7
-    vext.8          d2,  \p1,  \p2,  #2
-    vext.8          d3,  \p1,  \p2,  #3
-    vext.8          d4,  \p1,  \p2,  #4
-    vext.8          d5,  \p1,  \p2,  #5
-    vext.8          d6,  \p1,  \p2,  #6
-    vext.8          d7,  \p1,  \p2,  #7
-    vaddl.u8        q0,  \p1,  d1  // pairwise widening adds
-    vaddl.u8        q1,  d2,   d3
-    vaddl.u8        q2,  d4,   d5
-    vaddl.u8        q3,  d6,   d7
-    vadd.u16        q0,  q0,   q1  // reduce to the 8-byte horizontal sum
-    vadd.u16        q2,  q2,   q3
-    vadd.u16        q0,  q0,   q2
-    vadd.u16        q0,  q0,   \s  // plus the caller-supplied 16-bit values
-.endm
-
-function integral_init8h_neon
-    sub             r3,  r0,  r2, lsl #1
-    vld1.8          {d16, d17}, [r1, :128]!
-1:
-    subs            r2,  r2,  #16
-    vld1.16         {q9},  [r3, :128]!
-    integral8h      d16, d17, q9
-    vld1.8          {d16}, [r1, :64]!
-    vld1.16         {q9},  [r3, :128]!
-    vst1.16         {q0},  [r0, :128]!
-    integral8h      d17, d16, q9
-    vld1.8          {d17}, [r1, :64]!
-    vst1.16         {q0},  [r0, :128]!
-    bgt             1b
-    bx              lr
-endfunc
-
-function integral_init4v_neon
-    push            {r4-r5}
-    mov             r3,   r0
-    add             r4,   r0,   r2,  lsl #3
-    add             r5,   r0,   r2,  lsl #4
-    sub             r2,   r2,   #8
-    vld1.16         {q11, q12}, [r3]!
-    vld1.16         {q8,  q9},  [r5]!
-    vld1.16         {q13}, [r3]!
-    vld1.16         {q10}, [r5]!
-1:
-    subs            r2,   r2,   #16
-    vld1.16         {q14, q15}, [r4]!
-    vext.8          q0,   q11,  q12, #8
-    vext.8          q1,   q12,  q13, #8
-    vext.8          q2,   q8,   q9,  #8
-    vext.8          q3,   q9,   q10, #8
-    vsub.u16        q14,  q14,  q11
-    vsub.u16        q15,  q15,  q12
-    vadd.u16        q0,   q0,   q11
-    vadd.u16        q1,   q1,   q12
-    vadd.u16        q2,   q2,   q8
-    vadd.u16        q3,   q3,   q9
-    vst1.16         {q14},  [r1]!
-    vst1.16         {q15},  [r1]!
-    vmov            q11,  q13
-    vmov            q8,   q10
-    vsub.u16        q0,   q2,   q0
-    vsub.u16        q1,   q3,   q1
-    vld1.16         {q12, q13}, [r3]!
-    vld1.16         {q9,  q10}, [r5]!
-    vst1.16         {q0}, [r0]!
-    vst1.16         {q1}, [r0]!
-    bgt             1b
-2:
-    pop             {r4-r5}
-    bx              lr
-endfunc
-
-function integral_init8v_neon
-    add             r2,  r0,  r1,  lsl #4
-    sub             r1,  r1,  #8
-    ands            r3,  r1,  #16 - 1
-    beq             1f
-    subs            r1,  r1,  #8
-    vld1.16         {q0}, [r0]
-    vld1.16         {q2}, [r2]!
-    vsub.u16        q8,  q2,  q0
-    vst1.16         {q8}, [r0]!
-    ble             2f
-1:
-    subs            r1,  r1,  #16
-    vld1.16         {q0, q1}, [r0]
-    vld1.16         {q2, q3}, [r2]!
-    vsub.u16        q8,  q2,  q0
-    vsub.u16        q9,  q3,  q1
-    vst1.16         {q8},  [r0]!
-    vst1.16         {q9},  [r0]!
-    bgt             1b
-2:
-    bx              lr
-endfunc
-
-function x264_mbtree_propagate_cost_neon
-    push            {r4-r5,lr}
-    ldrd            r4, r5, [sp, #12]
-    ldr             lr, [sp, #20]
-    vld1.32         {d6[], d7[]},  [r5]
-8:
-    subs            lr,  lr,  #8
-    vld1.16         {q8},  [r1]!
-    vld1.16         {q9},  [r2]!
-    vld1.16         {q10}, [r3]!
-    vld1.16         {q11}, [r4]!
-    vbic.u16        q10, #0xc000
-    vmin.u16        q10, q9,  q10
-    vmull.u16       q12, d18, d22           @ propagate_intra
-    vmull.u16       q13, d19, d23           @ propagate_intra
-    vsubl.u16       q14, d18, d20           @ propagate_num
-    vsubl.u16       q15, d19, d21           @ propagate_num
-    vmovl.u16       q10, d18                @ propagate_denom
-    vmovl.u16       q11, d19                @ propagate_denom
-    vmovl.u16       q9,  d17
-    vmovl.u16       q8,  d16
-    vcvt.f32.s32    q12, q12
-    vcvt.f32.s32    q13, q13
-    vcvt.f32.s32    q14, q14
-    vcvt.f32.s32    q15, q15
-    vcvt.f32.s32    q10, q10
-    vcvt.f32.s32    q11, q11
-    vrecpe.f32      q0,  q10
-    vrecpe.f32      q1,  q11
-    vcvt.f32.s32    q8,  q8
-    vcvt.f32.s32    q9,  q9
-    vrecps.f32      q10, q0,  q10
-    vrecps.f32      q11, q1,  q11
-    vmla.f32        q8,  q12, q3            @ propagate_amount
-    vmla.f32        q9,  q13, q3            @ propagate_amount
-    vmul.f32        q0,  q0,  q10
-    vmul.f32        q1,  q1,  q11
-    vmul.f32        q8,  q8,  q14
-    vmul.f32        q9,  q9,  q15
-    vmul.f32        q0,  q8,  q0
-    vmul.f32        q1,  q9,  q1
-    vcvt.s32.f32    q0,  q0
-    vcvt.s32.f32    q1,  q1
-    vqmovn.s32      d0,  q0
-    vqmovn.s32      d1,  q1
-    vst1.16         {q0},  [r0]!
-    bgt             8b
-    pop             {r4-r5,pc}
-endfunc
-
-function x264_mbtree_propagate_list_internal_neon
-    vld2.16         {d4[], d5[]}, [sp]      @ bipred_weight, mb_y
-    movrel          r12, pw_0to15
-    vmov.u16        q10, #0xc000
-    vld1.16         {q0},  [r12, :128]      @h->mb.i_mb_x,h->mb.i_mb_y
-    vmov.u32        q11, #4
-    vmov.u8         q3,  #32
-    vdup.u16        q8,  d5[0]              @ mb_y
-    vzip.u16        q0,  q8
-    ldr             r12, [sp, #8]
-8:
-    subs            r12, r12,  #8
-    vld1.16         {q14},  [r1, :128]!      @ propagate_amount
-    vld1.16         {q15},  [r2]!            @ lowres_cost
-    vld1.16         {q8, q9},  [r0]!
-    vand            q15, q15, q10
-    vceq.u16        q1,  q15, q10
-    vmull.u16       q12, d28, d4
-    vmull.u16       q13, d29, d4
-    vrshrn.u32      d30, q12, #6
-    vrshrn.u32      d31, q13, #6
-    vbsl            q1,  q15, q14           @ if( lists_used == 3 )
-    @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
-    vshr.s16        q12, q8,  #5
-    vshr.s16        q13, q9,  #5
-    vuzp.16         q8,  q9                 @ x & 31, y & 31
-    vadd.s16        q12, q12, q0
-    vadd.s16        q0,  q0,  q11
-    vmovn.i16       d16, q8
-    vmovn.i16       d17, q9
-    vadd.s16        q13, q13, q0
-    vbic.i16        q8,  #128+64+32
-    vadd.s16        q0,  q0,  q11
-    vbic.i16        q8,  #(128+64+32)<<8
-    vst1.16         {q12, q13},  [r3, :128]!
-    vsub.i8         q9,  q3,  q8
-    vmull.u8        q12, d17, d16           @ idx3weight = y*x
-    vmull.u8        q14, d19, d16           @ idx1weight = (32-y)*x
-    vmull.u8        q15, d19, d18           @ idx0weight = (32-y)*(32-x)
-    vmull.u8        q13, d17, d18           @ idx2weight = y*(32-x)
-    vmull.u16       q9,  d28, d2            @ idx1weight
-    vmull.u16       q8,  d29, d3
-    vmull.u16       q14, d30, d2            @ idx0weight
-    vmull.u16       q15, d31, d3
-    vrshrn.u32      d18, q9,  #10           @ idx1weight
-    vrshrn.u32      d19, q8,  #10
-    vrshrn.u32      d16, q14, #10           @ idx0weight
-    vrshrn.u32      d17, q15, #10
-    vmull.u16       q14, d24, d2            @ idx3weight
-    vmull.u16       q15, d25, d3
-    vzip.16         q8,  q9
-    vmull.u16       q12, d26, d2            @ idx2weight
-    vmull.u16       q13, d27, d3
-    vst1.16         {q8, q9},   [r3, :128]!
-    vrshrn.u32      d19, q15, #10           @ idx3weight
-    vrshrn.u32      d18, q14, #10
-    vrshrn.u32      d16, q12, #10           @ idx2weight
-    vrshrn.u32      d17, q13, #10
-    vzip.16         q8,  q9
-    vst1.16         {q8, q9},   [r3, :128]!
-    bge             8b
-    bx              lr
-endfunc
-
-@ void mbtree_fix8_pack( int16_t *dst, float *src, int count )
-function x264_mbtree_fix8_pack_neon, export=1
-    subs            r3,  r2,  #8
-    blt             2f
-1:
-    subs            r3,  r3,  #8
-    vld1.32         {q0,q1}, [r1,:128]!
-    vcvt.s32.f32    q0,  q0,  #8
-    vcvt.s32.f32    q1,  q1,  #8
-    vqmovn.s32      d4,  q0
-    vqmovn.s32      d5,  q1
-    vrev16.8        q3,  q2
-    vst1.16         {q3}, [r0,:128]!
-    bge             1b
-2:
-    adds            r3,  r3,  #8
-    bxeq            lr
-3:
-    subs            r3,  r3,  #1
-    vld1.32         {d0[0]}, [r1]!
-    vcvt.s32.f32    s0,  s0,  #8
-    vrev16.8        d0,  d0
-    vst1.16         {d0[0]}, [r0]!
-    bgt             3b
-
-    bx              lr
-endfunc
-
-@ void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
-function x264_mbtree_fix8_unpack_neon, export=1
-    subs            r3,  r2,  #8
-    blt             2f
-1:
-    subs            r3,  r3,  #8
-    vld1.16         {q0}, [r1,:128]!
-    vrev16.8        q1,  q0
-    vmovl.s16       q0,  d2
-    vmovl.s16       q1,  d3
-    vcvt.f32.s32    q0,  q0,  #8
-    vcvt.f32.s32    q1,  q1,  #8
-    vst1.32         {q0,q1}, [r0,:128]!
-    bge             1b
-2:
-    adds            r3,  r3,  #8
-    bxeq            lr
-3:
-    subs            r3,  r3,  #1
-    vld1.16         {d0[0]}, [r1]!
-    vrev16.8        d0,  d0
-    vmovl.s16       q0,  d0
-    vcvt.f32.s32    d0,  d0,  #8
-    vst1.32         {d0[0]}, [r0]!
-    bgt             3b
-
-    bx              lr
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/mc-c.c b/android/src/main/libenc/jni/libx264/common/arm/mc-c.c
deleted file mode 100755
index ae1a686..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/mc-c.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*****************************************************************************
- * mc-c.c: arm motion compensation
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "mc.h"
-
-void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
-void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-
-void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
-void x264_memzero_aligned_neon( void *dst, size_t n );
-
-void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-
-void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
-
-void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
-                                pixel *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
-                                         pixel *dstv, intptr_t i_dstv,
-                                         pixel *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
-                                            pixel *dstb, intptr_t i_dstb,
-                                            pixel *dstc, intptr_t i_dstc,
-                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
-                                           pixel *srcu, intptr_t i_srcu,
-                                           pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
-                                     pixel *src, intptr_t i_src, int w, int h );
-
-void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
-
-#if !HIGH_BIT_DEPTH
-#define MC_WEIGHT(func)\
-void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
-\
-static weight_fn_t x264_mc##func##_wtab_neon[6] =\
-{\
-    x264_mc_weight_w4##func##_neon,\
-    x264_mc_weight_w4##func##_neon,\
-    x264_mc_weight_w8##func##_neon,\
-    x264_mc_weight_w16##func##_neon,\
-    x264_mc_weight_w16##func##_neon,\
-    x264_mc_weight_w20##func##_neon,\
-};
-
-MC_WEIGHT()
-MC_WEIGHT(_nodenom)
-MC_WEIGHT(_offsetadd)
-MC_WEIGHT(_offsetsub)
-#endif
-
-void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-
-void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
-void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
-
-void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
-void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
-void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
-
-void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
-void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
-void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
-void integral_init8v_neon( uint16_t *, intptr_t );
-
-void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
-
-void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
-void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
-
-#if !HIGH_BIT_DEPTH
-static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
-{
-    if( w->i_scale == 1<<w->i_denom )
-    {
-        if( w->i_offset < 0 )
-        {
-            w->weightfn = x264_mc_offsetsub_wtab_neon;
-            w->cachea[0] = -w->i_offset;
-        }
-        else
-        {
-            w->weightfn = x264_mc_offsetadd_wtab_neon;
-            w->cachea[0] = w->i_offset;
-        }
-    }
-    else if( !w->i_denom )
-        w->weightfn = x264_mc_nodenom_wtab_neon;
-    else
-        w->weightfn = x264_mc_wtab_neon;
-}
-
-static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
-{
-    NULL,
-    x264_pixel_avg2_w4_neon,
-    x264_pixel_avg2_w8_neon,
-    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
-    x264_pixel_avg2_w16_neon,
-    x264_pixel_avg2_w20_neon,
-};
-
-static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
-{
-    NULL,
-    x264_mc_copy_w4_neon,
-    x264_mc_copy_w8_neon,
-    NULL,
-    x264_mc_copy_w16_neon,
-};
-
-static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
-                          uint8_t *src[4], intptr_t i_src_stride,
-                          int mvx, int mvy,
-                          int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if ( (mvy&3) == 3 )             // explict if() to force conditional add
-        src1 += i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_neon[i_width>>2](
-                dst, i_dst_stride, src1, i_src_stride,
-                src2, i_height );
-        if( weight->weightfn )
-            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
-    }
-    else if( weight->weightfn )
-        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
-    else
-        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
-}
-
-static uint8_t *get_ref_neon( uint8_t *dst,   intptr_t *i_dst_stride,
-                              uint8_t *src[4], intptr_t i_src_stride,
-                              int mvx, int mvy,
-                              int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if ( (mvy&3) == 3 )             // explict if() to force conditional add
-        src1 += i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_neon[i_width>>2](
-                dst, *i_dst_stride, src1, i_src_stride,
-                src2, i_height );
-        if( weight->weightfn )
-            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
-        return dst;
-    }
-    else if( weight->weightfn )
-    {
-        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
-        return dst;
-    }
-    else
-    {
-        *i_dst_stride = i_src_stride;
-        return src1;
-    }
-}
-
-static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-                              intptr_t stride, int width, int height, int16_t *buf )
-{
-    intptr_t realign = (intptr_t)src & 15;
-    src -= realign;
-    dstv -= realign;
-    dstc -= realign;
-    dsth -= realign;
-    width += realign;
-    while( height-- )
-    {
-        x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );
-        x264_hpel_filter_c_neon( dstc, buf+8, width );
-        x264_hpel_filter_h_neon( dsth, src, width );
-        dsth += stride;
-        dstv += stride;
-        dstc += stride;
-        src  += stride;
-    }
-}
-
-PLANE_COPY(16, neon)
-PLANE_COPY_SWAP(16, neon)
-PLANE_INTERLEAVE(neon)
-#endif // !HIGH_BIT_DEPTH
-
-PROPAGATE_LIST(neon)
-
-void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
-{
-    if( !(cpu&X264_CPU_ARMV6) )
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
-    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
-    pf->prefetch_ref  = x264_prefetch_ref_arm;
-#endif // !HIGH_BIT_DEPTH
-
-    if( !(cpu&X264_CPU_NEON) )
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
-    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
-    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
-    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
-
-    pf->plane_copy              = x264_plane_copy_neon;
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
-    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
-    pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
-    pf->plane_copy_swap = x264_plane_copy_swap_neon;
-
-    pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
-
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
-    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
-    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
-    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
-    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
-    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
-    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
-    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
-
-    pf->weight    = x264_mc_wtab_neon;
-    pf->offsetadd = x264_mc_offsetadd_wtab_neon;
-    pf->offsetsub = x264_mc_offsetsub_wtab_neon;
-    pf->weight_cache = x264_weight_cache_neon;
-
-    pf->mc_chroma = x264_mc_chroma_neon;
-    pf->mc_luma = mc_luma_neon;
-    pf->get_ref = get_ref_neon;
-    pf->hpel_filter = hpel_filter_neon;
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
-
-    pf->integral_init4h = integral_init4h_neon;
-    pf->integral_init8h = integral_init8h_neon;
-    pf->integral_init4v = integral_init4v_neon;
-    pf->integral_init8v = integral_init8v_neon;
-
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
-    pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
-    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
-    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
-#endif // !HIGH_BIT_DEPTH
-
-// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
-#ifndef SYS_MACOSX
-    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
-#endif
-    pf->memzero_aligned = x264_memzero_aligned_neon;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/arm/mc.h b/android/src/main/libenc/jni/libx264/common/arm/mc.h
deleted file mode 100755
index c4d8033..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/mc.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*****************************************************************************
- * mc.h: arm motion compensation
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ARM_MC_H
-#define X264_ARM_MC_H
-
-void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/arm/pixel-a.S b/android/src/main/libenc/jni/libx264/common/arm/pixel-a.S
deleted file mode 100755
index b158e61..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/pixel-a.S
+++ /dev/null
@@ -1,1511 +0,0 @@
-/*****************************************************************************
- * pixel.S: arm pixel metrics
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-.align 4
-
-.rept 16
-        .byte 0xff
-.endr
-mask_ff:
-.rept 16
-        .byte 0
-.endr
-
-mask_ac4:
-.short 0, -1, -1, -1,  0, -1, -1, -1
-mask_ac8:
-.short 0, -1, -1, -1, -1, -1, -1, -1
-
-.text
-
-.macro SAD4_ARMV6 h
-function x264_pixel_sad_4x\h\()_armv6
-    push        {r4-r6,lr}
-    ldr         r4, [r2], r3
-    ldr         r5, [r0], r1
-    ldr         r6, [r2], r3
-    ldr         lr, [r0], r1
-    usad8       ip, r4, r5
-.rept (\h - 2)/2
-    ldr         r4, [r2], r3
-    ldr         r5, [r0], r1
-    usada8      ip, r6, lr, ip
-    ldr         r6, [r2], r3
-    ldr         lr, [r0], r1
-    usada8      ip, r4, r5, ip
-.endr
-    usada8      r0, r6, lr, ip
-    pop         {r4-r6,pc}
-endfunc
-.endm
-
-SAD4_ARMV6 4
-SAD4_ARMV6 8
-
-
-.macro SAD_START_4 align:vararg
-    vld1.32     {d1[]}, [r2\align], r3
-    vld1.32     {d0[]}, [r0,:32], r1
-    vabdl.u8    q8,  d0,  d1
-.endm
-
-.macro SAD_4 align:vararg
-    vld1.32     {d1[]}, [r2\align], r3
-    vld1.32     {d0[]}, [r0,:32], r1
-    vabal.u8    q8,  d0,  d1
-.endm
-
-.macro SAD_START_8 align:vararg
-    vld1.64     {d1}, [r2\align], r3
-    vld1.64     {d0}, [r0,:64], r1
-    vabdl.u8    q8,  d0,  d1
-.endm
-
-.macro SAD_8 align:vararg
-    vld1.64     {d1}, [r2\align], r3
-    vld1.64     {d0}, [r0,:64], r1
-    vabal.u8    q8,  d0,  d1
-.endm
-
-.macro SAD_START_16 align:vararg
-    vld1.64     {d2-d3}, [r2\align], r3
-    vld1.64     {d0-d1}, [r0,:128], r1
-    vabdl.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2\align], r3
-    vabdl.u8    q9,  d1,  d3
-    vld1.64     {d4-d5}, [r0,:128], r1
-.endm
-
-.macro SAD_16 align:vararg
-    vabal.u8    q8,  d4,  d6
-    vld1.64     {d2-d3}, [r2\align], r3
-    vabal.u8    q9,  d5,  d7
-    vld1.64     {d0-d1}, [r0,:128], r1
-    vabal.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2\align], r3
-    vabal.u8    q9,  d1,  d3
-    vld1.64     {d4-d5}, [r0,:128], r1
-.endm
-
-.macro SAD_FUNC w, h, name, align:vararg
-function x264_pixel_sad\name\()_\w\()x\h\()_neon
-    SAD_START_\w \align
-
-.if \w == 16
-.rept \h / 2 - 1
-    SAD_\w \align
-.endr
-.else
-.rept \h - 1
-    SAD_\w \align
-.endr
-.endif
-
-.if \w > 8
-    vabal.u8    q8,  d4,  d6
-    vabal.u8    q9,  d5,  d7
-    vadd.u16    q8,  q8,  q9
-.endif
-.if \w > 4
-    vadd.u16    d16, d16, d17
-.endif
-    vpadd.u16   d0,  d16, d16
-    vpaddl.u16  d0,  d0
-    vmov.u32    r0,  d0[0]
-    bx          lr
-endfunc
-.endm
-
-SAD_FUNC  4,  4
-SAD_FUNC  4,  8
-SAD_FUNC  8,  4
-SAD_FUNC  8,  8
-SAD_FUNC  8,  16
-SAD_FUNC  16, 8
-SAD_FUNC  16, 16
-
-SAD_FUNC  4,  4,  _aligned, ,:32
-SAD_FUNC  4,  8,  _aligned, ,:32
-SAD_FUNC  8,  4,  _aligned, ,:64
-SAD_FUNC  8,  8,  _aligned, ,:64
-SAD_FUNC  8,  16, _aligned, ,:64
-SAD_FUNC  16, 8,  _aligned, ,:128
-SAD_FUNC  16, 16, _aligned, ,:128
-
-// If dual issue is possible, use additional accumulators to avoid
-// stalls from vadal's latency. This only matters for aligned.
-.macro SAD_DUAL_START_8
-    SAD_START_8 ,:64
-    vld1.64     {d3}, [r2,:64], r3
-    vld1.64     {d2}, [r0,:64], r1
-    vabdl.u8    q9,  d2,  d3
-.endm
-
-.macro SAD_DUAL_8 align:vararg
-    vld1.64     {d1}, [r2,:64], r3
-    vld1.64     {d0}, [r0,:64], r1
-    vabal.u8    q8,  d0,  d1
-    vld1.64     {d3}, [r2,:64], r3
-    vld1.64     {d2}, [r0,:64], r1
-    vabal.u8    q9,  d2,  d3
-.endm
-
-.macro SAD_DUAL_START_16
-    SAD_START_16 ,:128
-    vabdl.u8    q10, d4,  d6
-    vld1.64     {d2-d3}, [r2,:128], r3
-    vabdl.u8    q11, d5,  d7
-    vld1.64     {d0-d1}, [r0,:128], r1
-.endm
-
-.macro SAD_DUAL_16
-    vabal.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2,:128], r3
-    vabal.u8    q9,  d1,  d3
-    vld1.64     {d4-d5}, [r0,:128], r1
-    vabal.u8    q10, d4,  d6
-    vld1.64     {d2-d3}, [r2,:128], r3
-    vabal.u8    q11, d5,  d7
-    vld1.64     {d0-d1}, [r0,:128], r1
-.endm
-
-.macro SAD_DUAL_END_16
-    vabal.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2,:128], r3
-    vabal.u8    q9,  d1,  d3
-    vld1.64     {d4-d5}, [r0,:128], r1
-    vabal.u8    q10, d4,  d6
-    vabal.u8    q11, d5,  d7
-.endm
-
-.macro SAD_FUNC_DUAL w, h
-function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
-    SAD_DUAL_START_\w
-.rept \h / 2 - \w / 8
-    SAD_DUAL_\w
-.endr
-
-.if \w > 8
-    SAD_DUAL_END_16
-    vadd.u16    q8,  q8,  q9
-    vadd.u16    q9,  q10, q11
-.endif
-.if \w > 4
-    vadd.u16    q8,  q8,  q9
-    vadd.u16    d16, d16, d17
-.endif
-    vpadd.u16   d0,  d16, d16
-    vpaddl.u16  d0,  d0
-    vmov.u32    r0,  d0[0]
-    bx          lr
-endfunc
-.endm
-
-SAD_FUNC_DUAL  8,  4
-SAD_FUNC_DUAL  8,  8
-SAD_FUNC_DUAL  8,  16
-SAD_FUNC_DUAL  16, 8
-SAD_FUNC_DUAL  16, 16
-
-
-.macro SAD_X_START_4 x
-    vld1.32     {d0[]}, [r0,:32], lr
-    vld1.32     {d1[]}, [r1], r6
-    vabdl.u8    q8,  d1,  d0
-    vld1.32     {d2[]}, [r2], r6
-    vabdl.u8    q9,  d2,  d0
-    vld1.32     {d3[]}, [r3], r6
-    vabdl.u8    q10, d3,  d0
-.if \x == 4
-    vld1.32     {d4[]}, [r12], r6
-    vabdl.u8    q11, d4,  d0
-.endif
-.endm
-
-.macro SAD_X_4 x
-    vld1.32     {d0[]}, [r0,:32], lr
-    vld1.32     {d1[]}, [r1], r6
-    vabal.u8    q8,  d1,  d0
-    vld1.32     {d2[]}, [r2], r6
-    vabal.u8    q9,  d2,  d0
-    vld1.32     {d3[]}, [r3], r6
-    vabal.u8    q10, d3,  d0
-.if \x == 4
-    vld1.32     {d4[]}, [r12], r6
-    vabal.u8    q11, d4,  d0
-.endif
-.endm
-
-.macro SAD_X_START_8 x
-    vld1.64     {d0}, [r0,:64], lr
-    vld1.64     {d1}, [r1], r6
-    vabdl.u8    q8,  d1,  d0
-    vld1.64     {d2}, [r2], r6
-    vabdl.u8    q9,  d2,  d0
-    vld1.64     {d3}, [r3], r6
-    vabdl.u8    q10, d3,  d0
-.if \x == 4
-    vld1.64     {d4}, [r12], r6
-    vabdl.u8    q11, d4,  d0
-.endif
-.endm
-
-.macro SAD_X_8 x
-    vld1.64     {d0}, [r0,:64], lr
-    vld1.64     {d1}, [r1], r6
-    vabal.u8    q8,  d1,  d0
-    vld1.64     {d2}, [r2], r6
-    vabal.u8    q9,  d2,  d0
-    vld1.64     {d3}, [r3], r6
-    vabal.u8    q10, d3,  d0
-.if \x == 4
-    vld1.64     {d4}, [r12], r6
-    vabal.u8    q11, d4,  d0
-.endif
-.endm
-
-.macro SAD_X_START_16 x
-    vld1.64     {d0-d1}, [r0,:128], lr
-    vld1.64     {d2-d3}, [r1], r6
-    vabdl.u8    q8,  d2,  d0
-    vabdl.u8    q12, d3,  d1
-    vld1.64     {d4-d5}, [r2], r6
-    vabdl.u8    q9,  d4,  d0
-    vabdl.u8    q13, d5,  d1
-    vld1.64     {d6-d7}, [r3], r6
-    vabdl.u8    q10, d6,  d0
-    vabdl.u8    q14, d7,  d1
-.if \x == 4
-    vld1.64     {d2-d3}, [r12], r6
-    vabdl.u8    q11, d2,  d0
-    vabdl.u8    q15, d3,  d1
-.endif
-.endm
-
-.macro SAD_X_16 x
-    vld1.64     {d0-d1}, [r0,:128], lr
-    vld1.64     {d2-d3}, [r1], r6
-    vabal.u8    q8,  d2,  d0
-    vabal.u8    q12, d3,  d1
-    vld1.64     {d4-d5}, [r2], r6
-    vabal.u8    q9,  d4,  d0
-    vabal.u8    q13, d5,  d1
-    vld1.64     {d6-d7}, [r3], r6
-    vabal.u8    q10, d6,  d0
-    vabal.u8    q14, d7,  d1
-.if \x == 4
-    vld1.64     {d2-d3}, [r12], r6
-    vabal.u8    q11, d2,  d0
-    vabal.u8    q15, d3,  d1
-.endif
-.endm
-
-.macro SAD_X_FUNC x, w, h
-function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
-    push        {r6-r7,lr}
-.if \x == 3
-    ldrd        r6,  r7,  [sp, #12]
-.else
-    ldrd        r6,  r7,  [sp, #16]
-    ldr         r12, [sp, #12]
-.endif
-    mov         lr,  #FENC_STRIDE
-
-    SAD_X_START_\w \x
-.rept \h - 1
-    SAD_X_\w \x
-.endr
-
-// add up the sads
-.if \w > 8
-    vadd.u16    q8,  q8,  q12
-    vadd.u16    q9,  q9,  q13
-    vadd.u16    q10, q10, q14
-.if \x == 4
-    vadd.u16    q11, q11, q15
-.endif
-.endif
-.if \w > 4
-    vadd.u16    d16, d16, d17
-    vadd.u16    d18, d18, d19
-    vadd.u16    d20, d20, d21
-.if \x == 4
-    vadd.u16    d22, d22, d23
-.endif
-.endif
-    vpadd.u16   d0,  d16, d18
-    vpadd.u16   d1,  d20, d22
-    vpaddl.u16  q0,  q0
-
-.if \x == 3
-    vst1.32     {d0},    [r7]!
-    vst1.32     {d1[0]}, [r7,:32]
-.else
-    vst1.32     {d0-d1}, [r7]
-.endif
-    pop         {r6-r7,pc}
-endfunc
-.endm
-
-SAD_X_FUNC  3, 4,  4
-SAD_X_FUNC  3, 4,  8
-SAD_X_FUNC  3, 8,  4
-SAD_X_FUNC  3, 8,  8
-SAD_X_FUNC  3, 8,  16
-SAD_X_FUNC  3, 16, 8
-SAD_X_FUNC  3, 16, 16
-
-SAD_X_FUNC  4, 4,  4
-SAD_X_FUNC  4, 4,  8
-SAD_X_FUNC  4, 8,  4
-SAD_X_FUNC  4, 8,  8
-SAD_X_FUNC  4, 8,  16
-SAD_X_FUNC  4, 16, 8
-SAD_X_FUNC  4, 16, 16
-
-function x264_pixel_vsad_neon
-    subs        r2,  r2,    #2
-    vld1.8     {q0}, [r0],  r1
-    vld1.8     {q1}, [r0],  r1
-    vabdl.u8    q2,  d0,    d2
-    vabdl.u8    q3,  d1,    d3
-    ble         2f
-1:
-    subs        r2,  r2,    #2
-    vld1.8     {q0}, [r0],  r1
-    vabal.u8    q2,  d2,    d0
-    vabal.u8    q3,  d3,    d1
-    vld1.8     {q1}, [r0],  r1
-    blt         2f
-    vabal.u8    q2,  d0,    d2
-    vabal.u8    q3,  d1,    d3
-    bgt         1b
-2:
-    vadd.u16    q0,  q2,    q3
-    HORIZ_ADD   d0,  d0,    d1
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_pixel_asd8_neon
-    ldr         r12, [sp,  #0]
-    sub         r12,  r12, #2
-    vld1.8     {d0}, [r0], r1
-    vld1.8     {d1}, [r2], r3
-    vld1.8     {d2}, [r0], r1
-    vld1.8     {d3}, [r2], r3
-    vsubl.u8    q8,   d0,  d1
-1:
-    subs        r12,  r12,  #2
-    vld1.8     {d4}, [r0], r1
-    vld1.8     {d5}, [r2], r3
-    vsubl.u8    q9,   d2,  d3
-    vsubl.u8    q10,  d4,  d5
-    vadd.s16    q8,   q9
-    vld1.8     {d2}, [r0], r1
-    vld1.8     {d3}, [r2], r3
-    vadd.s16    q8,   q10
-    bgt         1b
-    vsubl.u8    q9,   d2,  d3
-    vadd.s16    q8,   q9
-    vpaddl.s16  q8,   q8
-    vpadd.s32   d16,  d16, d17
-    vpadd.s32   d16,  d16, d17
-    vabs.s32    d16,  d16
-    vmov.32     r0,   d16[0]
-    bx          lr
-endfunc
-
-
-.macro SSD_START_4
-    vld1.32     {d16[]}, [r0,:32], r1
-    vld1.32     {d17[]}, [r2,:32], r3
-    vsubl.u8    q2, d16, d17
-    vld1.32     {d16[]}, [r0,:32], r1
-    vmull.s16   q0, d4, d4
-    vld1.32     {d17[]}, [r2,:32], r3
-.endm
-
-.macro SSD_4
-    vsubl.u8    q2, d16, d17
-    vld1.32     {d16[]}, [r0,:32], r1
-    vmlal.s16   q0, d4, d4
-    vld1.32     {d17[]}, [r2,:32], r3
-.endm
-
-.macro SSD_END_4
-    vsubl.u8    q2, d16, d17
-    vmlal.s16   q0, d4, d4
-.endm
-
-.macro SSD_START_8
-    vld1.64     {d16}, [r0,:64], r1
-    vld1.64     {d17}, [r2,:64], r3
-    vsubl.u8    q2, d16, d17
-    vld1.64     {d16}, [r0,:64], r1
-    vmull.s16   q0, d4, d4
-    vmlal.s16   q0, d5, d5
-    vld1.64     {d17}, [r2,:64], r3
-.endm
-
-.macro SSD_8
-    vsubl.u8    q2, d16, d17
-    vld1.64     {d16}, [r0,:64], r1
-    vmlal.s16   q0, d4, d4
-    vmlal.s16   q0, d5, d5
-    vld1.64     {d17}, [r2,:64], r3
-.endm
-
-.macro SSD_END_8
-    vsubl.u8    q2, d16, d17
-    vmlal.s16   q0, d4, d4
-    vmlal.s16   q0, d5, d5
-.endm
-
-.macro SSD_START_16
-    vld1.64     {d16-d17}, [r0,:128], r1
-    vld1.64     {d18-d19}, [r2,:128], r3
-    vsubl.u8    q2, d16, d18
-    vsubl.u8    q3, d17, d19
-    vld1.64     {d16-d17}, [r0,:128], r1
-    vmull.s16   q0, d4, d4
-    vmlal.s16   q0, d5, d5
-    vld1.64     {d18-d19}, [r2,:128], r3
-    vmlal.s16   q0, d6, d6
-    vmlal.s16   q0, d7, d7
-.endm
-
-.macro SSD_16
-    vsubl.u8    q2, d16, d18
-    vsubl.u8    q3, d17, d19
-    vld1.64     {d16-d17}, [r0,:128], r1
-    vmlal.s16   q0, d4, d4
-    vmlal.s16   q0, d5, d5
-    vld1.64     {d18-d19}, [r2,:128], r3
-    vmlal.s16   q0, d6, d6
-    vmlal.s16   q0, d7, d7
-.endm
-
-.macro SSD_END_16
-    vsubl.u8    q2, d16, d18
-    vsubl.u8    q3, d17, d19
-    vmlal.s16   q0, d4, d4
-    vmlal.s16   q0, d5, d5
-    vmlal.s16   q0, d6, d6
-    vmlal.s16   q0, d7, d7
-.endm
-
-.macro SSD_FUNC w h
-function x264_pixel_ssd_\w\()x\h\()_neon
-    SSD_START_\w
-.rept \h-2
-    SSD_\w
-.endr
-    SSD_END_\w
-    vadd.s32    d0, d0, d1
-    vpadd.s32   d0, d0, d0
-    vmov.32     r0, d0[0]
-    bx          lr
-endfunc
-.endm
-
-SSD_FUNC   4, 4
-SSD_FUNC   4, 8
-SSD_FUNC   8, 4
-SSD_FUNC   8, 8
-SSD_FUNC   8, 16
-SSD_FUNC  16, 8
-SSD_FUNC  16, 16
-
-function x264_pixel_ssd_nv12_core_neon
-    push       {r4-r5}
-    ldrd        r4,  r5,  [sp, #8]
-    add         r12, r4,  #8
-    bic         r12, r12, #15
-    vmov.u64    q8,  #0
-    vmov.u64    q9,  #0
-    sub         r1,  r1,  r12, lsl #1
-    sub         r3,  r3,  r12, lsl #1
-1:
-    subs        r12, r4,  #16
-    vld2.8     {d0,d1},   [r0]!
-    vld2.8     {d2,d3},   [r2]!
-    vld2.8     {d4,d5},   [r0]!
-    vld2.8     {d6,d7},   [r2]!
-
-    vsubl.u8    q10, d0,  d2
-    vsubl.u8    q11, d1,  d3
-    vmull.s16   q14, d20, d20
-    vmull.s16   q15, d22, d22
-    vsubl.u8    q12, d4,  d6
-    vsubl.u8    q13, d5,  d7
-    vmlal.s16   q14, d21, d21
-    vmlal.s16   q15, d23, d23
-
-    blt         4f
-    beq         3f
-2:
-    vmlal.s16   q14, d24, d24
-    vmlal.s16   q15, d26, d26
-    vld2.8     {d0,d1},   [r0]!
-    vld2.8     {d2,d3},   [r2]!
-    vmlal.s16   q14, d25, d25
-    vmlal.s16   q15, d27, d27
-
-    subs        r12, r12, #16
-    vsubl.u8    q10, d0,  d2
-    vsubl.u8    q11, d1,  d3
-    vmlal.s16   q14, d20, d20
-    vmlal.s16   q15, d22, d22
-    vld2.8     {d4,d5},   [r0]!
-    vld2.8     {d6,d7},   [r2]!
-    vmlal.s16   q14, d21, d21
-    vmlal.s16   q15, d23, d23
-    blt         4f
-
-    vsubl.u8    q12, d4,  d6
-    vsubl.u8    q13, d5,  d7
-    bgt         2b
-3:
-    vmlal.s16   q14, d24, d24
-    vmlal.s16   q15, d26, d26
-    vmlal.s16   q14, d25, d25
-    vmlal.s16   q15, d27, d27
-4:
-    subs        r5,  r5,  #1
-    vaddw.s32   q8,  q8,  d28
-    vaddw.s32   q9,  q9,  d30
-    add         r0,  r0,  r1
-    add         r2,  r2,  r3
-    vaddw.s32   q8,  q8,  d29
-    vaddw.s32   q9,  q9,  d31
-    bgt         1b
-
-    vadd.u64    d16, d16, d17
-    vadd.u64    d18, d18, d19
-    ldrd        r4,  r5, [sp, #16]
-    vst1.64    {d16}, [r4]
-    vst1.64    {d18}, [r5]
-
-    pop        {r4-r5}
-    bx          lr
-endfunc
-
-.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
-    vmull.u8        \qsqr, \dsrc, \dsrc
-    vaddw.u8        q0, q0, \dsrc
-    \vpadal         \qsqr_sum, \qsqr_last
-.endm
-
-function x264_pixel_var_8x8_neon
-    vld1.64         {d16}, [r0,:64], r1
-    vmull.u8        q1,  d16, d16
-    vmovl.u8        q0,  d16
-    vld1.64         {d18}, [r0,:64], r1
-    vmull.u8        q2,  d18, d18
-    vaddw.u8        q0,  q0,  d18
-
-    vld1.64         {d20}, [r0,:64], r1
-    VAR_SQR_SUM     q1,  q1,   q3,  d20, vpaddl.u16
-    vld1.64         {d22}, [r0,:64], r1
-    VAR_SQR_SUM     q2,  q2,   q8,  d22, vpaddl.u16
-
-    vld1.64         {d24}, [r0,:64], r1
-    VAR_SQR_SUM     q1,  q3,   q9,  d24
-    vld1.64         {d26}, [r0,:64], r1
-    VAR_SQR_SUM     q2,  q8,   q10, d26
-    vld1.64         {d24}, [r0,:64], r1
-    VAR_SQR_SUM     q1,  q9,   q14, d24
-    vld1.64         {d26}, [r0,:64], r1
-    VAR_SQR_SUM     q2,  q10,  q15, d26
-    b               x264_var_end
-endfunc
-
-function x264_pixel_var_8x16_neon
-    vld1.64         {d16}, [r0,:64], r1
-    vld1.64         {d18}, [r0,:64], r1
-    vmull.u8        q1,  d16, d16
-    vmovl.u8        q0,  d16
-    vld1.64         {d20}, [r0,:64], r1
-    vmull.u8        q2,  d18, d18
-    vaddw.u8        q0,  q0,  d18
-
-    mov             ip,  #12
-
-    vld1.64         {d22}, [r0,:64], r1
-    VAR_SQR_SUM     q1,  q1,   q14,  d20, vpaddl.u16
-    vld1.64         {d16}, [r0,:64], r1
-    VAR_SQR_SUM     q2,  q2,   q15,  d22, vpaddl.u16
-
-1:  subs            ip,  ip,  #4
-    vld1.64         {d18}, [r0,:64], r1
-    VAR_SQR_SUM     q1,  q14,  q12, d16
-    vld1.64         {d20}, [r0,:64], r1
-    VAR_SQR_SUM     q2,  q15,  q13, d18
-    vld1.64         {d22}, [r0,:64], r1
-    VAR_SQR_SUM     q1,  q12,  q14, d20
-    beq             2f
-    vld1.64         {d16}, [r0,:64], r1
-    VAR_SQR_SUM     q2,  q13,  q15, d22
-    b               1b
-2:
-    VAR_SQR_SUM     q2,  q13,  q15, d22
-    b               x264_var_end
-endfunc
-
-function x264_pixel_var_16x16_neon
-    vld1.64         {d16-d17}, [r0,:128], r1
-    vmull.u8        q12, d16, d16
-    vmovl.u8        q0,  d16
-    vmull.u8        q13, d17, d17
-    vaddw.u8        q0,  q0,  d17
-
-    vld1.64         {d18-d19}, [r0,:128], r1
-    VAR_SQR_SUM     q1,  q12,  q14, d18, vpaddl.u16
-    VAR_SQR_SUM     q2,  q13,  q15, d19, vpaddl.u16
-
-    mov             ip,  #7
-var16_loop:
-    subs            ip,  ip,  #1
-    vld1.64         {d16-d17}, [r0,:128], r1
-    VAR_SQR_SUM     q1,  q14,  q12, d16
-    VAR_SQR_SUM     q2,  q15,  q13, d17
-
-    vld1.64         {d18-d19}, [r0,:128], r1
-    VAR_SQR_SUM     q1,  q12,  q14, d18
-    VAR_SQR_SUM     q2,  q13,  q15, d19
-    bgt             var16_loop
-endfunc
-
-function x264_var_end, export=0
-    vpaddl.u16      q8,  q14
-    vpaddl.u16      q9,  q15
-    vadd.u32        q1,  q1,  q8
-    vadd.u16        d0,  d0,  d1
-    vadd.u32        q1,  q1,  q9
-    vadd.u32        q1,  q1,  q2
-    vpaddl.u16      d0,  d0
-    vadd.u32        d2,  d2,  d3
-    vpadd.u32       d0,  d0,  d2
-
-    vmov            r0,  r1,  d0
-    bx              lr
-endfunc
-
-.macro DIFF_SUM diff da db lastdiff
-    vld1.64         {\da}, [r0,:64], r1
-    vld1.64         {\db}, [r2,:64], r3
-.ifnb \lastdiff
-    vadd.s16        q0,  q0,  \lastdiff
-.endif
-    vsubl.u8        \diff, \da, \db
-.endm
-
-.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16
-    \vmlal          \acc, \d0, \d0
-    vmlal.s16       \acc, \d1, \d1
-.endm
-
-function x264_pixel_var2_8x8_neon
-    DIFF_SUM        q0,  d0,  d1
-    DIFF_SUM        q8,  d16, d17
-    SQR_ACC         q1,  d0,  d1,  vmull.s16
-    DIFF_SUM        q9,  d18, d19, q8
-    SQR_ACC         q2,  d16, d17, vmull.s16
-.rept 2
-    DIFF_SUM        q8,  d16, d17, q9
-    SQR_ACC         q1,  d18, d19
-    DIFF_SUM        q9,  d18, d19, q8
-    SQR_ACC         q2,  d16, d17
-.endr
-    DIFF_SUM        q8,  d16, d17, q9
-    SQR_ACC         q1,  d18, d19
-    vadd.s16        q0,  q0,  q8
-    SQR_ACC         q2,  d16, d17
-
-    ldr             ip,  [sp]
-    vadd.s16        d0,  d0,  d1
-    vadd.s32        q1,  q1,  q2
-    vpaddl.s16      d0,  d0
-    vadd.s32        d1,  d2,  d3
-    vpadd.s32       d0,  d0,  d1
-
-    vmov            r0,  r1,  d0
-    vst1.32         {d0[1]}, [ip,:32]
-    mul             r0,  r0,  r0
-    sub             r0,  r1,  r0,  lsr #6
-    bx              lr
-endfunc
-
-function x264_pixel_var2_8x16_neon
-    vld1.64         {d16}, [r0,:64], r1
-    vld1.64         {d17}, [r2,:64], r3
-    vld1.64         {d18}, [r0,:64], r1
-    vld1.64         {d19}, [r2,:64], r3
-    vsubl.u8        q10, d16, d17
-    vsubl.u8        q11, d18, d19
-    SQR_ACC         q1,  d20, d21,  vmull.s16
-    vld1.64         {d16}, [r0,:64], r1
-    vadd.s16        q0,  q10, q11
-    vld1.64         {d17}, [r2,:64], r3
-    SQR_ACC         q2,  d22, d23,  vmull.s16
-    mov             ip,  #14
-1:  subs            ip,  ip,  #2
-    vld1.64         {d18}, [r0,:64], r1
-    vsubl.u8        q10, d16, d17
-    vld1.64         {d19}, [r2,:64], r3
-    vadd.s16        q0,  q0,  q10
-    SQR_ACC         q1,  d20, d21
-    vsubl.u8        q11, d18, d19
-    beq             2f
-    vld1.64         {d16}, [r0,:64], r1
-    vadd.s16        q0,  q0,  q11
-    vld1.64         {d17}, [r2,:64], r3
-    SQR_ACC         q2,  d22, d23
-    b               1b
-2:
-    vadd.s16        q0,  q0,  q11
-    SQR_ACC         q2,  d22, d23
-
-    ldr             ip,  [sp]
-    vadd.s16        d0,  d0,  d1
-    vadd.s32        q1,  q1,  q2
-    vpaddl.s16      d0,  d0
-    vadd.s32        d1,  d2,  d3
-    vpadd.s32       d0,  d0,  d1
-
-    vmov            r0,  r1,  d0
-    vst1.32         {d0[1]}, [ip,:32]
-    mul             r0,  r0,  r0
-    sub             r0,  r1,  r0,  lsr #7
-    bx              lr
-endfunc
-
-.macro LOAD_DIFF_8x4 q0 q1 q2 q3
-    vld1.32     {d1}, [r2], r3
-    vld1.32     {d0}, [r0,:64], r1
-    vsubl.u8    \q0, d0,  d1
-    vld1.32     {d3}, [r2], r3
-    vld1.32     {d2}, [r0,:64], r1
-    vsubl.u8    \q1, d2,  d3
-    vld1.32     {d5}, [r2], r3
-    vld1.32     {d4}, [r0,:64], r1
-    vsubl.u8    \q2, d4,  d5
-    vld1.32     {d7}, [r2], r3
-    vld1.32     {d6}, [r0,:64], r1
-    vsubl.u8    \q3, d6,  d7
-.endm
-
-function x264_pixel_satd_4x4_neon
-    vld1.32     {d1[]},  [r2], r3
-    vld1.32     {d0[]},  [r0,:32], r1
-    vld1.32     {d3[]},  [r2], r3
-    vld1.32     {d2[]},  [r0,:32], r1
-    vld1.32     {d1[1]}, [r2], r3
-    vld1.32     {d0[1]}, [r0,:32], r1
-    vld1.32     {d3[1]}, [r2], r3
-    vld1.32     {d2[1]}, [r0,:32], r1
-    vsubl.u8    q0,  d0,  d1
-    vsubl.u8    q1,  d2,  d3
-
-    SUMSUB_AB   q2, q3, q0, q1
-    SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7
-    HADAMARD    1, sumsub, q2, q3, q0, q1
-    HADAMARD    2, amax,   q0,,    q2, q3
-
-    HORIZ_ADD   d0,  d0,  d1
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_pixel_satd_4x8_neon
-    vld1.32     {d1[]},  [r2], r3
-    vld1.32     {d0[]},  [r0,:32], r1
-    vld1.32     {d3[]},  [r2], r3
-    vld1.32     {d2[]},  [r0,:32], r1
-    vld1.32     {d5[]},  [r2], r3
-    vld1.32     {d4[]},  [r0,:32], r1
-    vld1.32     {d7[]},  [r2], r3
-    vld1.32     {d6[]},  [r0,:32], r1
-
-    vld1.32     {d1[1]}, [r2], r3
-    vld1.32     {d0[1]}, [r0,:32], r1
-    vsubl.u8    q0,  d0,  d1
-    vld1.32     {d3[1]}, [r2], r3
-    vld1.32     {d2[1]}, [r0,:32], r1
-    vsubl.u8    q1,  d2,  d3
-    vld1.32     {d5[1]}, [r2], r3
-    vld1.32     {d4[1]}, [r0,:32], r1
-    vsubl.u8    q2,  d4,  d5
-    vld1.32     {d7[1]}, [r2], r3
-    SUMSUB_AB   q8,  q9,  q0,  q1
-    vld1.32     {d6[1]}, [r0,:32], r1
-    vsubl.u8    q3,  d6,  d7
-    SUMSUB_AB   q10, q11, q2,  q3
-    b           x264_satd_4x8_8x4_end_neon
-endfunc
-
-function x264_pixel_satd_8x4_neon
-    vld1.64     {d1}, [r2], r3
-    vld1.64     {d0}, [r0,:64], r1
-    vsubl.u8    q0,  d0,  d1
-    vld1.64     {d3}, [r2], r3
-    vld1.64     {d2}, [r0,:64], r1
-    vsubl.u8    q1,  d2,  d3
-    vld1.64     {d5}, [r2], r3
-    vld1.64     {d4}, [r0,:64], r1
-    vsubl.u8    q2,  d4,  d5
-    vld1.64     {d7}, [r2], r3
-    SUMSUB_AB   q8,  q9,  q0,  q1
-    vld1.64     {d6}, [r0,:64], r1
-    vsubl.u8    q3,  d6,  d7
-    SUMSUB_AB   q10, q11, q2,  q3
-endfunc
-
-function x264_satd_4x8_8x4_end_neon, export=0
-    vadd.s16    q0,  q8,  q10
-    vadd.s16    q1,  q9,  q11
-    vsub.s16    q2,  q8,  q10
-    vsub.s16    q3,  q9,  q11
-
-    vtrn.16     q0,  q1
-    vadd.s16    q8,  q0,  q1
-    vtrn.16     q2,  q3
-    vsub.s16    q9,  q0,  q1
-    vadd.s16    q10, q2,  q3
-    vsub.s16    q11, q2,  q3
-    vtrn.32     q8,  q10
-    vabs.s16    q8,  q8
-    vtrn.32     q9,  q11
-    vabs.s16    q10, q10
-    vabs.s16    q9,  q9
-    vabs.s16    q11, q11
-    vmax.u16    q0,  q8,  q10
-    vmax.u16    q1,  q9,  q11
-
-    vadd.u16    q0,  q0,  q1
-    HORIZ_ADD   d0,  d0,  d1
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_pixel_satd_8x8_neon
-    mov         ip,  lr
-
-    bl x264_satd_8x8_neon
-    vadd.u16    q0,  q12, q13
-    vadd.u16    q1,  q14, q15
-
-    vadd.u16    q0,  q0,  q1
-    HORIZ_ADD   d0,  d0,  d1
-    mov         lr,  ip
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_pixel_satd_8x16_neon
-    vpush       {d8-d11}
-    mov         ip,  lr
-
-    bl x264_satd_8x8_neon
-    vadd.u16    q4,  q12, q13
-    vadd.u16    q5,  q14, q15
-
-    bl x264_satd_8x8_neon
-    vadd.u16    q4,  q4,  q12
-    vadd.u16    q5,  q5,  q13
-    vadd.u16    q4,  q4,  q14
-    vadd.u16    q5,  q5,  q15
-
-    vadd.u16    q0,  q4,  q5
-    HORIZ_ADD   d0,  d0,  d1
-    vpop        {d8-d11}
-    mov         lr,  ip
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_satd_8x8_neon, export=0
-    LOAD_DIFF_8x4 q8,  q9,  q10, q11
-    vld1.64     {d7}, [r2], r3
-    SUMSUB_AB   q0,  q1,  q8,  q9
-    vld1.64     {d6}, [r0,:64], r1
-    vsubl.u8    q12, d6,  d7
-    vld1.64     {d17}, [r2], r3
-    SUMSUB_AB   q2,  q3,  q10, q11
-    vld1.64     {d16}, [r0,:64], r1
-    vsubl.u8    q13, d16, d17
-    vld1.64     {d19}, [r2], r3
-    SUMSUB_AB   q8,  q10, q0,  q2
-    vld1.64     {d18}, [r0,:64], r1
-    vsubl.u8    q14, d18, d19
-    vld1.64     {d1}, [r2], r3
-    SUMSUB_AB   q9,  q11, q1,  q3
-    vld1.64     {d0}, [r0,:64], r1
-    vsubl.u8    q15, d0,  d1
-endfunc
-
-// one vertical hadamard pass and two horizontal
-function x264_satd_8x4v_8x8h_neon, export=0
-    SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15
-    vtrn.16     q8,  q9
-    SUMSUB_AB   q12, q14, q0,  q2
-    vtrn.16     q10, q11
-    SUMSUB_AB   q13, q15, q1,  q3
-    SUMSUB_AB   q0,  q1,  q8,  q9
-    vtrn.16     q12, q13
-    SUMSUB_AB   q2,  q3,  q10, q11
-    vtrn.16     q14, q15
-    SUMSUB_AB   q8,  q9,  q12, q13
-    vtrn.32     q0,  q2
-    SUMSUB_AB   q10, q11, q14, q15
-
-    vtrn.32     q1,  q3
-    ABS2        q0,  q2
-    vtrn.32     q8,  q10
-    ABS2        q1,  q3
-    vtrn.32     q9,  q11
-    ABS2        q8,  q10
-    ABS2        q9,  q11
-    vmax.s16    q12, q0,  q2
-    vmax.s16    q13, q1,  q3
-    vmax.s16    q14, q8,  q10
-    vmax.s16    q15, q9,  q11
-    bx          lr
-endfunc
-
-function x264_pixel_satd_16x8_neon
-    vpush       {d8-d11}
-    mov         ip, lr
-
-    bl          x264_satd_16x4_neon
-    vadd.u16    q4,  q12, q13
-    vadd.u16    q5,  q14, q15
-
-    bl          x264_satd_16x4_neon
-    vadd.u16    q4,  q4,  q12
-    vadd.u16    q5,  q5,  q13
-    vadd.u16    q4,  q4,  q14
-    vadd.u16    q5,  q5,  q15
-
-    vadd.u16    q0,  q4,  q5
-    HORIZ_ADD   d0,  d0,  d1
-    vpop        {d8-d11}
-    mov         lr,  ip
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_pixel_satd_16x16_neon
-    vpush       {d8-d11}
-    mov         ip, lr
-
-    bl          x264_satd_16x4_neon
-    vadd.u16    q4,  q12, q13
-    vadd.u16    q5,  q14, q15
-
-    bl          x264_satd_16x4_neon
-    vadd.u16    q4,  q4,  q12
-    vadd.u16    q5,  q5,  q13
-    vadd.u16    q4,  q4,  q14
-    vadd.u16    q5,  q5,  q15
-
-    bl          x264_satd_16x4_neon
-    vadd.u16    q4,  q4,  q12
-    vadd.u16    q5,  q5,  q13
-    vadd.u16    q4,  q4,  q14
-    vadd.u16    q5,  q5,  q15
-
-    bl          x264_satd_16x4_neon
-    vadd.u16    q4,  q4,  q12
-    vadd.u16    q5,  q5,  q13
-    vadd.u16    q4,  q4,  q14
-    vadd.u16    q5,  q5,  q15
-
-    vadd.u16    q0,  q4,  q5
-    HORIZ_ADD   d0,  d0,  d1
-    vpop        {d8-d11}
-    mov         lr,  ip
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
-
-function x264_satd_16x4_neon, export=0
-    vld1.64     {d2-d3}, [r2], r3
-    vld1.64     {d0-d1}, [r0,:128], r1
-    vsubl.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2], r3
-    vsubl.u8    q12, d1,  d3
-    vld1.64     {d4-d5}, [r0,:128], r1
-    vsubl.u8    q9,  d4,  d6
-    vld1.64     {d2-d3}, [r2], r3
-    vsubl.u8    q13, d5,  d7
-    vld1.64     {d0-d1}, [r0,:128], r1
-    vsubl.u8    q10, d0,  d2
-    vld1.64     {d6-d7}, [r2], r3
-    vsubl.u8    q14, d1,  d3
-    vadd.s16    q0,  q8,  q9
-    vld1.64     {d4-d5}, [r0,:128], r1
-    vsub.s16    q1,  q8,  q9
-    vsubl.u8    q11, d4,  d6
-    vsubl.u8    q15, d5,  d7
-    SUMSUB_AB   q2,  q3,  q10, q11
-    SUMSUB_ABCD q8,  q10, q9,  q11, q0,  q2,  q1,  q3
-    b           x264_satd_8x4v_8x8h_neon
-endfunc
-
-
-function x264_pixel_sa8d_8x8_neon
-    mov             ip,  lr
-    bl              x264_sa8d_8x8_neon
-    vadd.u16        q0,  q8,  q9
-    HORIZ_ADD       d0,  d0,  d1
-    mov             lr,  ip
-    vmov.32         r0,  d0[0]
-    add             r0,  r0,  #1
-    lsr             r0,  r0,  #1
-    bx              lr
-endfunc
-
-function x264_pixel_sa8d_16x16_neon
-    vpush           {d8-d11}
-    mov             ip,  lr
-    bl              x264_sa8d_8x8_neon
-    vpaddl.u16      q4,  q8
-    vpaddl.u16      q5,  q9
-    bl              x264_sa8d_8x8_neon
-    vpadal.u16      q4,  q8
-    vpadal.u16      q5,  q9
-    sub             r0,  r0,  r1,  lsl #4
-    sub             r2,  r2,  r3,  lsl #4
-    add             r0,  r0,  #8
-    add             r2,  r2,  #8
-    bl              x264_sa8d_8x8_neon
-    vpadal.u16      q4,  q8
-    vpadal.u16      q5,  q9
-    bl              x264_sa8d_8x8_neon
-    vpaddl.u16      q8,  q8
-    vpaddl.u16      q9,  q9
-    vadd.u32        q0,  q4,  q8
-    vadd.u32        q1,  q5,  q9
-    vadd.u32        q0,  q0,  q1
-    vadd.u32        d0,  d0,  d1
-    vpadd.u32       d0,  d0,  d0
-    vpop            {d8-d11}
-    mov             lr,  ip
-    vmov.32         r0,  d0[0]
-    add             r0,  r0,  #1
-    lsr             r0,  r0,  #1
-    bx              lr
-endfunc
-
-.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
-    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
-    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
-.endm
-
-.macro integrated_satd dst, s0, s1, s2, s3
-    vmov            q0,  \s0
-    vmov            q1,  \s1
-    vmov            q2,  \s2
-    vmov            q3,  \s3
-
-    vtrn.16         q0,  q1
-    vtrn.16         q2,  q3
-
-    SUMSUB_AB       q6,  q7,  q0,  q1
-    SUMSUB_AB       q0,  q1,  q2,  q3
-
-    vtrn.32         q6,  q0
-    vtrn.32         q7,  q1
-
-    vabs.s16        q6,  q6
-    vabs.s16        q0,  q0
-    vabs.s16        q7,  q7
-    vabs.s16        q1,  q1
-
-    vmax.u16        q6,  q6,  q0
-    vmax.u16        q7,  q7,  q1
-
-    vadd.i16        q6,  q6,  q7
-    vpadal.u16      \dst,  q6
-.endm
-
-.macro sa8d_satd_8x8 satd=
-function x264_sa8d_\satd\()8x8_neon, export=0
-    LOAD_DIFF_8x4   q8,  q9,  q10, q11
-    vld1.64         {d7}, [r2], r3
-    SUMSUB_AB       q0,  q1,  q8,  q9
-    vld1.64         {d6}, [r0,:64], r1
-    vsubl.u8        q12, d6,  d7
-    vld1.64         {d17}, [r2], r3
-    SUMSUB_AB       q2,  q3,  q10, q11
-    vld1.64         {d16}, [r0,:64], r1
-    vsubl.u8        q13, d16, d17
-    vld1.64         {d19}, [r2], r3
-    SUMSUB_AB       q8,  q10, q0,  q2
-    vld1.64         {d18}, [r0,:64], r1
-    vsubl.u8        q14, d18, d19
-    vld1.64         {d1}, [r2], r3
-    SUMSUB_AB       q9,  q11, q1,  q3
-    vld1.64         {d0}, [r0,:64], r1
-    vsubl.u8        q15, d0,  d1
-
-    HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
-
-.ifc \satd, satd_
-    integrated_satd q4,  q8,  q9,  q10, q11
-    integrated_satd q4,  q12, q13, q14, q15
-.endif
-
-    SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
-    SUMSUB_AB       q2,  q10, q10, q14
-    vtrn.16         q8,  q9
-    SUMSUB_AB       q3,  q11, q11, q15
-    vtrn.16         q0,  q1
-    SUMSUB_AB       q12, q13, q8,  q9
-    vtrn.16         q10, q11
-    SUMSUB_AB       q8,  q9,  q0,  q1
-    vtrn.16         q2,  q3
-    SUMSUB_AB       q14, q15, q10, q11
-    vadd.i16        q10, q2,  q3
-    vtrn.32         q12, q14
-    vsub.i16        q11, q2,  q3
-    vtrn.32         q13, q15
-    SUMSUB_AB       q0,  q2,  q12, q14
-    vtrn.32         q8,  q10
-    SUMSUB_AB       q1,  q3,  q13, q15
-    vtrn.32         q9,  q11
-    SUMSUB_AB       q12, q14, q8,  q10
-    SUMSUB_AB       q13, q15, q9,  q11
-
-    vswp            d1,  d24
-    ABS2            q0,  q12
-    vswp            d3,  d26
-    ABS2            q1,  q13
-    vswp            d5,  d28
-    ABS2            q2,  q14
-    vswp            d7,  d30
-    ABS2            q3,  q15
-    vmax.s16        q8,  q0,  q12
-    vmax.s16        q9,  q1,  q13
-    vmax.s16        q10, q2,  q14
-    vmax.s16        q11, q3,  q15
-    vadd.i16        q8,  q8,  q9
-    vadd.i16        q9,  q10, q11
-.ifc \satd, satd_
-    vpadal.u16      q5,  q8
-    vpadal.u16      q5,  q9
-.endif
-    bx              lr
-endfunc
-.endm
-
-sa8d_satd_8x8
-sa8d_satd_8x8 satd_
-
-function x264_pixel_sa8d_satd_16x16_neon
-    push            {lr}
-    vpush           {q4-q7}
-    vmov.u32        q4,  #0
-    vmov.u32        q5,  #0
-    bl              x264_sa8d_satd_8x8_neon
-    bl              x264_sa8d_satd_8x8_neon
-    sub             r0,  r0,  r1,  lsl #4
-    sub             r2,  r2,  r3,  lsl #4
-    add             r0,  r0,  #8
-    add             r2,  r2,  #8
-    bl              x264_sa8d_satd_8x8_neon
-    bl              x264_sa8d_satd_8x8_neon
-    vadd.u32        d1,  d10, d11
-    vadd.u32        d0,  d8,  d9
-    vpadd.u32       d1,  d1,  d1
-    vpadd.u32       d0,  d0,  d0
-    vrshr.u32       d1,  d1,  #1
-    vmov.32         r1,  d0[0]
-    vmov.32         r0,  d1[0]
-    vpop            {q4-q7}
-    pop             {pc}
-endfunc
-
-
-.macro HADAMARD_AC w h
-function x264_pixel_hadamard_ac_\w\()x\h\()_neon
-    vpush           {d8-d15}
-    movrel          ip, mask_ac4
-    vmov.i8         q4, #0
-    // note: this assumes mask_ac8 is after mask_ac4 (so don't move it)
-    vld1.64         {d12-d15}, [ip,:128]
-    vmov.i8         q5, #0
-
-    mov             ip,  lr
-    bl              x264_hadamard_ac_8x8_neon
-.if \h > 8
-    bl              x264_hadamard_ac_8x8_neon
-.endif
-.if \w > 8
-    sub             r0,  r0,  r1,  lsl #3
-    add             r0,  r0,  #8
-    bl              x264_hadamard_ac_8x8_neon
-.endif
-.if \w * \h == 256
-    sub             r0,  r0,  r1,  lsl #4
-    bl              x264_hadamard_ac_8x8_neon
-.endif
-
-    vadd.s32        d8,  d8,  d9
-    vadd.s32        d10, d10, d11
-    vpadd.s32       d0,  d8,  d10
-    vpop            {d8-d15}
-    mov             lr,  ip
-    vmov            r0,  r1,  d0
-    lsr             r0,  r0,  #1
-    lsr             r1,  r1,  #2
-    bx              lr
-endfunc
-.endm
-
-HADAMARD_AC  8, 8
-HADAMARD_AC  8, 16
-HADAMARD_AC 16, 8
-HADAMARD_AC 16, 16
-
-// q4: satd  q5: sa8d  q6: mask_ac4  q7: mask_ac8
-function x264_hadamard_ac_8x8_neon, export=0
-    vld1.64         {d2},  [r0,:64], r1
-    vld1.64         {d3},  [r0,:64], r1
-    vaddl.u8        q0,  d2,  d3
-    vld1.64         {d6},  [r0,:64], r1
-    vsubl.u8        q1,  d2,  d3
-    vld1.64         {d7},  [r0,:64], r1
-    vaddl.u8        q2,  d6,  d7
-    vld1.64         {d18}, [r0,:64], r1
-    vsubl.u8        q3,  d6,  d7
-    vld1.64         {d19}, [r0,:64], r1
-    vaddl.u8        q8,  d18, d19
-    vld1.64         {d22}, [r0,:64], r1
-    vsubl.u8        q9,  d18, d19
-    vld1.64         {d23}, [r0,:64], r1
-
-    SUMSUB_ABCD     q12, q14, q13, q15, q0,  q2,  q1,  q3
-    vaddl.u8        q10, d22, d23
-    vsubl.u8        q11, d22, d23
-    vtrn.16         q12, q13
-    SUMSUB_ABCD     q0,  q2,  q1,  q3,  q8,  q10, q9,  q11
-
-    vtrn.16         q14, q15
-    SUMSUB_AB       q8,  q9,  q12, q13
-    vtrn.16         q0,  q1
-    SUMSUB_AB       q10, q11, q14, q15
-    vtrn.16         q2,  q3
-    SUMSUB_AB       q12, q13, q0,  q1
-    vtrn.32         q8,  q10
-    SUMSUB_AB       q14, q15, q2,  q3
-    vtrn.32         q9,  q11
-    SUMSUB_AB       q0,  q2,  q8,  q10
-    vtrn.32         q12, q14
-    SUMSUB_AB       q1,  q3,  q9,  q11
-    vtrn.32         q13, q15
-    SUMSUB_ABCD     q8,  q10, q9,  q11, q12, q14, q13, q15
-
-    vabs.s16        q12, q0
-    vabs.s16        q13, q8
-    vabs.s16        q15, q1
-    vadd.s16        q12, q12, q13
-    vabs.s16        q14, q2
-    vand.s16        q12, q12, q6
-    vabs.s16        q13, q3
-    vadd.s16        q12, q12, q15
-    vabs.s16        q15, q9
-    vadd.s16        q12, q12, q14
-    vabs.s16        q14, q10
-    vadd.s16        q12, q12, q13
-    vabs.s16        q13, q11
-    vadd.s16        q12, q12, q15
-    vsub.s16        q15, q11, q3
-    vadd.s16        q12, q12, q14
-    vadd.s16        q14, q11, q3
-    vadd.s16        q12, q12, q13
-    vsub.s16        q13, q10, q2
-    vadd.s16        q2,  q10, q2
-    vpadal.u16      q4,  q12
-
-    SUMSUB_AB       q10, q11, q9,  q1
-    SUMSUB_AB       q9,  q8,  q0,  q8
-    vswp            d29, d30
-    vabs.s16        q14, q14
-    vabs.s16        q15, q15
-    vswp            d5,  d26
-    vabs.s16        q2,  q2
-    vabs.s16        q13, q13
-    vswp            d21, d22
-    vabs.s16        q10, q10
-    vabs.s16        q11, q11
-    vmax.s16        q3,  q14, q15
-    vmax.s16        q2,  q2,  q13
-    vmax.s16        q1,  q10, q11
-    vswp            d19, d16
-    SUMSUB_AB       q14, q15, q9,  q8
-
-    vadd.s16        q2,  q2,  q3
-    vadd.s16        q2,  q2,  q1
-    vand            q14, q14, q7
-    vadd.s16        q2,  q2,  q2
-    vabs.s16        q15, q15
-    vabs.s16        q14, q14
-    vadd.s16        q2,  q2,  q15
-    vadd.s16        q2,  q2,  q14
-    vpadal.u16      q5,  q2
-    bx              lr
-endfunc
-
-
-.macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext
-    vld1.64     {\db}, [r2], r3
-    vmull.u8    \ssa,  \da, \da
-    vmull.u8    \s12,  \da, \db
-.if \n == 1
-    vpaddl.u16  q2,  \lastssa
-    vpaddl.u16  q3,  \lasts12
-    vaddl.u8    q0,  d0,  \da
-.else
-    vpadal.u16  q2,  \lastssa
-    vpadal.u16  q3,  \lasts12
-    vaddw.u8    q0,  q0,  \da
-.endif
-    vpadal.u16  q2,  \lastssb
-.if \n < 3
-    vld1.64     {\dnext}, [r0], r1
-.endif
-.if \n == 1
-    vaddl.u8    q1,  d2,  \db
-.else
-    vaddw.u8    q1,  q1,  \db
-.endif
-    vmull.u8    \ssb, \db, \db
-.endm
-
-function x264_pixel_ssim_4x4x2_core_neon
-    ldr         ip, [sp]
-    vld1.64     {d0}, [r0], r1
-    vld1.64     {d2}, [r2], r3
-    vmull.u8    q2,  d0,  d0
-    vmull.u8    q3,  d0,  d2
-    vld1.64     {d28}, [r0], r1
-    vmull.u8    q15, d2,  d2
-
-    SSIM_ITER 1, q8, q9, q14,  q2, q3, q15,  d28, d29, d26
-    SSIM_ITER 2, q10,q11,q13,  q8, q9, q14,  d26, d27, d28
-    SSIM_ITER 3, q8, q9, q15,  q10,q11,q13,  d28, d29
-
-    vpadal.u16  q2,  q8
-    vpaddl.u16  q0,  q0
-    vpaddl.u16  q1,  q1
-    vpadal.u16  q2,  q15
-    vpadal.u16  q3,  q9
-
-    vpadd.u32   d0,  d0,  d1
-    vpadd.u32   d1,  d2,  d3
-    vpadd.u32   d2,  d4,  d5
-    vpadd.u32   d3,  d6,  d7
-
-    vst4.32     {d0-d3}, [ip]
-    bx          lr
-endfunc
-
-// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
-function x264_pixel_ssim_end4_neon
-    vld1.32     {d16-d19}, [r0,:128]!
-    vld1.32     {d20-d23}, [r1,:128]!
-    vadd.s32    q0,  q8,  q10
-    vadd.s32    q1,  q9,  q11
-    vld1.32     {d24-d27}, [r0,:128]!
-    vadd.s32    q0,  q0,  q1
-    vld1.32     {d28-d31}, [r1,:128]!
-    vadd.s32    q2,  q12, q14
-    vadd.s32    q3,  q13, q15
-    vld1.32     {d16-d17}, [r0,:128]
-    vadd.s32    q1,  q1,  q2
-    vld1.32     {d18-d19}, [r1,:128]
-    vadd.s32    q8,  q8,  q9
-    vadd.s32    q2,  q2,  q3
-    vadd.s32    q3,  q3,  q8
-
-    vtrn.32     q0,  q1
-    vtrn.32     q2,  q3
-    vswp        d1,  d4
-    vswp        d3,  d6
-
-//  s1=q0, s2=q1, ss=q2, s12=q3
-    vmul.s32    q8,  q0,  q1    // s1*s2
-    vmul.s32    q0,  q0,  q0
-    vmla.s32    q0,  q1,  q1    // s1*s1 + s2*s2
-
-    vshl.s32    q3,  q3,  #7
-    vshl.s32    q2,  q2,  #6
-    vadd.s32    q1,  q8,  q8
-
-    mov         r3, #416        // ssim_c1 = .01*.01*255*255*64
-    movconst    ip, 235963      // ssim_c2 = .03*.03*255*255*64*63
-    vdup.32     q14, r3
-    vdup.32     q15, ip
-
-    vsub.s32    q2,  q2,  q0    // vars
-    vsub.s32    q3,  q3,  q1    // covar*2
-    vadd.s32    q0,  q0,  q14
-    vadd.s32    q2,  q2,  q15
-    vadd.s32    q1,  q1,  q14
-    vadd.s32    q3,  q3,  q15
-
-    vcvt.f32.s32    q0,  q0
-    vcvt.f32.s32    q2,  q2
-    vcvt.f32.s32    q1,  q1
-    vcvt.f32.s32    q3,  q3
-
-    vmul.f32    q0,  q0,  q2
-    vmul.f32    q1,  q1,  q3
-
-    cmp         r2,  #4
-
-    vdiv.f32    s0,  s4,  s0
-    vdiv.f32    s1,  s5,  s1
-    vdiv.f32    s2,  s6,  s2
-    vdiv.f32    s3,  s7,  s3
-
-    beq         ssim_skip
-    movrel      r3,  mask_ff
-    sub         r3,  r3,  r2,  lsl #2
-    vld1.64     {d6-d7}, [r3]
-    vand        q0,  q0,  q3
-ssim_skip:
-    vadd.f32    d0,  d0,  d1
-    vpadd.f32   d0,  d0,  d0
-    vmov.32     r0,  d0[0]
-    bx          lr
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/pixel.h b/android/src/main/libenc/jni/libx264/common/arm/pixel.h
deleted file mode 100755
index b408326..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/pixel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*****************************************************************************
- * pixel.h: arm pixel metrics
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ARM_PIXEL_H
-#define X264_ARM_PIXEL_H
-
-#define DECL_PIXELS( ret, name, suffix, args ) \
-    ret x264_pixel_##name##_16x16_##suffix args;\
-    ret x264_pixel_##name##_16x8_##suffix args;\
-    ret x264_pixel_##name##_8x16_##suffix args;\
-    ret x264_pixel_##name##_8x8_##suffix args;\
-    ret x264_pixel_##name##_8x4_##suffix args;\
-    ret x264_pixel_##name##_4x8_##suffix args;\
-    ret x264_pixel_##name##_4x4_##suffix args;\
-
-#define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )
-
-#define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
-
-int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
-
-DECL_X1( sad, neon )
-DECL_X1( sad_aligned, neon )
-DECL_X1( sad_aligned, neon_dual )
-DECL_X4( sad, neon )
-DECL_X1( satd, neon )
-DECL_X1( ssd, neon )
-
-void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
-
-int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
-
-int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
-uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
-
-uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
-uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
-uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-
-uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
-uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
-uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
-uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
-
-void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
-                                      const uint8_t *, intptr_t,
-                                      int sums[2][4] );
-float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
-
-int x264_pixel_asd8_neon( uint8_t *, intptr_t,  uint8_t *, intptr_t, int );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/arm/predict-a.S b/android/src/main/libenc/jni/libx264/common/arm/predict-a.S
deleted file mode 100755
index 30b3679..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/predict-a.S
+++ /dev/null
@@ -1,809 +0,0 @@
-/*****************************************************************************
- * predict.S: arm intra prediction
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Mans Rullgard <mans@mansr.com>
- *          Martin Storsjo <martin@martin.st>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-.align 4
-
-p16weight: .short 1,2,3,4,5,6,7,8
-
-.text
-
-.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
-.if \n == 8 || \hi == 0
-    vld1.8          {\rd[0]}, [\rs], \rt
-    vld1.8          {\rd[1]}, [\rs], \rt
-    vld1.8          {\rd[2]}, [\rs], \rt
-    vld1.8          {\rd[3]}, [\rs], \rt
-.endif
-.if \n == 8 || \hi == 1
-    vld1.8          {\rd[4]}, [\rs], \rt
-    vld1.8          {\rd[5]}, [\rs], \rt
-    vld1.8          {\rd[6]}, [\rs], \rt
-    vld1.8          {\rd[7]}, [\rs], \rt
-.endif
-.endm
-
-.macro ldcol.16  rd1,  rd2,  rs,  rt,  ru
-    add             \ru, \rs, \rt, lsl #3
-    vld1.8          {\rd1[0]}, [\rs], \rt
-    vld1.8          {\rd2[0]}, [\ru], \rt
-    vld1.8          {\rd1[1]}, [\rs], \rt
-    vld1.8          {\rd2[1]}, [\ru], \rt
-    vld1.8          {\rd1[2]}, [\rs], \rt
-    vld1.8          {\rd2[2]}, [\ru], \rt
-    vld1.8          {\rd1[3]}, [\rs], \rt
-    vld1.8          {\rd2[3]}, [\ru], \rt
-    vld1.8          {\rd1[4]}, [\rs], \rt
-    vld1.8          {\rd2[4]}, [\ru], \rt
-    vld1.8          {\rd1[5]}, [\rs], \rt
-    vld1.8          {\rd2[5]}, [\ru], \rt
-    vld1.8          {\rd1[6]}, [\rs], \rt
-    vld1.8          {\rd2[6]}, [\ru], \rt
-    vld1.8          {\rd1[7]}, [\rs], \rt
-    vld1.8          {\rd2[7]}, [\ru], \rt
-.endm
-
-.macro add16x8  dq,  dl,  dh,  rl,  rh
-    vaddl.u8        \dq, \rl, \rh
-    vadd.u16        \dl, \dl, \dh
-    vpadd.u16       \dl, \dl, \dl
-    vpadd.u16       \dl, \dl, \dl
-.endm
-
-
-// because gcc doesn't believe in using the free shift in add
-function x264_predict_4x4_h_armv6
-    ldrb    r1, [r0, #0*FDEC_STRIDE-1]
-    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
-    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
-    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
-    add     r1, r1, r1, lsl #8
-    add     r2, r2, r2, lsl #8
-    add     r3, r3, r3, lsl #8
-    add     ip, ip, ip, lsl #8
-    add     r1, r1, r1, lsl #16
-    str     r1, [r0, #0*FDEC_STRIDE]
-    add     r2, r2, r2, lsl #16
-    str     r2, [r0, #1*FDEC_STRIDE]
-    add     r3, r3, r3, lsl #16
-    str     r3, [r0, #2*FDEC_STRIDE]
-    add     ip, ip, ip, lsl #16
-    str     ip, [r0, #3*FDEC_STRIDE]
-    bx      lr
-endfunc
-
-function x264_predict_4x4_v_armv6
-    ldr     r1,  [r0, #0 - 1 * FDEC_STRIDE]
-    str     r1,  [r0, #0 + 0 * FDEC_STRIDE]
-    str     r1,  [r0, #0 + 1 * FDEC_STRIDE]
-    str     r1,  [r0, #0 + 2 * FDEC_STRIDE]
-    str     r1,  [r0, #0 + 3 * FDEC_STRIDE]
-    bx      lr
-endfunc
-
-function x264_predict_4x4_dc_armv6
-    mov     ip, #0
-    ldr     r1, [r0, #-FDEC_STRIDE]
-    ldrb    r2, [r0, #0*FDEC_STRIDE-1]
-    ldrb    r3, [r0, #1*FDEC_STRIDE-1]
-    usad8   r1, r1, ip
-    add     r2, r2, #4
-    ldrb    ip, [r0, #2*FDEC_STRIDE-1]
-    add     r2, r2, r3
-    ldrb    r3, [r0, #3*FDEC_STRIDE-1]
-    add     r2, r2, ip
-    add     r2, r2, r3
-    add     r1, r1, r2
-    lsr     r1, r1, #3
-    add     r1, r1, r1, lsl #8
-    add     r1, r1, r1, lsl #16
-    str     r1, [r0, #0*FDEC_STRIDE]
-    str     r1, [r0, #1*FDEC_STRIDE]
-    str     r1, [r0, #2*FDEC_STRIDE]
-    str     r1, [r0, #3*FDEC_STRIDE]
-    bx      lr
-endfunc
-
-function x264_predict_4x4_dc_top_neon
-    mov         r12, #FDEC_STRIDE
-    sub         r1, r0, #FDEC_STRIDE
-    vld1.32     d1[], [r1,:32]
-    vpaddl.u8   d1, d1
-    vpadd.u16   d1, d1, d1
-    vrshr.u16   d1, d1, #2
-    vdup.8      d1, d1[0]
-    vst1.32     d1[0], [r0,:32], r12
-    vst1.32     d1[0], [r0,:32], r12
-    vst1.32     d1[0], [r0,:32], r12
-    vst1.32     d1[0], [r0,:32], r12
-    bx          lr
-endfunc
-
-// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
-.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
-    uhadd8  \a1, \a1, \c1
-    uhadd8  \a2, \a2, \c2
-    uhadd8  \c1, \a1, \b1
-    uhadd8  \c2, \a2, \b2
-    eor     \a1, \a1, \b1
-    eor     \a2, \a2, \b2
-    and     \a1, \a1, \pb_1
-    and     \a2, \a2, \pb_1
-    uadd8   \a1, \a1, \c1
-    uadd8   \a2, \a2, \c2
-.endm
-
-function x264_predict_4x4_ddr_armv6
-    ldr     r1, [r0, # -FDEC_STRIDE]
-    ldrb    r2, [r0, # -FDEC_STRIDE-1]
-    ldrb    r3, [r0, #0*FDEC_STRIDE-1]
-    push    {r4-r6,lr}
-    add     r2, r2, r1, lsl #8
-    ldrb    r4, [r0, #1*FDEC_STRIDE-1]
-    add     r3, r3, r2, lsl #8
-    ldrb    r5, [r0, #2*FDEC_STRIDE-1]
-    ldrb    r6, [r0, #3*FDEC_STRIDE-1]
-    add     r4, r4, r3, lsl #8
-    add     r5, r5, r4, lsl #8
-    add     r6, r6, r5, lsl #8
-    ldr     ip, =0x01010101
-    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
-    str     r1, [r0, #0*FDEC_STRIDE]
-    lsl     r2, r1, #8
-    lsl     r3, r1, #16
-    lsl     r4, r4, #8
-    lsl     r5, r1, #24
-    add     r2, r2, r4, lsr #24
-    str     r2, [r0, #1*FDEC_STRIDE]
-    add     r3, r3, r4, lsr #16
-    str     r3, [r0, #2*FDEC_STRIDE]
-    add     r5, r5, r4, lsr #8
-    str     r5, [r0, #3*FDEC_STRIDE]
-    pop     {r4-r6,pc}
-endfunc
-
-function x264_predict_4x4_ddl_neon
-    sub         r0, #FDEC_STRIDE
-    mov         ip, #FDEC_STRIDE
-    vld1.64     {d0}, [r0], ip
-    vdup.8      d3, d0[7]
-    vext.8      d1, d0, d0, #1
-    vext.8      d2, d0, d3, #2
-    vhadd.u8    d0, d0, d2
-    vrhadd.u8   d0, d0, d1
-    vst1.32     {d0[0]}, [r0,:32], ip
-    vext.8      d1, d0, d0, #1
-    vext.8      d2, d0, d0, #2
-    vst1.32     {d1[0]}, [r0,:32], ip
-    vext.8      d3, d0, d0, #3
-    vst1.32     {d2[0]}, [r0,:32], ip
-    vst1.32     {d3[0]}, [r0,:32], ip
-    bx          lr
-endfunc
-
-function x264_predict_8x8_dc_neon
-    mov     ip, #0
-    ldrd    r2, r3, [r1, #8]
-    push    {r4-r5,lr}
-    ldrd    r4, r5, [r1, #16]
-    lsl     r3, r3, #8
-    ldrb    lr, [r1, #7]
-    usad8   r2, r2, ip
-    usad8   r3, r3, ip
-    usada8  r2, r4, ip, r2
-    add     lr, lr, #8
-    usada8  r3, r5, ip, r3
-    add     r2, r2, lr
-    mov     ip, #FDEC_STRIDE
-    add     r2, r2, r3
-    lsr     r2, r2, #4
-
-    vdup.8   d0, r2
-.rept 8
-    vst1.64 {d0}, [r0,:64], ip
-.endr
-    pop    {r4-r5,pc}
-endfunc
-
-function x264_predict_8x8_h_neon
-    add         r1, r1, #7
-    mov         ip, #FDEC_STRIDE
-    vld1.64     {d16}, [r1]
-    vdup.8      d0, d16[7]
-    vdup.8      d1, d16[6]
-    vst1.64     {d0}, [r0,:64], ip
-    vdup.8      d2, d16[5]
-    vst1.64     {d1}, [r0,:64], ip
-    vdup.8      d3, d16[4]
-    vst1.64     {d2}, [r0,:64], ip
-    vdup.8      d4, d16[3]
-    vst1.64     {d3}, [r0,:64], ip
-    vdup.8      d5, d16[2]
-    vst1.64     {d4}, [r0,:64], ip
-    vdup.8      d6, d16[1]
-    vst1.64     {d5}, [r0,:64], ip
-    vdup.8      d7, d16[0]
-    vst1.64     {d6}, [r0,:64], ip
-    vst1.64     {d7}, [r0,:64], ip
-    bx          lr
-endfunc
-
-function x264_predict_8x8_v_neon
-    add         r1, r1, #16
-    mov         r12, #FDEC_STRIDE
-    vld1.8      {d0}, [r1,:64]
-.rept 8
-    vst1.8      {d0}, [r0,:64], r12
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_8x8_ddl_neon
-    add         r1, #16
-    vld1.8      {d0, d1}, [r1,:128]
-    vmov.i8     q3, #0
-    vrev64.8    d2, d1
-    vext.8      q8, q3, q0, #15
-    vext.8      q2, q0, q1, #1
-    vhadd.u8    q8, q2
-    mov         r12, #FDEC_STRIDE
-    vrhadd.u8   q0, q8
-    vext.8      d2, d0, d1, #1
-    vext.8      d3, d0, d1, #2
-    vst1.8      d2, [r0,:64], r12
-    vext.8      d2, d0, d1, #3
-    vst1.8      d3, [r0,:64], r12
-    vext.8      d3, d0, d1, #4
-    vst1.8      d2, [r0,:64], r12
-    vext.8      d2, d0, d1, #5
-    vst1.8      d3, [r0,:64], r12
-    vext.8      d3, d0, d1, #6
-    vst1.8      d2, [r0,:64], r12
-    vext.8      d2, d0, d1, #7
-    vst1.8      d3, [r0,:64], r12
-    vst1.8      d2, [r0,:64], r12
-    vst1.8      d1, [r0,:64], r12
-    bx          lr
-endfunc
-
-function x264_predict_8x8_ddr_neon
-    vld1.8      {d0-d3}, [r1,:128]
-    vext.8      q2, q0, q1, #7
-    vext.8      q3, q0, q1, #9
-
-    vhadd.u8    q2, q2, q3
-    vrhadd.u8   d0, d1, d4
-    vrhadd.u8   d1, d2, d5
-
-    add         r0, #7*FDEC_STRIDE
-    mov         r12, #-1*FDEC_STRIDE
-
-    vext.8      d2, d0, d1, #1
-    vst1.8      {d0}, [r0,:64], r12
-    vext.8      d4, d0, d1, #2
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d5, d0, d1, #3
-    vst1.8      {d4}, [r0,:64], r12
-    vext.8      d4, d0, d1, #4
-    vst1.8      {d5}, [r0,:64], r12
-    vext.8      d5, d0, d1, #5
-    vst1.8      {d4}, [r0,:64], r12
-    vext.8      d4, d0, d1, #6
-    vst1.8      {d5}, [r0,:64], r12
-    vext.8      d5, d0, d1, #7
-    vst1.8      {d4}, [r0,:64], r12
-    vst1.8      {d5}, [r0,:64], r12
-    bx          lr
-endfunc
-
-function x264_predict_8x8_vl_neon
-    add         r1, #16
-    mov         r12, #FDEC_STRIDE
-
-    vld1.8      {d0, d1}, [r1,:128]
-    vext.8      q1, q1, q0, #15
-    vext.8      q2, q0, q2, #1
-
-    vrhadd.u8   q3, q0, q2
-
-    vhadd.u8    q1, q1, q2
-    vrhadd.u8   q0, q0, q1
-
-    vext.8      d2, d0, d1, #1
-    vst1.8      {d6}, [r0,:64], r12
-    vext.8      d3, d6, d7, #1
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d2, d0, d1, #2
-    vst1.8      {d3}, [r0,:64], r12
-    vext.8      d3, d6, d7, #2
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d2, d0, d1, #3
-    vst1.8      {d3}, [r0,:64], r12
-    vext.8      d3, d6, d7, #3
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d2, d0, d1, #4
-    vst1.8      {d3}, [r0,:64], r12
-    vst1.8      {d2}, [r0,:64], r12
-    bx          lr
-endfunc
-
-function x264_predict_8x8_vr_neon
-    add         r1, #8
-    mov         r12, #FDEC_STRIDE
-    vld1.8      {d4,d5}, [r1,:64]
-
-    vext.8      q1, q2, q2, #14
-    vext.8      q0, q2, q2, #15
-
-    vhadd.u8    q3, q2, q1
-    vrhadd.u8   q2, q2, q0
-    vrhadd.u8   q0, q0, q3
-
-    vmov        d2, d0
-
-    vst1.8      {d5}, [r0,:64], r12
-    vuzp.8      d2, d0
-    vst1.8      {d1}, [r0,:64], r12
-    vext.8      d6, d0, d5, #7
-    vext.8      d3, d2, d1, #7
-    vst1.8      {d6}, [r0,:64], r12
-    vst1.8      {d3}, [r0,:64], r12
-    vext.8      d6, d0, d5, #6
-    vext.8      d3, d2, d1, #6
-    vst1.8      {d6}, [r0,:64], r12
-    vst1.8      {d3}, [r0,:64], r12
-    vext.8      d6, d0, d5, #5
-    vext.8      d3, d2, d1, #5
-    vst1.8      {d6}, [r0,:64], r12
-    vst1.8      {d3}, [r0,:64], r12
-    bx          lr
-endfunc
-
-function x264_predict_8x8_hd_neon
-    mov         r12, #FDEC_STRIDE
-    add         r1, #7
-
-    vld1.8      {d2,d3}, [r1]
-    vext.8      q3, q1, q1, #1
-    vext.8      q2, q1, q1, #2
-
-    vrhadd.u8   q8, q1, q3
-
-    vhadd.u8    q1, q2
-    vrhadd.u8   q0, q1, q3
-
-    vzip.8      d16, d0
-
-    vext.8      d2, d0, d1, #6
-    vext.8      d3, d0, d1, #4
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d2, d0, d1, #2
-    vst1.8      {d3}, [r0,:64], r12
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d2, d16, d0, #6
-    vst1.8      {d0}, [r0,:64], r12
-    vext.8      d3, d16, d0, #4
-    vst1.8      {d2}, [r0,:64], r12
-    vext.8      d2, d16, d0, #2
-    vst1.8      {d3}, [r0,:64], r12
-    vst1.8      {d2}, [r0,:64], r12
-    vst1.8      {d16}, [r0,:64], r12
-
-    bx          lr
-endfunc
-
-function x264_predict_8x8_hu_neon
-    mov         r12, #FDEC_STRIDE
-    add         r1, #7
-    vld1.8      {d7}, [r1]
-    vdup.8      d6, d7[0]
-    vrev64.8    d7, d7
-
-    vext.8      d4, d7, d6, #2
-    vext.8      d2, d7, d6, #1
-
-    vhadd.u8    d16, d7, d4
-    vrhadd.u8   d0, d2, d7
-    vrhadd.u8   d1, d16, d2
-
-    vzip.8      d0, d1
-
-    vdup.16     q1, d1[3]
-
-    vext.8      q2, q0, q1, #2
-    vext.8      q3, q0, q1, #4
-    vext.8      q8, q0, q1, #6
-    vst1.8      {d0}, [r0,:64], r12
-    vst1.8      {d4}, [r0,:64], r12
-    vst1.8      {d6}, [r0,:64], r12
-    vst1.8      {d16}, [r0,:64], r12
-
-    vst1.8      {d1}, [r0,:64], r12
-    vst1.8      {d5}, [r0,:64], r12
-    vst1.8      {d7}, [r0,:64], r12
-    vst1.8      {d17}, [r0,:64]
-    bx          lr
-endfunc
-
-function x264_predict_8x8c_dc_top_neon
-    sub         r2,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    vld1.8      {d0}, [r2,:64]
-    vpaddl.u8   d0,  d0
-    vpadd.u16   d0,  d0,  d0
-    vrshrn.u16  d0,  q0,  #2
-    vdup.8      d1,  d0[1]
-    vdup.8      d0,  d0[0]
-    vtrn.32     d0,  d1
-    b           pred8x8_dc_end
-endfunc
-
-function x264_predict_8x8c_dc_left_neon
-    mov         r1,  #FDEC_STRIDE
-    sub         r2,  r0,  #1
-    ldcol.8     d0,  r2,  r1
-    vpaddl.u8   d0,  d0
-    vpadd.u16   d0,  d0,  d0
-    vrshrn.u16  d0,  q0,  #2
-    vdup.8      d1,  d0[1]
-    vdup.8      d0,  d0[0]
-    b           pred8x8_dc_end
-endfunc
-
-function x264_predict_8x8c_dc_neon
-    sub         r2,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    vld1.8      {d0}, [r2,:64]
-    sub         r2,  r0,  #1
-    ldcol.8     d1,  r2,  r1
-    vtrn.32     d0,  d1
-    vpaddl.u8   q0,  q0
-    vpadd.u16   d0,  d0,  d1
-    vpadd.u16   d1,  d0,  d0
-    vrshrn.u16  d2,  q0,  #3
-    vrshrn.u16  d3,  q0,  #2
-    vdup.8      d0,  d2[4]
-    vdup.8      d1,  d3[3]
-    vdup.8      d4,  d3[2]
-    vdup.8      d5,  d2[5]
-    vtrn.32     q0,  q2
-pred8x8_dc_end:
-    add         r2,  r0,  r1,  lsl #2
-.rept 4
-    vst1.8      {d0}, [r0,:64], r1
-    vst1.8      {d1}, [r2,:64], r1
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_8x8c_h_neon
-    sub         r1, r0, #1
-    mov         ip, #FDEC_STRIDE
-.rept 4
-    vld1.8      {d0[]}, [r1], ip
-    vld1.8      {d2[]}, [r1], ip
-    vst1.64     {d0}, [r0,:64], ip
-    vst1.64     {d2}, [r0,:64], ip
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_8x8c_v_neon
-    sub         r0, r0, #FDEC_STRIDE
-    mov         ip, #FDEC_STRIDE
-    vld1.64     {d0}, [r0,:64], ip
-.rept 8
-    vst1.64     {d0}, [r0,:64], ip
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_8x8c_p_neon
-    sub         r3,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    add         r2,  r3,  #4
-    sub         r3,  r3,  #1
-    vld1.32     {d0[0]}, [r3]
-    vld1.32     {d2[0]}, [r2,:32], r1
-    ldcol.8     d0,  r3,  r1,  4,  hi=1
-    add         r3,  r3,  r1
-    ldcol.8     d3,  r3,  r1,  4
-    vaddl.u8    q8,  d2,  d3
-    vrev32.8    d0,  d0
-    vtrn.32     d2,  d3
-    vsubl.u8    q2,  d2,  d0
-    movrel      r3,  p16weight
-    vld1.16     {q0}, [r3,:128]
-    vmul.s16    d4,  d4,  d0
-    vmul.s16    d5,  d5,  d0
-    vpadd.i16   d4,  d4,  d5
-    vpaddl.s16  d4,  d4
-    vshl.i32    d5,  d4,  #4
-    vadd.s32    d4,  d4,  d5
-    vrshrn.s32  d4,  q2,  #5
-    mov         r3,  #0
-    vtrn.16     d4,  d5
-    vadd.i16    d2,  d4,  d5
-    vshl.i16    d3,  d2,  #2
-    vrev64.16   d16, d16
-    vsub.i16    d3,  d3,  d2
-    vadd.i16    d16, d16, d0
-    vshl.i16    d2,  d16, #4
-    vsub.i16    d2,  d2,  d3
-    vext.16     q0,  q0,  q0,  #7
-    vmov.16     d0[0], r3
-    vmul.i16    q0,  q0,  d4[0]
-    vdup.16     q1,  d2[0]
-    vdup.16     q3,  d5[0]
-    vadd.i16    q1,  q1,  q0
-    mov         r3,  #8
-1:
-    vqshrun.s16 d0,  q1,  #5
-    vadd.i16    q1,  q1,  q3
-    vst1.8      {d0}, [r0,:64], r1
-    subs        r3,  r3,  #1
-    bne         1b
-    bx          lr
-endfunc
-
-
-function x264_predict_8x16c_dc_top_neon
-    sub         r2,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    vld1.8      {d0}, [r2,:64]
-    vpaddl.u8   d0,  d0
-    vpadd.u16   d0,  d0,  d0
-    vrshrn.u16  d0,  q0,  #2
-    vdup.8      d1,  d0[1]
-    vdup.8      d0,  d0[0]
-    vtrn.32     d0,  d1
-
-    add         r2,  r0,  r1,  lsl #2
-.rept 4
-    vst1.8      {d0}, [r0,:64], r1
-    vst1.8      {d1}, [r2,:64], r1
-.endr
-    add         r2,  r2,  r1,  lsl #2
-    add         r0,  r0,  r1,  lsl #2
-.rept 4
-    vst1.8      {d0}, [r0,:64], r1
-    vst1.8      {d1}, [r2,:64], r1
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_8x16c_h_neon
-    sub         r1, r0, #1
-    mov         ip, #FDEC_STRIDE
-.rept 8
-    vld1.8      {d0[]}, [r1], ip
-    vld1.8      {d2[]}, [r1], ip
-    vst1.64     {d0}, [r0,:64], ip
-    vst1.64     {d2}, [r0,:64], ip
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_8x16c_p_neon
-    sub         r3,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    add         r2,  r3,  #4
-    sub         r3,  r3,  #1
-    vld1.32     {d0[0]}, [r3]
-    vld1.32     {d2[0]}, [r2,:32], r1
-    ldcol.8     d1,  r3,  r1
-    add         r3,  r3,  r1
-    ldcol.8     d3,  r3,  r1
-    vrev64.32   d16, d3
-    vaddl.u8    q8,  d2,  d16
-    vrev32.8    d0,  d0
-    vsubl.u8    q2,  d2,  d0
-    vrev64.8    d1,  d1
-    vsubl.u8    q3,  d3,  d1
-    movrel      r3,  p16weight
-    vld1.16     {q0}, [r3,:128]
-    vmul.s16    d4,  d4,  d0
-    vmul.s16    q3,  q3,  q0
-    vpadd.i16   d4,  d4,  d5
-    vpadd.i16   d6,  d6,  d7
-    vpaddl.s16  d4,  d4        @ d4[0] = H
-    vpaddl.s16  d6,  d6
-    vpadd.s32   d6,  d6        @ d6[0] = V
-    vshl.i32    d5,  d4,  #4
-    vadd.s32    d4,  d4,  d5   @ d4[0] = 17*H
-    vshl.i32    d7,  d6,  #2
-    vrshrn.s32  d4,  q2,  #5   @ d4[0] = b
-    vadd.s32    d6,  d6,  d7   @ d6[0] = 5*V
-    vrshrn.s32  d6,  q3,  #6   @ d6[0] = c
-    mov         r3,  #0
-    vshl.i16    d3,  d4,  #2
-    vsub.i16    d3,  d3,  d4   @ d2[0] = 3 * b
-    vshl.i16    d2,  d6,  #3
-    vadd.i16    d3,  d3,  d2   @ d2[0] = 3 * b + 8 * c
-    vsub.i16    d3,  d3,  d6   @ d2[0] = 3 * b + 7 * c
-    vrev64.16   d16, d16
-    vadd.i16    d16, d16, d0   @ d16[0] = src[]+src[] + 1
-    vshl.i16    d2,  d16, #4   @ d3[0] = a + 16
-    vsub.i16    d2,  d2,  d3   @ i00
-    vext.16     q0,  q0,  q0,  #7
-    vmov.16     d0[0], r3
-    vmul.i16    q0,  q0,  d4[0]
-    vdup.16     q1,  d2[0]
-    vdup.16     q3,  d6[0]
-    vadd.i16    q1,  q1,  q0
-    mov         r3,  #16
-1:
-    vqshrun.s16 d0,  q1,  #5
-    vadd.i16    q1,  q1,  q3
-    vst1.8      {d0}, [r0,:64], r1
-    subs        r3,  r3,  #1
-    bne         1b
-    bx          lr
-endfunc
-
-
-function x264_predict_16x16_dc_top_neon
-    sub         r2,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    vld1.8      {q0}, [r2,:128]
-    add16x8     q0,  d0,  d1,  d0,  d1
-    vrshrn.u16  d0,  q0,  #4
-    vdup.8      q0,  d0[0]
-    b           pred16x16_dc_end
-endfunc
-
-function x264_predict_16x16_dc_left_neon
-    mov         r1,  #FDEC_STRIDE
-    sub         r2,  r0,  #1
-    ldcol.8     d0,  r2,  r1
-    ldcol.8     d1,  r2,  r1
-    add16x8     q0,  d0,  d1,  d0,  d1
-    vrshrn.u16  d0,  q0,  #4
-    vdup.8      q0,  d0[0]
-    b           pred16x16_dc_end
-endfunc
-
-function x264_predict_16x16_dc_neon
-    sub         r3, r0, #FDEC_STRIDE
-    sub         r0, r0, #1
-    vld1.64     {d0-d1}, [r3,:128]
-    ldrb        ip, [r0], #FDEC_STRIDE
-    vaddl.u8    q0, d0, d1
-    ldrb        r1, [r0], #FDEC_STRIDE
-    vadd.u16    d0, d0, d1
-    vpadd.u16   d0, d0, d0
-    vpadd.u16   d0, d0, d0
-.rept 4
-    ldrb        r2, [r0], #FDEC_STRIDE
-    add         ip, ip, r1
-    ldrb        r3, [r0], #FDEC_STRIDE
-    add         ip, ip, r2
-    ldrb        r1, [r0], #FDEC_STRIDE
-    add         ip, ip, r3
-.endr
-    ldrb        r2, [r0], #FDEC_STRIDE
-    add         ip, ip, r1
-    ldrb        r3, [r0], #FDEC_STRIDE
-    add         ip, ip, r2
-
-    sub         r0, r0, #FDEC_STRIDE*16
-    add         ip, ip, r3
-    vdup.16     d1, ip
-    vadd.u16    d0, d0, d1
-    mov         r1, #FDEC_STRIDE
-    add         r0, r0, #1
-    vrshr.u16   d0, d0, #5
-    vdup.8      q0, d0[0]
-pred16x16_dc_end:
-.rept 16
-    vst1.64     {d0-d1}, [r0,:128], r1
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_16x16_h_neon
-    sub         r1, r0, #1
-    mov         ip, #FDEC_STRIDE
-.rept 8
-    vld1.8      {d0[]}, [r1], ip
-    vmov        d1, d0
-    vld1.8      {d2[]}, [r1], ip
-    vmov        d3, d2
-    vst1.64     {d0-d1}, [r0,:128], ip
-    vst1.64     {d2-d3}, [r0,:128], ip
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_16x16_v_neon
-    sub         r0, r0, #FDEC_STRIDE
-    mov         ip, #FDEC_STRIDE
-    vld1.64     {d0-d1}, [r0,:128], ip
-.rept 16
-    vst1.64     {d0-d1}, [r0,:128], ip
-.endr
-    bx          lr
-endfunc
-
-function x264_predict_16x16_p_neon
-    sub         r3,  r0,  #FDEC_STRIDE
-    mov         r1,  #FDEC_STRIDE
-    add         r2,  r3,  #8
-    sub         r3,  r3,  #1
-    vld1.8      {d0}, [r3]
-    vld1.8      {d2}, [r2,:64], r1
-    ldcol.8     d1,  r3,  r1
-    add         r3,  r3,  r1
-    ldcol.8     d3,  r3,  r1
-    vrev64.8    q0,  q0
-    vaddl.u8    q8,  d2,  d3
-    vsubl.u8    q2,  d2,  d0
-    vsubl.u8    q3,  d3,  d1
-    movrel      r3,  p16weight
-    vld1.8      {q0}, [r3,:128]
-    vmul.s16    q2,  q2,  q0
-    vmul.s16    q3,  q3,  q0
-    vadd.i16    d4,  d4,  d5
-    vadd.i16    d5,  d6,  d7
-    vpadd.i16   d4,  d4,  d5
-    vpadd.i16   d4,  d4,  d4
-    vshll.s16   q3,  d4,  #2
-    vaddw.s16   q2,  q3,  d4
-    vrshrn.s32  d4,  q2,  #6
-    mov         r3,  #0
-    vtrn.16     d4,  d5
-    vadd.i16    d2,  d4,  d5
-    vshl.i16    d3,  d2,  #3
-    vrev64.16   d16, d17
-    vsub.i16    d3,  d3,  d2
-    vadd.i16    d16, d16, d0
-    vshl.i16    d2,  d16, #4
-    vsub.i16    d2,  d2,  d3
-    vshl.i16    d3,  d4,  #4
-    vext.16     q0,  q0,  q0,  #7
-    vsub.i16    d6,  d5,  d3
-    vmov.16     d0[0], r3
-    vmul.i16    q0,  q0,  d4[0]
-    vdup.16     q1,  d2[0]
-    vdup.16     q2,  d4[0]
-    vdup.16     q3,  d6[0]
-    vshl.i16    q2,  q2,  #3
-    vadd.i16    q1,  q1,  q0
-    vadd.i16    q3,  q3,  q2
-    mov         r3,  #16
-1:
-    vqshrun.s16 d0,  q1,  #5
-    vadd.i16    q1,  q1,  q2
-    vqshrun.s16 d1,  q1,  #5
-    vadd.i16    q1,  q1,  q3
-    vst1.8      {q0}, [r0,:128], r1
-    subs        r3,  r3,  #1
-    bne         1b
-    bx          lr
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/predict-c.c b/android/src/main/libenc/jni/libx264/common/arm/predict-c.c
deleted file mode 100755
index eb52d39..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/predict-c.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*****************************************************************************
- * predict.c: arm intra prediction
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "predict.h"
-#include "pixel.h"
-
-void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
-{
-    if (!(cpu&X264_CPU_ARMV6))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
-    pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
-    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
-    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
-
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
-    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
-    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
-    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
-    pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
-    pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
-    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
-    pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
-    pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
-    pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
-    pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
-    pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
-    pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
-    pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
-{
-    if (!(cpu&X264_CPU_NEON))
-        return;
-
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
-    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
-    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
-    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
-    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
-    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/arm/predict.h b/android/src/main/libenc/jni/libx264/common/arm/predict.h
deleted file mode 100755
index 1ce4ea6..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/predict.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*****************************************************************************
- * predict.h: arm intra prediction
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ARM_PREDICT_H
-#define X264_ARM_PREDICT_H
-
-void x264_predict_4x4_dc_armv6( uint8_t *src );
-void x264_predict_4x4_dc_top_neon( uint8_t *src );
-void x264_predict_4x4_v_armv6( uint8_t *src );
-void x264_predict_4x4_h_armv6( uint8_t *src );
-void x264_predict_4x4_ddr_armv6( uint8_t *src );
-void x264_predict_4x4_ddl_neon( uint8_t *src );
-
-void x264_predict_8x8c_dc_neon( uint8_t *src );
-void x264_predict_8x8c_dc_top_neon( uint8_t *src );
-void x264_predict_8x8c_dc_left_neon( uint8_t *src );
-void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x8c_v_neon( uint8_t *src );
-void x264_predict_8x8c_p_neon( uint8_t *src );
-
-void x264_predict_8x16c_h_neon( uint8_t *src );
-void x264_predict_8x16c_dc_top_neon( uint8_t *src );
-void x264_predict_8x16c_p_neon( uint8_t *src );
-
-void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
-
-void x264_predict_16x16_dc_neon( uint8_t *src );
-void x264_predict_16x16_dc_top_neon( uint8_t *src );
-void x264_predict_16x16_dc_left_neon( uint8_t *src );
-void x264_predict_16x16_h_neon( uint8_t *src );
-void x264_predict_16x16_v_neon( uint8_t *src );
-void x264_predict_16x16_p_neon( uint8_t *src );
-
-void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
-void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
-void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] );
-void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/arm/quant-a.S b/android/src/main/libenc/jni/libx264/common/arm/quant-a.S
deleted file mode 100755
index cc2f40d..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/quant-a.S
+++ /dev/null
@@ -1,573 +0,0 @@
-/****************************************************************************
- * quant.S: arm quantization and level-run
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *          Janne Grunau <janne-x264@jannau.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "asm.S"
-
-.section .rodata
-.align 4
-pmovmskb_byte:
-.byte 1,2,4,8,16,32,64,128
-.byte 1,2,4,8,16,32,64,128
-
-mask_2bit:
-.byte 3,12,48,192,3,12,48,192
-.byte 3,12,48,192,3,12,48,192
-
-mask_1bit:
-.byte 128,64,32,16,8,4,2,1
-.byte 128,64,32,16,8,4,2,1
-
-.text
-
-.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
-    vadd.u16    q8,  q8,  \bias0
-    vadd.u16    q9,  q9,  \bias1
-.ifc \load_mf, yes
-    vld1.64     {\mf0-\mf3}, [r1,:128]!
-.endif
-    vmull.u16   q10, d16, \mf0
-    vmull.u16   q11, d17, \mf1
-    vmull.u16   q12, d18, \mf2
-    vmull.u16   q13, d19, \mf3
-    vshr.s16    q14, q14, #15
-    vshr.s16    q15, q15, #15
-    vshrn.u32   d16, q10, #16
-    vshrn.u32   d17, q11, #16
-    vshrn.u32   d18, q12, #16
-    vshrn.u32   d19, q13, #16
-    veor        q8,  q8,  q14
-    veor        q9,  q9,  q15
-    vsub.s16    q8,  q8,  q14
-    vsub.s16    q9,  q9,  q15
-    vorr        \mask, q8,  q9
-    vst1.64     {d16-d19}, [r0,:128]!
-.endm
-
-.macro QUANT_END d
-    vmov        r2,  r3,  \d
-    orrs        r0,  r2,  r3
-    movne       r0,  #1
-    bx          lr
-.endm
-
-// quant_2x2_dc( int16_t dct[4], int mf, int bias )
-function x264_quant_2x2_dc_neon
-    vld1.64     {d0}, [r0,:64]
-    vabs.s16    d3,  d0
-    vdup.16     d2,  r2
-    vdup.16     d1,  r1
-    vadd.u16    d3,  d3,  d2
-    vmull.u16   q3,  d3,  d1
-    vshr.s16    d0,  d0,  #15
-    vshrn.u32   d3,  q3,  #16
-    veor        d3,  d3,  d0
-    vsub.s16    d3,  d3,  d0
-    vst1.64     {d3}, [r0,:64]
-    QUANT_END   d3
-endfunc
-
-// quant_4x4_dc( int16_t dct[16], int mf, int bias )
-function x264_quant_4x4_dc_neon
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    vdup.16     q0,  r2
-    vdup.16     q2,  r1
-    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5,  q0
-    vorr        d0,  d0,  d1
-    QUANT_END   d0
-endfunc
-
-// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4_neon
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    vld1.64     {d0-d3}, [r2,:128]
-    vld1.64     {d4-d7}, [r1,:128]
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7, q0
-    vorr        d0,  d0,  d1
-    QUANT_END   d0
-endfunc
-
-// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4x4_neon
-    vpush       {d8-d15}
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    vld1.64     {d0-d3},   [r2,:128]
-    vld1.64     {d4-d7},   [r1,:128]
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q4
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q5
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q6
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q7
-    vorr        d8,  d8,  d9
-    vorr       d10, d10, d11
-    vorr       d12, d12, d13
-    vorr       d14, d14, d15
-    vmov        r0,  r1,  d8
-    vmov        r2,  r3, d10
-    orrs        r0,  r1
-    movne       r0,  #1
-    orrs        r2,  r3
-    orrne       r0,  #2
-    vmov        r1,  r2, d12
-    vmov        r3,  ip, d14
-    orrs        r1,  r2
-    orrne       r0,  #4
-    orrs        r3,  ip
-    orrne       r0,  #8
-    vpop        {d8-d15}
-    bx          lr
-endfunc
-
-// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-function x264_quant_8x8_neon
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    vld1.64     {d0-d3},   [r2,:128]!
-    vld1.64     {d4-d7},   [r1,:128]!
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q0
-.rept 3
-    vld1.64     {d28-d31}, [r0,:128]
-    vabs.s16    q8,  q14
-    vabs.s16    q9,  q15
-    vld1.64     {d2-d5},   [r2,:128]!
-    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7,  q1, yes
-    vorr        q0,  q0,  q1
-.endr
-    vorr        d0,  d0,  d1
-    QUANT_END   d0
-endfunc
-
-.macro DEQUANT_START mf_size offset dc=no
-    mov         r3,  #0x2b
-    mul         r3,  r3,  r2
-    lsr         r3,  r3,  #8            // i_qbits = i_qp / 6
-    add         ip,  r3,  r3,  lsl #1
-    sub         r2,  r2,  ip,  lsl #1   // i_mf = i_qp % 6
-.ifc \dc,no
-    add         r1,  r1,  r2, lsl #\mf_size  // dequant_mf[i_mf]
-.else
-    ldr         r1, [r1,  r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
-.endif
-    subs        r3,  r3,  #\offset      // 6 for 8x8
-.endm
-
-// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-.macro DEQUANT size bits
-function x264_dequant_\size\()_neon
-    DEQUANT_START \bits+2, \bits
-.ifc \size, 8x8
-    mov         r2,  #4
-.endif
-    blt         dequant_\size\()_rshift
-
-    vdup.16     q15, r3
-dequant_\size\()_lshift_loop:
-.ifc \size, 8x8
-    subs        r2,  r2,  #1
-.endif
-    vld1.32     {d16-d17}, [r1,:128]!
-    vld1.32     {d18-d19}, [r1,:128]!
-    vmovn.s32   d4,  q8
-    vld1.32     {d20-d21}, [r1,:128]!
-    vmovn.s32   d5,  q9
-    vld1.32     {d22-d23}, [r1,:128]!
-    vmovn.s32   d6,  q10
-    vld1.16     {d0-d3},   [r0,:128]
-    vmovn.s32   d7,  q11
-    vmul.s16    q0,  q0,  q2
-    vmul.s16    q1,  q1,  q3
-    vshl.s16    q0,  q0,  q15
-    vshl.s16    q1,  q1,  q15
-    vst1.16     {d0-d3},   [r0,:128]!
-.ifc \size, 8x8
-    bgt         dequant_\size\()_lshift_loop
-.endif
-    bx          lr
-
-dequant_\size\()_rshift:
-    vdup.32     q15, r3
-    rsb         r3,  r3,  #0
-    mov         ip,  #1
-    sub         r3,  r3,  #1
-    lsl         ip,  ip,  r3
-
-.ifc \size, 8x8
-dequant_\size\()_rshift_loop:
-    subs        r2,  r2,  #1
-.endif
-    vdup.32     q10, ip
-    vld1.32     {d16-d17}, [r1,:128]!
-    vdup.32     q11, ip
-    vld1.32     {d18-d19}, [r1,:128]!
-    vmovn.s32   d4,  q8
-    vld1.32     {d16-d17}, [r1,:128]!
-    vmovn.s32   d5,  q9
-    vld1.32     {d18-d19}, [r1,:128]!
-    vmovn.s32   d6,  q8
-    vld1.16     {d0-d3},   [r0,:128]
-    vmovn.s32   d7,  q9
-    vdup.32     q12, ip
-    vdup.32     q13, ip
-
-    vmlal.s16   q10, d0,  d4
-    vmlal.s16   q11, d1,  d5
-    vmlal.s16   q12, d2,  d6
-    vmlal.s16   q13, d3,  d7
-    vshl.s32    q10, q10, q15
-    vshl.s32    q11, q11, q15
-    vshl.s32    q12, q12, q15
-    vshl.s32    q13, q13, q15
-
-    vmovn.s32   d0,  q10
-    vmovn.s32   d1,  q11
-    vmovn.s32   d2,  q12
-    vmovn.s32   d3,  q13
-    vst1.16     {d0-d3},   [r0,:128]!
-.ifc \size, 8x8
-    bgt         dequant_\size\()_rshift_loop
-.endif
-    bx          lr
-endfunc
-.endm
-
-DEQUANT 4x4, 4
-DEQUANT 8x8, 6
-
-// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-function x264_dequant_4x4_dc_neon
-    DEQUANT_START 6, 6, yes
-    blt         dequant_4x4_dc_rshift
-
-    lsl         r1,  r1,  r3
-    vdup.16     q2,  r1
-    vld1.16     {d0-d3},   [r0,:128]
-    vdup.16     q15, r3
-
-    vmul.s16    q0,  q0,  q2
-    vmul.s16    q1,  q1,  q2
-    vst1.16     {d0-d3},   [r0,:128]
-    bx          lr
-
-dequant_4x4_dc_rshift:
-    vdup.16     d4,  r1
-    vdup.32     q15, r3
-    rsb         r3,  r3,  #0
-    mov         ip,  #1
-    sub         r3,  r3,  #1
-    lsl         ip,  ip,  r3
-
-    vdup.32     q10, ip
-    vdup.32     q11, ip
-    vld1.16     {d0-d3},   [r0,:128]
-    vdup.32     q12, ip
-    vdup.32     q13, ip
-
-    vmlal.s16   q10, d0,  d4
-    vmlal.s16   q11, d1,  d4
-    vmlal.s16   q12, d2,  d4
-    vmlal.s16   q13, d3,  d4
-    vshl.s32    q10, q10, q15
-    vshl.s32    q11, q11, q15
-    vshl.s32    q12, q12, q15
-    vshl.s32    q13, q13, q15
-
-    vmovn.s32   d0,  q10
-    vmovn.s32   d1,  q11
-    vmovn.s32   d2,  q12
-    vmovn.s32   d3,  q13
-    vst1.16     {d0-d3},   [r0,:128]
-    bx          lr
-endfunc
-
-.macro decimate_score_1x size
-function x264_decimate_score\size\()_neon
-    vld1.16     {q0, q1}, [r0, :128]
-    movrel      r3, mask_2bit
-    vmov.s8     q3,  #0x01
-    vqmovn.s16  d0,  q0
-    vqmovn.s16  d1,  q1
-    vqabs.s8    q2,  q0
-    vld1.8      {q8}, [r3, :128]
-    vceq.s8     q1,  q0,  #0
-    vcgt.s8     q2,  q2,  q3
-    vand.u8     q1,  q1,  q8
-    vshrn.u16   d4,  q2,  #4
-    vpadd.u8    d2,  d2,  d3
-    vpadd.u8    d4,  d4,  d4
-    vpadd.u8    d2,  d2,  d2
-    vmov.32     r2,  d4[0]
-    vmov.32     r1,  d2[0]
-    cmp         r2,  #0
-    beq         0f
-    mov         r0,  #9
-    bx          lr
-0:
-    mvns        r1,  r1
-    mov         r0,  #0
-    bxeq        lr
-.ifc \size, 15
-    lsr         r1,  r1,  #2
-.endif
-    rbit        r1,  r1
-    movrelx     r3,  X(x264_decimate_table4), r2
-1:
-    clz         r2,  r1
-    lsl         r1,  r1,  r2
-    lsr         r12, r2,  #1
-    ldrb        r2,  [r3, r12]
-    lsls        r1,  r1,  #2
-    add         r0,  r0,  r2
-    bne         1b
-    bx          lr
-endfunc
-.endm
-
-decimate_score_1x 15
-decimate_score_1x 16
-
-function x264_decimate_score64_neon
-    push        {lr}
-    vld1.16     {q8,  q9},  [r0, :128]!
-    vld1.16     {q10, q11}, [r0, :128]!
-    vld1.16     {q12, q13}, [r0, :128]!
-    vld1.16     {q14, q15}, [r0, :128]
-    movrel      r3, mask_1bit
-    vmov.s8     q3,  #0x01
-    vqmovn.s16  d17, q8
-    vqmovn.s16  d16, q9
-    vqmovn.s16  d19, q10
-    vqmovn.s16  d18, q11
-    vqmovn.s16  d21, q12
-    vqmovn.s16  d20, q13
-    vqmovn.s16  d23, q14
-    vqmovn.s16  d22, q15
-    vqabs.s8    q12, q8
-    vqabs.s8    q13, q9
-    vqabs.s8    q14, q10
-    vqabs.s8    q15, q11
-    vld1.8      {q2}, [r3, :128]
-    vceq.s8     q8,  q8,  #0
-    vceq.s8     q9,  q9,  #0
-    vceq.s8     q10, q10, #0
-    vceq.s8     q11, q11, #0
-    vmax.s8     q12, q12, q13
-    vmax.s8     q14, q14, q15
-    vand.u8     q8,  q8,  q2
-    vand.u8     q9,  q9,  q2
-    vand.u8     q10, q10, q2
-    vand.u8     q11, q11, q2
-    vmax.s8     q12, q12, q14
-    vpadd.u8    d18, d18, d19
-    vpadd.u8    d19, d16, d17
-    vcgt.s8     q12, q12, q3
-    vpadd.u8    d22, d22, d23
-    vpadd.u8    d23, d20, d21
-    vshrn.u16   d24, q12, #4
-    vpadd.u8    d16, d22, d23
-    vpadd.u8    d17, d18, d19
-    vpadd.u8    d24, d24, d24
-    vpadd.u8    d16, d16, d17
-    vmov.32     r2,  d24[0]
-    vmov        r12, r1,  d16
-    cmp         r2,  #0
-    beq         0f
-    mov         r0,  #9
-    pop         {pc}
-0:
-    mvns        r1,  r1
-    mvn         r12, r12
-    mov         r0,  #0
-    mov         lr,  #32
-    movrelx     r3,  X(x264_decimate_table8), r2
-    beq         2f
-1:
-    clz         r2,  r1
-    lsl         r1,  r1,  r2
-    sub         lr,  lr,  r2
-    ldrb        r2,  [r3, r2]
-    lsls        r1,  r1,  #1
-    sub         lr,  lr,  #1
-    add         r0,  r0,  r2
-    bne         1b
-2:
-    cmp         r12, #0
-    popeq       {pc}
-
-    clz         r2,  r12
-    lsl         r1,  r12, r2
-    add         r2,  r2,  lr
-    ldrb        r2,  [r3, r2]
-    lsls        r1,  r1,  #1
-    add         r0,  r0,  r2
-    popeq       {pc}
-3:
-    clz         r2,  r1
-    lsl         r1,  r1,  r2
-    ldrb        r2,  [r3, r2]
-    lsls        r1,  r1,  #1
-    add         r0,  r0,  r2
-    bne         3b
-    pop         {pc}
-endfunc
-
-// int coeff_last( int16_t *l )
-function x264_coeff_last4_arm
-    ldrd        r2,  r3,  [r0]
-    subs        r0,  r3,  #0
-    movne       r0,  #2
-    movne       r2,  r3
-    lsrs        r2,  r2,  #16
-    addne       r0,  r0,  #1
-    bx          lr
-endfunc
-
-function x264_coeff_last8_arm
-    ldrd        r2,  r3,  [r0, #8]
-    orrs        ip,  r2,  r3
-    movne       r0,  #4
-    ldrdeq      r2,  r3,  [r0]
-    moveq       r0,  #0
-    tst         r3,  r3
-    addne       r0,  #2
-    movne       r2,  r3
-    lsrs        r2,  r2,  #16
-    addne       r0,  r0,  #1
-    bx          lr
-endfunc
-
-.macro COEFF_LAST_1x size
-function x264_coeff_last\size\()_neon
-.if \size == 15
-    sub         r0,  r0,  #2
-.endif
-    vld1.64     {d0-d3}, [r0,:128]
-    vtst.16     q0,  q0
-    vtst.16     q1,  q1
-    vshrn.u16   d0,  q0,  #8
-    vshrn.u16   d1,  q1,  #8
-    vshrn.u16   d0,  q0,  #4
-    vclz.i32    d0,  d0
-    mov         ip,  #7
-    mov         r3,  #\size - 9
-    vmov        r0,  r1,  d0
-
-    subs        r1,  ip,  r1,  lsr #2
-    addge       r0,  r1,  #\size - 8
-    subslt      r0,  r3,  r0,  lsr #2
-    movlt       r0,  #0
-    bx          lr
-endfunc
-.endm
-
-COEFF_LAST_1x 15
-COEFF_LAST_1x 16
-
-function x264_coeff_last64_neon
-    vld1.64     {d16-d19}, [r0,:128]!
-    vqmovn.u16  d16, q8
-    vqmovn.u16  d17, q9
-    vld1.64     {d20-d23}, [r0,:128]!
-    vqmovn.u16  d18, q10
-    vqmovn.u16  d19, q11
-    vld1.64     {d24-d27}, [r0,:128]!
-    vqmovn.u16  d20, q12
-    vqmovn.u16  d21, q13
-    vld1.64     {d28-d31}, [r0,:128]!
-    vqmovn.u16  d22, q14
-    vqmovn.u16  d23, q15
-
-    movrel      r1, pmovmskb_byte
-    vld1.64     {d0-d1}, [r1,:128]
-
-    vtst.8      q8,  q8
-    vtst.8      q9,  q9
-    vtst.8      q10, q10
-    vtst.8      q11, q11
-
-    vand        q8,  q8,  q0
-    vand        q9,  q9,  q0
-    vand        q10, q10, q0
-    vand        q11, q11, q0
-
-    vpadd.u8    d0,  d16, d17
-    vpadd.u8    d1,  d18, d19
-    vpadd.u8    d2,  d20, d21
-    vpadd.u8    d3,  d22, d23
-    vpadd.u8    d0,  d0,  d1
-    vpadd.u8    d1,  d2,  d3
-    vpadd.u8    d0,  d0,  d1
-    vclz.i32    d0,  d0
-    mov         ip,  #31
-    vmov        r0,  r1,  d0
-
-    subs        r1,  ip,  r1
-    addge       r0,  r1,  #32
-    subslt      r0,  ip,  r0
-    movlt       r0,  #0
-    bx          lr
-endfunc
-
-function x264_denoise_dct_neon
-1:  subs        r3,  r3,  #16
-    vld1.16     {q0,  q1},  [r0]
-    vld1.32     {q12, q13}, [r1]!
-    vld1.32     {q14, q15}, [r1]
-    sub         r1,  #32
-    vabs.s16    q8,  q0
-    vabs.s16    q9,  q1
-    vld1.16     {q2, q3}, [r2]!
-    vclt.s16    q10, q0,  #0
-    vclt.s16    q11, q1,  #0
-    vaddw.u16   q12, q12, d16
-    vaddw.u16   q13, q13, d17
-    vqsub.u16   q0,  q8,  q2
-    vqsub.u16   q1,  q9,  q3
-    vaddw.u16   q14, q14, d18
-    vaddw.u16   q15, q15, d19
-    vneg.s16    q8,  q0
-    vneg.s16    q9,  q1
-    vbsl        q10, q8,  q0
-    vbsl        q11, q9,  q1
-    vst1.32     {q12, q13}, [r1]!
-    vst1.32     {q14, q15}, [r1]!
-    vst1.16     {q10, q11}, [r0]!
-    bgt         1b
-    bx          lr
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/common/arm/quant.h b/android/src/main/libenc/jni/libx264/common/arm/quant.h
deleted file mode 100755
index dff3f9e..0000000
--- a/android/src/main/libenc/jni/libx264/common/arm/quant.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*****************************************************************************
- * quant.h: arm quantization and level-run
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ARM_QUANT_H
-#define X264_ARM_QUANT_H
-
-int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
-
-int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
-int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
-int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
-
-void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-
-int x264_decimate_score15_neon( int16_t * );
-int x264_decimate_score16_neon( int16_t * );
-int x264_decimate_score64_neon( int16_t * );
-
-int x264_coeff_last4_arm( int16_t * );
-int x264_coeff_last8_arm( int16_t * );
-int x264_coeff_last15_neon( int16_t * );
-int x264_coeff_last16_neon( int16_t * );
-int x264_coeff_last64_neon( int16_t * );
-
-void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/bitstream.c b/android/src/main/libenc/jni/libx264/common/bitstream.c
deleted file mode 100755
index f1a4996..0000000
--- a/android/src/main/libenc/jni/libx264/common/bitstream.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*****************************************************************************
- * bitstream.c: bitstream writing
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
-{
-    if( src < end ) *dst++ = *src++;
-    if( src < end ) *dst++ = *src++;
-    while( src < end )
-    {
-        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
-            *dst++ = 0x03;
-        *dst++ = *src++;
-    }
-    return dst;
-}
-
-uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
-void x264_cabac_block_residual_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_8x8_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_8x8_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-
-uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
-
-/****************************************************************************
- * x264_nal_encode:
- ****************************************************************************/
-void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
-{
-    uint8_t *src = nal->p_payload;
-    uint8_t *end = nal->p_payload + nal->i_payload;
-    uint8_t *orig_dst = dst;
-
-    if( h->param.b_annexb )
-    {
-        if( nal->b_long_startcode )
-            *dst++ = 0x00;
-        *dst++ = 0x00;
-        *dst++ = 0x00;
-        *dst++ = 0x01;
-    }
-    else /* save room for size later */
-        dst += 4;
-
-    /* nal header */
-    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
-
-    dst = h->bsf.nal_escape( dst, src, end );
-    int size = dst - orig_dst;
-
-    /* Apply AVC-Intra padding */
-    if( h->param.i_avcintra_class )
-    {
-        int padding = nal->i_payload + nal->i_padding + NALU_OVERHEAD - size;
-        if( padding > 0 )
-        {
-            memset( dst, 0, padding );
-            size += padding;
-        }
-        nal->i_padding = X264_MAX( padding, 0 );
-    }
-
-    /* Write the size header for mp4/etc */
-    if( !h->param.b_annexb )
-    {
-        /* Size doesn't include the size of the header we're writing now. */
-        int chunk_size = size - 4;
-        orig_dst[0] = chunk_size >> 24;
-        orig_dst[1] = chunk_size >> 16;
-        orig_dst[2] = chunk_size >> 8;
-        orig_dst[3] = chunk_size >> 0;
-    }
-
-    nal->i_payload = size;
-    nal->p_payload = orig_dst;
-    x264_emms();
-}
-
-void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
-{
-    memset( pf, 0, sizeof(*pf) );
-
-    pf->nal_escape = x264_nal_escape_c;
-#if HAVE_MMX
-#if ARCH_X86_64
-    pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
-    pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
-    pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
-#endif
-
-    if( cpu&X264_CPU_MMX2 )
-        pf->nal_escape = x264_nal_escape_mmx2;
-    if( cpu&X264_CPU_SSE2 )
-    {
-#if ARCH_X86_64
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
-            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
-            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
-        }
-#endif
-        if( cpu&X264_CPU_SSE2_IS_FAST )
-            pf->nal_escape = x264_nal_escape_sse2;
-    }
-#if ARCH_X86_64
-    if( cpu&X264_CPU_SSSE3 )
-    {
-        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
-        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
-            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
-        }
-    }
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf->nal_escape = x264_nal_escape_avx2;
-        if( cpu&X264_CPU_BMI2 )
-            pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
-    }
-#endif
-#endif
-#if HAVE_ARMV6
-    if( cpu&X264_CPU_NEON )
-        pf->nal_escape = x264_nal_escape_neon;
-#endif
-#if ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-        pf->nal_escape = x264_nal_escape_neon;
-#endif
-}
diff --git a/android/src/main/libenc/jni/libx264/common/bitstream.h b/android/src/main/libenc/jni/libx264/common/bitstream.h
deleted file mode 100755
index a58746b..0000000
--- a/android/src/main/libenc/jni/libx264/common/bitstream.h
+++ /dev/null
@@ -1,312 +0,0 @@
-/*****************************************************************************
- * bitstream.h: bitstream writing
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_BS_H
-#define X264_BS_H
-
-typedef struct
-{
-    uint8_t i_bits;
-    uint8_t i_size;
-} vlc_t;
-
-typedef struct
-{
-    uint16_t i_bits;
-    uint8_t  i_size;
-    /* Next level table to use */
-    uint8_t  i_next;
-} vlc_large_t;
-
-typedef struct bs_s
-{
-    uint8_t *p_start;
-    uint8_t *p;
-    uint8_t *p_end;
-
-    uintptr_t cur_bits;
-    int     i_left;    /* i_count number of available bits */
-    int     i_bits_encoded; /* RD only */
-} bs_t;
-
-typedef struct
-{
-    int32_t last;
-    int32_t mask;
-    ALIGNED_16( dctcoef level[18] );
-} x264_run_level_t;
-
-extern const vlc_t x264_coeff0_token[6];
-extern const vlc_t x264_coeff_token[6][16][4];
-extern const vlc_t x264_total_zeros[15][16];
-extern const vlc_t x264_total_zeros_2x2_dc[3][4];
-extern const vlc_t x264_total_zeros_2x4_dc[7][8];
-
-typedef struct
-{
-    uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
-    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
-                                           intptr_t ctx_block_cat, x264_cabac_t *cb );
-    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
-                                              intptr_t ctx_block_cat, x264_cabac_t *cb );
-    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
-                                                  intptr_t ctx_block_cat, x264_cabac_t *cb );
-} x264_bitstream_function_t;
-
-void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
-
-/* A larger level table size theoretically could help a bit at extremely
- * high bitrates, but the cost in cache is usually too high for it to be
- * useful.
- * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
- * FIXME: Do further testing? */
-#define LEVEL_TABLE_SIZE 128
-extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
-
-/* The longest possible set of zero run codes sums to 25 bits.  This leaves
- * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
-
-extern uint32_t x264_run_before[1<<16];
-
-static inline void bs_init( bs_t *s, void *p_data, int i_data )
-{
-    int offset = ((intptr_t)p_data & 3);
-    s->p       = s->p_start = (uint8_t*)p_data - offset;
-    s->p_end   = (uint8_t*)p_data + i_data;
-    s->i_left  = (WORD_SIZE - offset)*8;
-    s->cur_bits = endian_fix32( M32(s->p) );
-    s->cur_bits >>= (4-offset)*8;
-}
-static inline int bs_pos( bs_t *s )
-{
-    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
-}
-
-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
-static inline void bs_flush( bs_t *s )
-{
-    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
-    s->p += WORD_SIZE - (s->i_left >> 3);
-    s->i_left = WORD_SIZE*8;
-}
-/* The inverse of bs_flush: prepare the bitstream to be written to again. */
-static inline void bs_realign( bs_t *s )
-{
-    int offset = ((intptr_t)s->p & 3);
-    if( offset )
-    {
-        s->p       = (uint8_t*)s->p - offset;
-        s->i_left  = (WORD_SIZE - offset)*8;
-        s->cur_bits = endian_fix32( M32(s->p) );
-        s->cur_bits >>= (4-offset)*8;
-    }
-}
-
-static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
-{
-    if( WORD_SIZE == 8 )
-    {
-        s->cur_bits = (s->cur_bits << i_count) | i_bits;
-        s->i_left -= i_count;
-        if( s->i_left <= 32 )
-        {
-#if WORDS_BIGENDIAN
-            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
-#else
-            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
-#endif
-            s->i_left += 32;
-            s->p += 4;
-        }
-    }
-    else
-    {
-        if( i_count < s->i_left )
-        {
-            s->cur_bits = (s->cur_bits << i_count) | i_bits;
-            s->i_left -= i_count;
-        }
-        else
-        {
-            i_count -= s->i_left;
-            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
-            M32( s->p ) = endian_fix( s->cur_bits );
-            s->p += 4;
-            s->cur_bits = i_bits;
-            s->i_left = 32 - i_count;
-        }
-    }
-}
-
-/* Special case to eliminate branch in normal bs_write. */
-/* Golomb never writes an even-size code, so this is only used in slice headers. */
-static inline void bs_write32( bs_t *s, uint32_t i_bits )
-{
-    bs_write( s, 16, i_bits >> 16 );
-    bs_write( s, 16, i_bits );
-}
-
-static inline void bs_write1( bs_t *s, uint32_t i_bit )
-{
-    s->cur_bits <<= 1;
-    s->cur_bits |= i_bit;
-    s->i_left--;
-    if( s->i_left == WORD_SIZE*8-32 )
-    {
-        M32( s->p ) = endian_fix32( s->cur_bits );
-        s->p += 4;
-        s->i_left = WORD_SIZE*8;
-    }
-}
-
-static inline void bs_align_0( bs_t *s )
-{
-    bs_write( s, s->i_left&7, 0 );
-    bs_flush( s );
-}
-static inline void bs_align_1( bs_t *s )
-{
-    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
-    bs_flush( s );
-}
-static inline void bs_align_10( bs_t *s )
-{
-    if( s->i_left&7 )
-        bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) );
-}
-
-/* golomb functions */
-
-static const uint8_t x264_ue_size_tab[256] =
-{
-     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
-     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
-    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
-    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-};
-
-static inline void bs_write_ue_big( bs_t *s, unsigned int val )
-{
-    int size = 0;
-    int tmp = ++val;
-    if( tmp >= 0x10000 )
-    {
-        size = 32;
-        tmp >>= 16;
-    }
-    if( tmp >= 0x100 )
-    {
-        size += 16;
-        tmp >>= 8;
-    }
-    size += x264_ue_size_tab[tmp];
-    bs_write( s, size>>1, 0 );
-    bs_write( s, (size>>1)+1, val );
-}
-
-/* Only works on values under 255. */
-static inline void bs_write_ue( bs_t *s, int val )
-{
-    bs_write( s, x264_ue_size_tab[val+1], val+1 );
-}
-
-static inline void bs_write_se( bs_t *s, int val )
-{
-    int size = 0;
-    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
-    /* 4 instructions on x86, 3 on ARM */
-    int tmp = 1 - val*2;
-    if( tmp < 0 ) tmp = val*2;
-    val = tmp;
-
-    if( tmp >= 0x100 )
-    {
-        size = 16;
-        tmp >>= 8;
-    }
-    size += x264_ue_size_tab[tmp];
-    bs_write( s, size, val );
-}
-
-static inline void bs_write_te( bs_t *s, int x, int val )
-{
-    if( x == 1 )
-        bs_write1( s, 1^val );
-    else //if( x > 1 )
-        bs_write_ue( s, val );
-}
-
-static inline void bs_rbsp_trailing( bs_t *s )
-{
-    bs_write1( s, 1 );
-    bs_write( s, s->i_left&7, 0  );
-}
-
-static ALWAYS_INLINE int bs_size_ue( unsigned int val )
-{
-    return x264_ue_size_tab[val+1];
-}
-
-static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
-{
-    if( val < 255 )
-        return x264_ue_size_tab[val+1];
-    else
-        return x264_ue_size_tab[(val+1)>>8] + 16;
-}
-
-static ALWAYS_INLINE int bs_size_se( int val )
-{
-    int tmp = 1 - val*2;
-    if( tmp < 0 ) tmp = val*2;
-    if( tmp < 256 )
-        return x264_ue_size_tab[tmp];
-    else
-        return x264_ue_size_tab[tmp>>8]+16;
-}
-
-static ALWAYS_INLINE int bs_size_te( int x, int val )
-{
-    if( x == 1 )
-        return 1;
-    else //if( x > 1 )
-        return x264_ue_size_tab[val+1];
-}
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/cabac.c b/android/src/main/libenc/jni/libx264/common/cabac.c
deleted file mode 100755
index 9b611c0..0000000
--- a/android/src/main/libenc/jni/libx264/common/cabac.c
+++ /dev/null
@@ -1,1485 +0,0 @@
-/*****************************************************************************
- * cabac.c: arithmetic coder
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-
-static const int8_t x264_cabac_context_init_I[1024][2] =
-{
-    /* 0 - 10 */
-    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
-    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
-    { -6,  53 }, { -1, 54 },  {  7,  51 },
-
-    /* 11 - 23 unused for I */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },
-
-    /* 24- 39 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-
-    /* 40 - 53 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },
-
-    /* 54 - 59 */
-    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
-    { 0, 0 },    { 0, 0 },
-
-    /* 60 - 69 */
-    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-    { 13, 41 },  { 3, 62 },
-
-    /* 70 -> 87 */
-    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
-    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
-    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
-    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
-    { -12, 115 },{ -16, 122 },
-
-    /* 88 -> 104 */
-    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
-    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
-    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
-    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
-    { -22, 125 },
-
-    /* 105 -> 135 */
-    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
-    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
-    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
-    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
-    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
-    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
-    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
-    { 14, 62 },  { -13, 108 },{ -15, 100 },
-
-    /* 136 -> 165 */
-    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
-    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
-    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
-    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
-    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
-    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
-    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
-    { 0, 62 },   { 12, 72 },
-
-    /* 166 -> 196 */
-    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
-    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
-    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
-    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
-    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
-    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
-    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
-    { 0, 89 },   { 26, -19 }, { 22, -17 },
-
-    /* 197 -> 226 */
-    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
-    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
-    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
-    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
-    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
-    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
-    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
-    { 12, 68 },  { 2, 97 },
-
-    /* 227 -> 251 */
-    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
-    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
-    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
-    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
-    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
-    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
-    { -4, 65 },
-
-    /* 252 -> 275 */
-    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
-    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
-    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
-    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
-    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
-    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
-
-    /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
-    { 0, 0 },
-
-    /* 277 -> 307 */
-    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
-    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
-    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
-    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
-    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
-    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
-    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
-    { 9, 64 },   { -12, 104 },{ -11, 97 },
-
-    /* 308 -> 337 */
-    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
-    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
-    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
-    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
-    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
-    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
-    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
-    { 5, 64 },   { 12, 70 },
-
-    /* 338 -> 368 */
-    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
-    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
-    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
-    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
-    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
-    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
-    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
-    { -12, 109 },{ 36, -35 }, { 36, -34 },
-
-    /* 369 -> 398 */
-    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
-    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
-    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
-    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
-    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
-    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
-    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
-    { 29, 39 },  { 19, 66 },
-
-    /* 399 -> 435 */
-    {  31,  21 }, {  31,  31 }, {  25,  50 },
-    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
-    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
-    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
-    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
-    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
-    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
-    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
-    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
-    {   0,  68 }, {  -9,  92 },
-
-    /* 436 -> 459 */
-    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
-    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
-    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
-    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
-    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
-
-    /* 460 -> 1024 */
-    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
-    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
-    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
-    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
-    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
-    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
-    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
-    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
-    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
-    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
-    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
-    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
-    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
-    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
-    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
-    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
-    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
-    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
-    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
-    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
-    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
-    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
-    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
-    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
-    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
-    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
-    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
-    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
-    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
-    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
-    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
-    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
-    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
-    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
-    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
-    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
-    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
-    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
-    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
-    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
-    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
-    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
-    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
-    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
-    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
-    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
-    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
-    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
-    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
-    { -23,  68 }, { -24,  50 }, { -11,  74 }, { -14, 106 },
-    { -13,  97 }, { -15,  90 }, { -12,  90 }, { -18,  88 },
-    { -10,  73 }, {  -9,  79 }, { -14,  86 }, { -10,  73 },
-    { -10,  70 }, { -10,  69 }, {  -5,  66 }, {  -9,  64 },
-    {  -5,  58 }, {   2,  59 }, {  23, -13 }, {  26, -13 },
-    {  40, -15 }, {  49, -14 }, {  44,   3 }, {  45,   6 },
-    {  44,  34 }, {  33,  54 }, {  19,  82 }, {  21, -10 },
-    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
-    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
-    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
-    {   0,  68 }, {  -9,  92 }, { -17, 120 }, { -20, 112 },
-    { -18, 114 }, { -11,  85 }, { -15,  92 }, { -14,  89 },
-    { -26,  71 }, { -15,  81 }, { -14,  80 }, {   0,  68 },
-    { -14,  70 }, { -24,  56 }, { -23,  68 }, { -24,  50 },
-    { -11,  74 }, { -14, 106 }, { -13,  97 }, { -15,  90 },
-    { -12,  90 }, { -18,  88 }, { -10,  73 }, {  -9,  79 },
-    { -14,  86 }, { -10,  73 }, { -10,  70 }, { -10,  69 },
-    {  -5,  66 }, {  -9,  64 }, {  -5,  58 }, {   2,  59 },
-    {  23, -13 }, {  26, -13 }, {  40, -15 }, {  49, -14 },
-    {  44,   3 }, {  45,   6 }, {  44,  34 }, {  33,  54 },
-    {  19,  82 }, {  21, -10 }, {  24, -11 }, {  28,  -8 },
-    {  28,  -1 }, {  29,   3 }, {  29,   9 }, {  35,  20 },
-    {  29,  36 }, {  14,  67 }, {  -3,  75 }, {  -1,  23 },
-    {   1,  34 }, {   1,  43 }, {   0,  54 }, {  -2,  55 },
-    {   0,  61 }, {   1,  64 }, {   0,  68 }, {  -9,  92 },
-    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
-    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
-    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
-    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
-    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
-    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
-    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
-    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
-    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
-    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
-    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
-    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
-    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
-    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
-    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
-    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
-    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
-    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
-    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
-    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
-    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
-    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
-    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
-    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
-    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
-    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
-    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
-    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
-    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
-    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
-    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
-    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
-    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
-    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
-    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
-    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
-    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
-    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
-    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
-    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
-    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
-    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
-    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
-    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
-    {  -3,  71 }, {  -6,  42 }, {  -5,  50 }, {  -3,  54 },
-    {  -2,  62 }, {   0,  58 }, {   1,  63 }, {  -2,  72 },
-    {  -1,  74 }, {  -9,  91 }, {  -5,  67 }, {  -5,  27 },
-    {  -3,  39 }, {  -2,  44 }, {   0,  46 }, { -16,  64 },
-    {  -8,  68 }, { -10,  78 }, {  -6,  77 }, { -10,  86 },
-    { -12,  92 }, { -15,  55 }, { -10,  60 }, {  -6,  62 },
-    {  -4,  65 }, { -12,  73 }, {  -8,  76 }, {  -7,  80 },
-    {  -9,  88 }, { -17, 110 }, {  -3,  71 }, {  -6,  42 },
-    {  -5,  50 }, {  -3,  54 }, {  -2,  62 }, {   0,  58 },
-    {   1,  63 }, {  -2,  72 }, {  -1,  74 }, {  -9,  91 },
-    {  -5,  67 }, {  -5,  27 }, {  -3,  39 }, {  -2,  44 },
-    {   0,  46 }, { -16,  64 }, {  -8,  68 }, { -10,  78 },
-    {  -6,  77 }, { -10,  86 }, { -12,  92 }, { -15,  55 },
-    { -10,  60 }, {  -6,  62 }, {  -4,  65 }, { -12,  73 },
-    {  -8,  76 }, {  -7,  80 }, {  -9,  88 }, { -17, 110 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 }
-};
-
-static const int8_t x264_cabac_context_init_PB[3][1024][2] =
-{
-    /* i_cabac_init_idc == 0 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
-        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
-        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
-        {  17,  50 },
-
-        /* 24 - 39 */
-        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
-        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
-        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
-        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
-
-        /* 40 - 53 */
-        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
-        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
-        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
-        {  -3,  81 }, {   0,  88 },
-
-        /* 54 - 59 */
-        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
-        {  -7,  72 }, {   1,  58 },
-
-        /* 60 - 69 */
-        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
-        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
-        {  13,  41 }, {   3,  62 },
-
-        /* 70 - 87 */
-        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
-        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
-        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
-        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
-        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
-        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
-        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
-        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
-        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
-
-        /* 105 -> 165 */
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
-        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
-        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
-        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
-        {   9,  69 },
-
-        /* 166 - 226 */
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
-        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
-        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
-        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
-        {  -9, 108 },
-
-        /* 227 - 275 */
-        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
-        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
-        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
-        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
-        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
-        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
-        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
-        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
-        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
-        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
-        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
-        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
-        {  -8,  85 },
-
-        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
-        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
-        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
-        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
-        {  26,  43 },
-
-        /* 338 - 398 */
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
-        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
-        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
-        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
-        {  11,  86 },
-
-        /* 399 -> 435 */
-        {  12,  40 }, {  11,  51 }, {  14,  59 },
-        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
-        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
-        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
-        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
-        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
-        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
-        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
-        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
-        {  -8,  66 }, {  -8,  76 },
-
-        /* 436 -> 459 */
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
-        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
-        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
-
-        /* 460 - 1024 */
-        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
-        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
-        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
-        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
-        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
-        { -16,  66 }, { -22,  65 }, { -20,  63 }, {  -5,  85 },
-        {  -6,  81 }, { -10,  77 }, {  -7,  81 }, { -17,  80 },
-        { -18,  73 }, {  -4,  74 }, { -10,  83 }, {  -9,  71 },
-        {  -9,  67 }, {  -1,  61 }, {  -8,  66 }, { -14,  66 },
-        {   0,  59 }, {   2,  59 }, {   9,  -2 }, {  26,  -9 },
-        {  33,  -9 }, {  39,  -7 }, {  41,  -2 }, {  45,   3 },
-        {  49,   9 }, {  45,  27 }, {  36,  59 }, {  21, -13 },
-        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
-        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
-        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
-        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
-        {  -8,  66 }, {  -8,  76 }, {  -4,  79 }, {  -7,  71 },
-        {  -5,  69 }, {  -9,  70 }, {  -8,  66 }, { -10,  68 },
-        { -19,  73 }, { -12,  69 }, { -16,  70 }, { -15,  67 },
-        { -20,  62 }, { -19,  70 }, { -16,  66 }, { -22,  65 },
-        { -20,  63 }, {  -5,  85 }, {  -6,  81 }, { -10,  77 },
-        {  -7,  81 }, { -17,  80 }, { -18,  73 }, {  -4,  74 },
-        { -10,  83 }, {  -9,  71 }, {  -9,  67 }, {  -1,  61 },
-        {  -8,  66 }, { -14,  66 }, {   0,  59 }, {   2,  59 },
-        {   9,  -2 }, {  26,  -9 }, {  33,  -9 }, {  39,  -7 },
-        {  41,  -2 }, {  45,   3 }, {  49,   9 }, {  45,  27 },
-        {  36,  59 }, {  21, -13 }, {  33, -14 }, {  39,  -7 },
-        {  46,  -2 }, {  51,   2 }, {  60,   6 }, {  61,  17 },
-        {  55,  34 }, {  42,  62 }, {  -6,  66 }, {  -7,  35 },
-        {  -7,  42 }, {  -8,  45 }, {  -5,  48 }, { -12,  56 },
-        {  -6,  60 }, {  -5,  62 }, {  -8,  66 }, {  -8,  76 },
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
-        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
-        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
-        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
-        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
-        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
-        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
-        {  -3,  74 }, { -10,  90 }, {  -6,  76 }, {  -2,  44 },
-        {   0,  45 }, {   0,  52 }, {  -3,  64 }, {  -2,  59 },
-        {  -4,  70 }, {  -4,  75 }, {  -8,  82 }, { -17, 102 },
-        {  -9,  77 }, {   3,  24 }, {   0,  42 }, {   0,  48 },
-        {   0,  55 }, {  -6,  59 }, {  -7,  71 }, { -12,  83 },
-        { -11,  87 }, { -30, 119 }, {   1,  58 }, {  -3,  29 },
-        {  -1,  36 }, {   1,  38 }, {   2,  43 }, {  -6,  55 },
-        {   0,  58 }, {   0,  64 }, {  -3,  74 }, { -10,  90 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 }
-    },
-
-    /* i_cabac_init_idc == 1 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
-        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
-        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
-        {  10,  54 },
-
-        /* 24 - 39 */
-        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
-        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
-        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
-        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
-
-        /* 40 - 53 */
-        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
-        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
-        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
-        {  -7,  86 },{  -5,  95 },
-
-        /* 54 - 59 */
-        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
-        {  -5,  72 },{   0,  61 },
-
-        /* 60 - 69 */
-        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-        { 13, 41 },  { 3, 62 },
-
-        /* 70 - 104 */
-        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
-        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
-        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
-        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
-        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
-        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
-        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
-        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
-        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
-
-        /* 105 -> 165 */
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
-        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
-        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
-        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
-        {   0,  89 },
-
-        /* 166 - 226 */
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
-        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
-        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
-        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
-        { -10, 116 },
-
-        /* 227 - 275 */
-        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
-        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
-        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
-        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
-        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
-        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
-        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
-        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
-        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
-        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
-        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
-        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
-        {  -4,  78 },
-
-        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
-        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
-        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
-        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
-        {  18,  50 },
-
-        /* 338 - 398 */
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
-        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
-        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
-        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
-        {  11,  83 },
-
-        /* 399 -> 435 */
-        {  25,  32 }, {  21,  49 }, {  21,  54 },
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
-        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
-        {  -4,  67 }, {  -7,  82 },
-
-        /* 436 -> 459 */
-        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
-        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
-        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
-        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-
-        /* 460 - 1024 */
-        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
-        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
-        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  -3,  81 },
-        {  -3,  76 }, {  -7,  72 }, {  -6,  78 }, { -12,  72 },
-        { -14,  68 }, {  -3,  70 }, {  -6,  76 }, {  -5,  66 },
-        {  -5,  62 }, {   0,  57 }, {  -4,  61 }, {  -9,  60 },
-        {   1,  54 }, {   2,  58 }, {  17, -10 }, {  32, -13 },
-        {  42,  -9 }, {  49,  -5 }, {  53,   0 }, {  64,   3 },
-        {  68,  10 }, {  66,  27 }, {  47,  57 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
-        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
-        {  -4,  67 }, {  -7,  82 }, {  -5,  85 }, {  -6,  81 },
-        { -10,  77 }, {  -7,  81 }, { -17,  80 }, { -18,  73 },
-        {  -4,  74 }, { -10,  83 }, {  -9,  71 }, {  -9,  67 },
-        {  -1,  61 }, {  -8,  66 }, { -14,  66 }, {   0,  59 },
-        {   2,  59 }, {  -3,  81 }, {  -3,  76 }, {  -7,  72 },
-        {  -6,  78 }, { -12,  72 }, { -14,  68 }, {  -3,  70 },
-        {  -6,  76 }, {  -5,  66 }, {  -5,  62 }, {   0,  57 },
-        {  -4,  61 }, {  -9,  60 }, {   1,  54 }, {   2,  58 },
-        {  17, -10 }, {  32, -13 }, {  42,  -9 }, {  49,  -5 },
-        {  53,   0 }, {  64,   3 }, {  68,  10 }, {  66,  27 },
-        {  47,  57 }, {  17, -10 }, {  32, -13 }, {  42,  -9 },
-        {  49,  -5 }, {  53,   0 }, {  64,   3 }, {  68,  10 },
-        {  66,  27 }, {  47,  57 }, {  -5,  71 }, {   0,  24 },
-        {  -1,  36 }, {  -2,  42 }, {  -2,  52 }, {  -9,  57 },
-        {  -6,  63 }, {  -4,  65 }, {  -4,  67 }, {  -7,  82 },
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
-        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
-        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
-        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
-        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
-        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
-        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
-        {  -5,  74 }, {  -9,  86 }, { -23, 112 }, { -15,  71 },
-        {  -7,  61 }, {   0,  53 }, {  -5,  66 }, { -11,  77 },
-        {  -9,  80 }, {  -9,  84 }, { -10,  87 }, { -34, 127 },
-        { -21, 101 }, {  -3,  39 }, {  -5,  53 }, {  -7,  61 },
-        { -11,  75 }, { -15,  77 }, { -17,  91 }, { -25, 107 },
-        { -25, 111 }, { -28, 122 }, { -11,  76 }, { -10,  44 },
-        { -10,  52 }, { -10,  57 }, {  -9,  58 }, { -16,  72 },
-        {  -7,  69 }, {  -4,  69 }, {  -5,  74 }, {  -9,  86 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 }
-    },
-
-    /* i_cabac_init_idc == 2 */
-    {
-        /* 0 - 10 */
-        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
-        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
-        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
-
-        /* 11 - 23 */
-        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
-        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
-        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
-        {  14,  57 },
-
-        /* 24 - 39 */
-        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
-        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
-        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
-        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
-
-        /* 40 - 53 */
-        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
-        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
-        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
-        {  -3,  90 },{  -1,  101 },
-
-        /* 54 - 59 */
-        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
-        {  -7,  50 },{   1,  60 },
-
-        /* 60 - 69 */
-        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
-        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
-        { 13, 41 },  { 3, 62 },
-
-        /* 70 - 104 */
-        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
-        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
-        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
-        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
-        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
-        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
-        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
-        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
-        {   3,  68 }, {  -8,  71 }, { -13,  98 },
-
-        /* 105 -> 165 */
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
-        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
-        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
-        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
-        { -22, 127 },
-
-        /* 166 - 226 */
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
-        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
-        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
-        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
-        { -24, 127 },
-
-        /* 227 - 275 */
-        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
-        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
-        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
-        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
-        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
-        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
-        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
-        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
-        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
-        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
-        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
-        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
-        { -10,  87 },
-
-        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
-        { 0, 0 },
-
-        /* 277 - 337 */
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
-        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
-        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
-        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
-        {  25,  42 },
-
-        /* 338 - 398 */
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
-        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
-        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
-        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
-        {  25,  61 },
-
-        /* 399 -> 435 */
-        {  21,  33 }, {  19,  50 }, {  17,  61 },
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
-        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
-        {  -6,  68 }, { -10,  79 },
-
-        /* 436 -> 459 */
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-
-        /* 460 - 1024 */
-        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
-        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
-        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {  -3,  78 },
-        {  -8,  74 }, {  -9,  72 }, { -10,  72 }, { -18,  75 },
-        { -12,  71 }, { -11,  63 }, {  -5,  70 }, { -17,  75 },
-        { -14,  72 }, { -16,  67 }, {  -8,  53 }, { -14,  59 },
-        {  -9,  52 }, { -11,  68 }, {   9,  -2 }, {  30, -10 },
-        {  31,  -4 }, {  33,  -1 }, {  33,   7 }, {  31,  12 },
-        {  37,  23 }, {  31,  38 }, {  20,  64 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
-        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
-        {  -6,  68 }, { -10,  79 }, {  -3,  78 }, {  -8,  74 },
-        {  -9,  72 }, { -10,  72 }, { -18,  75 }, { -12,  71 },
-        { -11,  63 }, {  -5,  70 }, { -17,  75 }, { -14,  72 },
-        { -16,  67 }, {  -8,  53 }, { -14,  59 }, {  -9,  52 },
-        { -11,  68 }, {  -3,  78 }, {  -8,  74 }, {  -9,  72 },
-        { -10,  72 }, { -18,  75 }, { -12,  71 }, { -11,  63 },
-        {  -5,  70 }, { -17,  75 }, { -14,  72 }, { -16,  67 },
-        {  -8,  53 }, { -14,  59 }, {  -9,  52 }, { -11,  68 },
-        {   9,  -2 }, {  30, -10 }, {  31,  -4 }, {  33,  -1 },
-        {  33,   7 }, {  31,  12 }, {  37,  23 }, {  31,  38 },
-        {  20,  64 }, {   9,  -2 }, {  30, -10 }, {  31,  -4 },
-        {  33,  -1 }, {  33,   7 }, {  31,  12 }, {  37,  23 },
-        {  31,  38 }, {  20,  64 }, {  -9,  71 }, {  -7,  37 },
-        {  -8,  44 }, { -11,  49 }, { -10,  56 }, { -12,  59 },
-        {  -8,  63 }, {  -9,  67 }, {  -6,  68 }, { -10,  79 },
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
-        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
-        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
-        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
-        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
-        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
-        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
-        { -12,  92 }, { -18, 108 }, { -24, 115 }, { -22,  82 },
-        {  -9,  62 }, {   0,  53 }, {   0,  59 }, { -14,  85 },
-        { -13,  89 }, { -13,  94 }, { -11,  92 }, { -29, 127 },
-        { -21, 100 }, { -14,  57 }, { -12,  67 }, { -11,  71 },
-        { -10,  77 }, { -21,  85 }, { -16,  88 }, { -23, 104 },
-        { -15,  98 }, { -37, 127 }, { -10,  82 }, {  -8,  48 },
-        {  -8,  61 }, {  -8,  66 }, {  -7,  70 }, { -14,  75 },
-        { -10,  79 }, {  -9,  83 }, { -12,  92 }, { -18, 108 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 }
-    }
-};
-
-const uint8_t x264_cabac_range_lps[64][4] =
-{
-    {  2,   2,   2,   2}, {  6,   7,   8,   9}, {  6,   7,   9,  10}, {  6,   8,   9,  11},
-    {  7,   8,  10,  11}, {  7,   9,  10,  12}, {  7,   9,  11,  12}, {  8,   9,  11,  13},
-    {  8,  10,  12,  14}, {  9,  11,  12,  14}, {  9,  11,  13,  15}, { 10,  12,  14,  16},
-    { 10,  12,  15,  17}, { 11,  13,  15,  18}, { 11,  14,  16,  19}, { 12,  14,  17,  20},
-    { 12,  15,  18,  21}, { 13,  16,  19,  22}, { 14,  17,  20,  23}, { 14,  18,  21,  24},
-    { 15,  19,  22,  25}, { 16,  20,  23,  27}, { 17,  21,  25,  28}, { 18,  22,  26,  30},
-    { 19,  23,  27,  31}, { 20,  24,  29,  33}, { 21,  26,  30,  35}, { 22,  27,  32,  37},
-    { 23,  28,  33,  39}, { 24,  30,  35,  41}, { 26,  31,  37,  43}, { 27,  33,  39,  45},
-    { 29,  35,  41,  48}, { 30,  37,  43,  50}, { 32,  39,  46,  53}, { 33,  41,  48,  56},
-    { 35,  43,  51,  59}, { 37,  45,  54,  62}, { 39,  48,  56,  65}, { 41,  50,  59,  69},
-    { 43,  53,  63,  72}, { 46,  56,  66,  76}, { 48,  59,  69,  80}, { 51,  62,  73,  85},
-    { 53,  65,  77,  89}, { 56,  69,  81,  94}, { 59,  72,  86,  99}, { 62,  76,  90, 104},
-    { 66,  80,  95, 110}, { 69,  85, 100, 116}, { 73,  89, 105, 122}, { 77,  94, 111, 128},
-    { 81,  99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158},
-    {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195},
-    {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240}
-};
-
-const uint8_t x264_cabac_transition[128][2] =
-{
-    {  0,   0}, {  1,   1}, {  2,  50}, { 51,   3}, {  2,  50}, { 51,   3}, {  4,  52}, { 53,   5},
-    {  6,  52}, { 53,   7}, {  8,  52}, { 53,   9}, { 10,  54}, { 55,  11}, { 12,  54}, { 55,  13},
-    { 14,  54}, { 55,  15}, { 16,  56}, { 57,  17}, { 18,  56}, { 57,  19}, { 20,  56}, { 57,  21},
-    { 22,  58}, { 59,  23}, { 24,  58}, { 59,  25}, { 26,  60}, { 61,  27}, { 28,  60}, { 61,  29},
-    { 30,  60}, { 61,  31}, { 32,  62}, { 63,  33}, { 34,  62}, { 63,  35}, { 36,  64}, { 65,  37},
-    { 38,  66}, { 67,  39}, { 40,  66}, { 67,  41}, { 42,  66}, { 67,  43}, { 44,  68}, { 69,  45},
-    { 46,  68}, { 69,  47}, { 48,  70}, { 71,  49}, { 50,  72}, { 73,  51}, { 52,  72}, { 73,  53},
-    { 54,  74}, { 75,  55}, { 56,  74}, { 75,  57}, { 58,  76}, { 77,  59}, { 60,  78}, { 79,  61},
-    { 62,  78}, { 79,  63}, { 64,  80}, { 81,  65}, { 66,  82}, { 83,  67}, { 68,  82}, { 83,  69},
-    { 70,  84}, { 85,  71}, { 72,  84}, { 85,  73}, { 74,  88}, { 89,  75}, { 76,  88}, { 89,  77},
-    { 78,  90}, { 91,  79}, { 80,  90}, { 91,  81}, { 82,  94}, { 95,  83}, { 84,  94}, { 95,  85},
-    { 86,  96}, { 97,  87}, { 88,  96}, { 97,  89}, { 90, 100}, {101,  91}, { 92, 100}, {101,  93},
-    { 94, 102}, {103,  95}, { 96, 104}, {105,  97}, { 98, 104}, {105,  99}, {100, 108}, {109, 101},
-    {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109},
-    {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117},
-    {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
-};
-
-const uint8_t x264_cabac_renorm_shift[64] =
-{
-    6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-};
-
-/* -ln2(probability) */
-const uint16_t x264_cabac_entropy[128] =
-{
-    FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618),
-    FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114),
-    FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610),
-    FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106),
-    FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602),
-    FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099),
-    FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595),
-    FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091),
-    FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588),
-    FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083),
-    FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580),
-    FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076),
-    FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572),
-    FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068),
-    FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565),
-    FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061),
-    FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557),
-    FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053),
-    FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549),
-    FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046),
-    FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542),
-    FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038),
-    FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534),
-    FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030),
-    FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527),
-    FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023),
-    FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519),
-    FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015),
-    FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511),
-    FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008),
-    FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504),
-    FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
-};
-
-uint8_t x264_cabac_contexts[4][QP_MAX_SPEC+1][1024];
-
-void x264_cabac_init( x264_t *h )
-{
-    int ctx_count = CHROMA444 ? 1024 : 460;
-    for( int i = 0; i < 4; i++ )
-    {
-        const int8_t (*cabac_context_init)[1024][2] = i == 0 ? &x264_cabac_context_init_I
-                                                             : &x264_cabac_context_init_PB[i-1];
-        for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
-            for( int j = 0; j < ctx_count; j++ )
-            {
-                int state = x264_clip3( (((*cabac_context_init)[j][0] * qp) >> 4) + (*cabac_context_init)[j][1], 1, 126 );
-                x264_cabac_contexts[i][qp][j] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
-            }
-    }
-}
-
-/*****************************************************************************
- *
- *****************************************************************************/
-void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
-{
-    memcpy( cb->state, x264_cabac_contexts[i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1][i_qp], CHROMA444 ? 1024 : 460 );
-}
-
-void x264_cabac_encode_init_core( x264_cabac_t *cb )
-{
-    cb->i_low   = 0;
-    cb->i_range = 0x01FE;
-    cb->i_queue = -9; // the first bit will be shifted away and not written
-    cb->i_bytes_outstanding = 0;
-}
-
-void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
-{
-    x264_cabac_encode_init_core( cb );
-    cb->p_start = p_data;
-    cb->p       = p_data;
-    cb->p_end   = p_end;
-}
-
-static inline void x264_cabac_putbyte( x264_cabac_t *cb )
-{
-    if( cb->i_queue >= 0 )
-    {
-        int out = cb->i_low >> (cb->i_queue+10);
-        cb->i_low &= (0x400<<cb->i_queue)-1;
-        cb->i_queue -= 8;
-
-        if( (out & 0xff) == 0xff )
-            cb->i_bytes_outstanding++;
-        else
-        {
-            int carry = out >> 8;
-            int bytes_outstanding = cb->i_bytes_outstanding;
-            // this can't modify before the beginning of the stream because
-            // that would correspond to a probability > 1.
-            // it will write before the beginning of the stream, which is ok
-            // because a slice header always comes before cabac data.
-            // this can't carry beyond the one byte, because any 0xff bytes
-            // are in bytes_outstanding and thus not written yet.
-            cb->p[-1] += carry;
-            while( bytes_outstanding > 0 )
-            {
-                *(cb->p++) = carry-1;
-                bytes_outstanding--;
-            }
-            *(cb->p++) = out;
-            cb->i_bytes_outstanding = 0;
-        }
-    }
-}
-
-static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
-{
-    int shift = x264_cabac_renorm_shift[cb->i_range>>3];
-    cb->i_range <<= shift;
-    cb->i_low   <<= shift;
-    cb->i_queue  += shift;
-    x264_cabac_putbyte( cb );
-}
-
-/* Making custom versions of this function, even in asm, for the cases where
- * b is known to be 0 or 1, proved to be somewhat useful on x86_32 with GCC 3.4
- * but nearly useless with GCC 4.3 and worse than useless on x86_64. */
-void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
-{
-    int i_state = cb->state[i_ctx];
-    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
-    cb->i_range -= i_range_lps;
-    if( b != (i_state & 1) )
-    {
-        cb->i_low += cb->i_range;
-        cb->i_range = i_range_lps;
-    }
-    cb->state[i_ctx] = x264_cabac_transition[i_state][b];
-    x264_cabac_encode_renorm( cb );
-}
-
-/* Note: b is negated for this function */
-void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
-{
-    cb->i_low <<= 1;
-    cb->i_low += b & cb->i_range;
-    cb->i_queue += 1;
-    x264_cabac_putbyte( cb );
-}
-
-static const int bypass_lut[16] =
-{
-    -1,      0x2,     0x14,     0x68,     0x1d0,     0x7a0,     0x1f40,     0x7e80,
-    0x1fd00, 0x7fa00, 0x1ff400, 0x7fe800, 0x1ffd000, 0x7ffa000, 0x1fff4000, 0x7ffe8000
-};
-
-void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
-{
-    uint32_t v = val + (1<<exp_bits);
-    int k = 31 - x264_clz( v );
-    uint32_t x = (bypass_lut[k-exp_bits]<<exp_bits) + v;
-    k = 2*k+1-exp_bits;
-    int i = ((k-1)&7)+1;
-    do {
-        k -= i;
-        cb->i_low <<= i;
-        cb->i_low += ((x>>k)&0xff) * cb->i_range;
-        cb->i_queue += i;
-        x264_cabac_putbyte( cb );
-        i = 8;
-    } while( k > 0 );
-}
-
-void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
-{
-    cb->i_range -= 2;
-    x264_cabac_encode_renorm( cb );
-}
-
-void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
-{
-    cb->i_low += cb->i_range - 2;
-    cb->i_low |= 1;
-    cb->i_low <<= 9;
-    cb->i_queue += 9;
-    x264_cabac_putbyte( cb );
-    x264_cabac_putbyte( cb );
-    cb->i_low <<= -cb->i_queue;
-    cb->i_low |= (0x35a4e4f5 >> (h->i_frame & 31) & 1) << 10;
-    cb->i_queue = 0;
-    x264_cabac_putbyte( cb );
-
-    while( cb->i_bytes_outstanding > 0 )
-    {
-        *(cb->p++) = 0xff;
-        cb->i_bytes_outstanding--;
-    }
-}
-
diff --git a/android/src/main/libenc/jni/libx264/common/cabac.h b/android/src/main/libenc/jni/libx264/common/cabac.h
deleted file mode 100755
index cf07254..0000000
--- a/android/src/main/libenc/jni/libx264/common/cabac.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*****************************************************************************
- * cabac.h: arithmetic coder
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_CABAC_H
-#define X264_CABAC_H
-
-typedef struct
-{
-    /* state */
-    int i_low;
-    int i_range;
-
-    /* bit stream */
-    int i_queue; //stored with an offset of -8 for faster asm
-    int i_bytes_outstanding;
-
-    uint8_t *p_start;
-    uint8_t *p;
-    uint8_t *p_end;
-
-    /* aligned for memcpy_aligned starting here */
-    ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
-
-    /* context */
-    uint8_t state[1024];
-
-    /* for 16-byte alignment */
-    uint8_t padding[12];
-} x264_cabac_t;
-
-extern const uint8_t x264_cabac_transition[128][2];
-extern const uint16_t x264_cabac_entropy[128];
-
-/* init the contexts given i_slice_type, the quantif and the model */
-void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
-
-void x264_cabac_encode_init_core( x264_cabac_t *cb );
-void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
-void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
-void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
-void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
-void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
-void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
-void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
-void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
-void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
-
-#if HAVE_MMX
-#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
-#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
-#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
-#elif defined(ARCH_AARCH64)
-#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
-#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
-#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
-#else
-#define x264_cabac_encode_decision x264_cabac_encode_decision_c
-#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
-#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
-#endif
-#define x264_cabac_encode_decision_noup x264_cabac_encode_decision
-
-static ALWAYS_INLINE int x264_cabac_pos( x264_cabac_t *cb )
-{
-    return (cb->p - cb->p_start + cb->i_bytes_outstanding) * 8 + cb->i_queue;
-}
-
-/* internal only. these don't write the bitstream, just calculate bit cost: */
-
-static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx, long b )
-{
-    int i_state = cb->state[i_ctx];
-    cb->state[i_ctx] = x264_cabac_transition[i_state][b];
-    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
-}
-
-static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
-{
-    int i_state = *state;
-    *state = x264_cabac_transition[i_state][b];
-    return x264_cabac_entropy[i_state^b];
-}
-
-static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
-{
-    int i_state = cb->state[i_ctx];
-    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
-}
-
-static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
-{
-    return x264_cabac_entropy[*state^b];
-}
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/common.c b/android/src/main/libenc/jni/libx264/common/common.c
deleted file mode 100755
index 029f7b3..0000000
--- a/android/src/main/libenc/jni/libx264/common/common.c
+++ /dev/null
@@ -1,1444 +0,0 @@
-/*****************************************************************************
- * common.c: misc common functions
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#include <ctype.h>
-
-#if HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-#if HAVE_THP
-#include <sys/mman.h>
-#endif
-
-const int x264_bit_depth = BIT_DEPTH;
-
-const int x264_chroma_format = X264_CHROMA_FORMAT;
-
-static void x264_log_default( void *, int, const char *, va_list );
-
-/****************************************************************************
- * x264_param_default:
- ****************************************************************************/
-void x264_param_default( x264_param_t *param )
-{
-    /* */
-    memset( param, 0, sizeof( x264_param_t ) );
-
-    /* CPU autodetect */
-    param->cpu = x264_cpu_detect();
-    param->i_threads = X264_THREADS_AUTO;
-    param->i_lookahead_threads = X264_THREADS_AUTO;
-    param->b_deterministic = 1;
-    param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
-
-    /* Video properties */
-    param->i_csp           = X264_CHROMA_FORMAT ? X264_CHROMA_FORMAT : X264_CSP_I420;
-    param->i_width         = 0;
-    param->i_height        = 0;
-    param->vui.i_sar_width = 0;
-    param->vui.i_sar_height= 0;
-    param->vui.i_overscan  = 0;  /* undef */
-    param->vui.i_vidformat = 5;  /* undef */
-    param->vui.b_fullrange = -1; /* default depends on input */
-    param->vui.i_colorprim = 2;  /* undef */
-    param->vui.i_transfer  = 2;  /* undef */
-    param->vui.i_colmatrix = -1; /* default depends on input */
-    param->vui.i_chroma_loc= 0;  /* left center */
-    param->i_fps_num       = 25;
-    param->i_fps_den       = 1;
-    param->i_level_idc     = -1;
-    param->i_slice_max_size = 0;
-    param->i_slice_max_mbs = 0;
-    param->i_slice_count = 0;
-
-    /* Encoder parameters */
-    param->i_frame_reference = 3;
-    param->i_keyint_max = 250;
-    param->i_keyint_min = X264_KEYINT_MIN_AUTO;
-    param->i_bframe = 3;
-    param->i_scenecut_threshold = 40;
-    param->i_bframe_adaptive = X264_B_ADAPT_FAST;
-    param->i_bframe_bias = 0;
-    param->i_bframe_pyramid = X264_B_PYRAMID_NORMAL;
-    param->b_interlaced = 0;
-    param->b_constrained_intra = 0;
-
-    param->b_deblocking_filter = 1;
-    param->i_deblocking_filter_alphac0 = 0;
-    param->i_deblocking_filter_beta = 0;
-
-    param->b_cabac = 1;
-    param->i_cabac_init_idc = 0;
-
-    param->rc.i_rc_method = X264_RC_CRF;
-    param->rc.i_bitrate = 0;
-    param->rc.f_rate_tolerance = 1.0;
-    param->rc.i_vbv_max_bitrate = 0;
-    param->rc.i_vbv_buffer_size = 0;
-    param->rc.f_vbv_buffer_init = 0.9;
-    param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
-    param->rc.f_rf_constant = 23;
-    param->rc.i_qp_min = 0;
-    param->rc.i_qp_max = QP_MAX;
-    param->rc.i_qp_step = 4;
-    param->rc.f_ip_factor = 1.4;
-    param->rc.f_pb_factor = 1.3;
-    param->rc.i_aq_mode = X264_AQ_VARIANCE;
-    param->rc.f_aq_strength = 1.0;
-    param->rc.i_lookahead = 40;
-
-    param->rc.b_stat_write = 0;
-    param->rc.psz_stat_out = "x264_2pass.log";
-    param->rc.b_stat_read = 0;
-    param->rc.psz_stat_in = "x264_2pass.log";
-    param->rc.f_qcompress = 0.6;
-    param->rc.f_qblur = 0.5;
-    param->rc.f_complexity_blur = 20;
-    param->rc.i_zones = 0;
-    param->rc.b_mb_tree = 1;
-
-    /* Log */
-    param->pf_log = x264_log_default;
-    param->p_log_private = NULL;
-    param->i_log_level = X264_LOG_INFO;
-
-    /* */
-    param->analyse.intra = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8;
-    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8
-                         | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
-    param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
-    param->analyse.i_me_method = X264_ME_HEX;
-    param->analyse.f_psy_rd = 1.0;
-    param->analyse.b_psy = 1;
-    param->analyse.f_psy_trellis = 0;
-    param->analyse.i_me_range = 16;
-    param->analyse.i_subpel_refine = 7;
-    param->analyse.b_mixed_references = 1;
-    param->analyse.b_chroma_me = 1;
-    param->analyse.i_mv_range_thread = -1;
-    param->analyse.i_mv_range = -1; // set from level_idc
-    param->analyse.i_chroma_qp_offset = 0;
-    param->analyse.b_fast_pskip = 1;
-    param->analyse.b_weighted_bipred = 1;
-    param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
-    param->analyse.b_dct_decimate = 1;
-    param->analyse.b_transform_8x8 = 1;
-    param->analyse.i_trellis = 1;
-    param->analyse.i_luma_deadzone[0] = 21;
-    param->analyse.i_luma_deadzone[1] = 11;
-    param->analyse.b_psnr = 0;
-    param->analyse.b_ssim = 0;
-
-    param->i_cqm_preset = X264_CQM_FLAT;
-    memset( param->cqm_4iy, 16, sizeof( param->cqm_4iy ) );
-    memset( param->cqm_4py, 16, sizeof( param->cqm_4py ) );
-    memset( param->cqm_4ic, 16, sizeof( param->cqm_4ic ) );
-    memset( param->cqm_4pc, 16, sizeof( param->cqm_4pc ) );
-    memset( param->cqm_8iy, 16, sizeof( param->cqm_8iy ) );
-    memset( param->cqm_8py, 16, sizeof( param->cqm_8py ) );
-    memset( param->cqm_8ic, 16, sizeof( param->cqm_8ic ) );
-    memset( param->cqm_8pc, 16, sizeof( param->cqm_8pc ) );
-
-    param->b_repeat_headers = 1;
-    param->b_annexb = 1;
-    param->b_aud = 0;
-    param->b_vfr_input = 1;
-    param->i_nal_hrd = X264_NAL_HRD_NONE;
-    param->b_tff = 1;
-    param->b_pic_struct = 0;
-    param->b_fake_interlaced = 0;
-    param->i_frame_packing = -1;
-    param->b_opencl = 0;
-    param->i_opencl_device = 0;
-    param->opencl_device_id = NULL;
-    param->psz_clbin_file = NULL;
-}
-
-static int x264_param_apply_preset( x264_param_t *param, const char *preset )
-{
-    char *end;
-    int i = strtol( preset, &end, 10 );
-    if( *end == 0 && i >= 0 && i < sizeof(x264_preset_names)/sizeof(*x264_preset_names)-1 )
-        preset = x264_preset_names[i];
-
-    if( !strcasecmp( preset, "ultrafast" ) )
-    {
-        param->i_frame_reference = 1;
-        param->i_scenecut_threshold = 0;
-        param->b_deblocking_filter = 0;
-        param->b_cabac = 0;
-        param->i_bframe = 0;
-        param->analyse.intra = 0;
-        param->analyse.inter = 0;
-        param->analyse.b_transform_8x8 = 0;
-        param->analyse.i_me_method = X264_ME_DIA;
-        param->analyse.i_subpel_refine = 0;
-        param->rc.i_aq_mode = 0;
-        param->analyse.b_mixed_references = 0;
-        param->analyse.i_trellis = 0;
-        param->i_bframe_adaptive = X264_B_ADAPT_NONE;
-        param->rc.b_mb_tree = 0;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-        param->analyse.b_weighted_bipred = 0;
-        param->rc.i_lookahead = 0;
-    }
-    else if( !strcasecmp( preset, "superfast" ) )
-    {
-        param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
-        param->analyse.i_me_method = X264_ME_DIA;
-        param->analyse.i_subpel_refine = 1;
-        param->i_frame_reference = 1;
-        param->analyse.b_mixed_references = 0;
-        param->analyse.i_trellis = 0;
-        param->rc.b_mb_tree = 0;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
-        param->rc.i_lookahead = 0;
-    }
-    else if( !strcasecmp( preset, "veryfast" ) )
-    {
-        param->analyse.i_subpel_refine = 2;
-        param->i_frame_reference = 1;
-        param->analyse.b_mixed_references = 0;
-        param->analyse.i_trellis = 0;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
-        param->rc.i_lookahead = 10;
-    }
-    else if( !strcasecmp( preset, "faster" ) )
-    {
-        param->analyse.b_mixed_references = 0;
-        param->i_frame_reference = 2;
-        param->analyse.i_subpel_refine = 4;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
-        param->rc.i_lookahead = 20;
-    }
-    else if( !strcasecmp( preset, "fast" ) )
-    {
-        param->i_frame_reference = 2;
-        param->analyse.i_subpel_refine = 6;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE;
-        param->rc.i_lookahead = 30;
-    }
-    else if( !strcasecmp( preset, "medium" ) )
-    {
-        /* Default is medium */
-    }
-    else if( !strcasecmp( preset, "slow" ) )
-    {
-        param->analyse.i_subpel_refine = 8;
-        param->i_frame_reference = 5;
-        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-        param->analyse.i_trellis = 2;
-        param->rc.i_lookahead = 50;
-    }
-    else if( !strcasecmp( preset, "slower" ) )
-    {
-        param->analyse.i_me_method = X264_ME_UMH;
-        param->analyse.i_subpel_refine = 9;
-        param->i_frame_reference = 8;
-        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-        param->analyse.i_trellis = 2;
-        param->rc.i_lookahead = 60;
-    }
-    else if( !strcasecmp( preset, "veryslow" ) )
-    {
-        param->analyse.i_me_method = X264_ME_UMH;
-        param->analyse.i_subpel_refine = 10;
-        param->analyse.i_me_range = 24;
-        param->i_frame_reference = 16;
-        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-        param->analyse.i_trellis = 2;
-        param->i_bframe = 8;
-        param->rc.i_lookahead = 60;
-    }
-    else if( !strcasecmp( preset, "placebo" ) )
-    {
-        param->analyse.i_me_method = X264_ME_TESA;
-        param->analyse.i_subpel_refine = 11;
-        param->analyse.i_me_range = 24;
-        param->i_frame_reference = 16;
-        param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
-        param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
-        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-        param->analyse.b_fast_pskip = 0;
-        param->analyse.i_trellis = 2;
-        param->i_bframe = 16;
-        param->rc.i_lookahead = 60;
-    }
-    else
-    {
-        x264_log( NULL, X264_LOG_ERROR, "invalid preset '%s'\n", preset );
-        return -1;
-    }
-    return 0;
-}
-
-static int x264_param_apply_tune( x264_param_t *param, const char *tune )
-{
-    char *tmp = x264_malloc( strlen( tune ) + 1 );
-    if( !tmp )
-        return -1;
-    tmp = strcpy( tmp, tune );
-    char *s = strtok( tmp, ",./-+" );
-    int psy_tuning_used = 0;
-    while( s )
-    {
-        if( !strncasecmp( s, "film", 4 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->i_deblocking_filter_alphac0 = -1;
-            param->i_deblocking_filter_beta = -1;
-            param->analyse.f_psy_trellis = 0.15;
-        }
-        else if( !strncasecmp( s, "animation", 9 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
-            param->i_deblocking_filter_alphac0 = 1;
-            param->i_deblocking_filter_beta = 1;
-            param->analyse.f_psy_rd = 0.4;
-            param->rc.f_aq_strength = 0.6;
-            param->i_bframe += 2;
-        }
-        else if( !strncasecmp( s, "grain", 5 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->i_deblocking_filter_alphac0 = -2;
-            param->i_deblocking_filter_beta = -2;
-            param->analyse.f_psy_trellis = 0.25;
-            param->analyse.b_dct_decimate = 0;
-            param->rc.f_pb_factor = 1.1;
-            param->rc.f_ip_factor = 1.1;
-            param->rc.f_aq_strength = 0.5;
-            param->analyse.i_luma_deadzone[0] = 6;
-            param->analyse.i_luma_deadzone[1] = 6;
-            param->rc.f_qcompress = 0.8;
-        }
-        else if( !strncasecmp( s, "stillimage", 10 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->i_deblocking_filter_alphac0 = -3;
-            param->i_deblocking_filter_beta = -3;
-            param->analyse.f_psy_rd = 2.0;
-            param->analyse.f_psy_trellis = 0.7;
-            param->rc.f_aq_strength = 1.2;
-        }
-        else if( !strncasecmp( s, "psnr", 4 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->rc.i_aq_mode = X264_AQ_NONE;
-            param->analyse.b_psy = 0;
-        }
-        else if( !strncasecmp( s, "ssim", 4 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
-            param->analyse.b_psy = 0;
-        }
-        else if( !strncasecmp( s, "fastdecode", 10 ) )
-        {
-            param->b_deblocking_filter = 0;
-            param->b_cabac = 0;
-            param->analyse.b_weighted_bipred = 0;
-            param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-        }
-        else if( !strncasecmp( s, "zerolatency", 11 ) )
-        {
-            param->rc.i_lookahead = 0;
-            param->i_sync_lookahead = 0;
-            param->i_bframe = 0;
-            param->b_sliced_threads = 1;
-            param->b_vfr_input = 0;
-            param->rc.b_mb_tree = 0;
-        }
-        else if( !strncasecmp( s, "touhou", 6 ) )
-        {
-            if( psy_tuning_used++ ) goto psy_failure;
-            param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
-            param->i_deblocking_filter_alphac0 = -1;
-            param->i_deblocking_filter_beta = -1;
-            param->analyse.f_psy_trellis = 0.2;
-            param->rc.f_aq_strength = 1.3;
-            if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
-                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
-        }
-        else
-        {
-            x264_log( NULL, X264_LOG_ERROR, "invalid tune '%s'\n", s );
-            x264_free( tmp );
-            return -1;
-        }
-        if( 0 )
-        {
-    psy_failure:
-            x264_log( NULL, X264_LOG_WARNING, "only 1 psy tuning can be used: ignoring tune %s\n", s );
-        }
-        s = strtok( NULL, ",./-+" );
-    }
-    x264_free( tmp );
-    return 0;
-}
-
-int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune )
-{
-    x264_param_default( param );
-
-    if( preset && x264_param_apply_preset( param, preset ) < 0 )
-        return -1;
-    if( tune && x264_param_apply_tune( param, tune ) < 0 )
-        return -1;
-    return 0;
-}
-
-void x264_param_apply_fastfirstpass( x264_param_t *param )
-{
-    /* Set faster options in case of turbo firstpass. */
-    if( param->rc.b_stat_write && !param->rc.b_stat_read )
-    {
-        param->i_frame_reference = 1;
-        param->analyse.b_transform_8x8 = 0;
-        param->analyse.inter = 0;
-        param->analyse.i_me_method = X264_ME_DIA;
-        param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
-        param->analyse.i_trellis = 0;
-        param->analyse.b_fast_pskip = 1;
-    }
-}
-
-static int profile_string_to_int( const char *str )
-{
-    if( !strcasecmp( str, "baseline" ) )
-        return PROFILE_BASELINE;
-    if( !strcasecmp( str, "main" ) )
-        return PROFILE_MAIN;
-    if( !strcasecmp( str, "high" ) )
-        return PROFILE_HIGH;
-    if( !strcasecmp( str, "high10" ) )
-        return PROFILE_HIGH10;
-    if( !strcasecmp( str, "high422" ) )
-        return PROFILE_HIGH422;
-    if( !strcasecmp( str, "high444" ) )
-        return PROFILE_HIGH444_PREDICTIVE;
-    return -1;
-}
-
-int x264_param_apply_profile( x264_param_t *param, const char *profile )
-{
-    if( !profile )
-        return 0;
-
-    int p = profile_string_to_int( profile );
-    if( p < 0 )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile );
-        return -1;
-    }
-    if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) ||
-        (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0)) )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile );
-        return -1;
-    }
-    if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile );
-        return -1;
-    }
-    if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile );
-        return -1;
-    }
-    if( p < PROFILE_HIGH10 && BIT_DEPTH > 8 )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, BIT_DEPTH );
-        return -1;
-    }
-
-    if( p == PROFILE_BASELINE )
-    {
-        param->analyse.b_transform_8x8 = 0;
-        param->b_cabac = 0;
-        param->i_cqm_preset = X264_CQM_FLAT;
-        param->psz_cqm_file = NULL;
-        param->i_bframe = 0;
-        param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-        if( param->b_interlaced )
-        {
-            x264_log( NULL, X264_LOG_ERROR, "baseline profile doesn't support interlacing\n" );
-            return -1;
-        }
-        if( param->b_fake_interlaced )
-        {
-            x264_log( NULL, X264_LOG_ERROR, "baseline profile doesn't support fake interlacing\n" );
-            return -1;
-        }
-    }
-    else if( p == PROFILE_MAIN )
-    {
-        param->analyse.b_transform_8x8 = 0;
-        param->i_cqm_preset = X264_CQM_FLAT;
-        param->psz_cqm_file = NULL;
-    }
-    return 0;
-}
-
-static int parse_enum( const char *arg, const char * const *names, int *dst )
-{
-    for( int i = 0; names[i]; i++ )
-        if( !strcasecmp( arg, names[i] ) )
-        {
-            *dst = i;
-            return 0;
-        }
-    return -1;
-}
-
-static int parse_cqm( const char *str, uint8_t *cqm, int length )
-{
-    int i = 0;
-    do {
-        int coef;
-        if( !sscanf( str, "%d", &coef ) || coef < 1 || coef > 255 )
-            return -1;
-        cqm[i++] = coef;
-    } while( i < length && (str = strchr( str, ',' )) && str++ );
-    return (i == length) ? 0 : -1;
-}
-
-static int x264_atobool( const char *str, int *b_error )
-{
-    if( !strcmp(str, "1") ||
-        !strcasecmp(str, "true") ||
-        !strcasecmp(str, "yes") )
-        return 1;
-    if( !strcmp(str, "0") ||
-        !strcasecmp(str, "false") ||
-        !strcasecmp(str, "no") )
-        return 0;
-    *b_error = 1;
-    return 0;
-}
-
-static int x264_atoi( const char *str, int *b_error )
-{
-    char *end;
-    int v = strtol( str, &end, 0 );
-    if( end == str || *end != '\0' )
-        *b_error = 1;
-    return v;
-}
-
-static double x264_atof( const char *str, int *b_error )
-{
-    char *end;
-    double v = strtod( str, &end );
-    if( end == str || *end != '\0' )
-        *b_error = 1;
-    return v;
-}
-
-#define atobool(str) ( name_was_bool = 1, x264_atobool( str, &b_error ) )
-#undef atoi
-#undef atof
-#define atoi(str) x264_atoi( str, &b_error )
-#define atof(str) x264_atof( str, &b_error )
-
-int x264_param_parse( x264_param_t *p, const char *name, const char *value )
-{
-    char *name_buf = NULL;
-    int b_error = 0;
-    int errortype = X264_PARAM_BAD_VALUE;
-    int name_was_bool;
-    int value_was_null = !value;
-
-    if( !name )
-        return X264_PARAM_BAD_NAME;
-    if( !value )
-        value = "true";
-
-    if( value[0] == '=' )
-        value++;
-
-    if( strchr( name, '_' ) ) // s/_/-/g
-    {
-        char *c;
-        name_buf = strdup(name);
-        if( !name_buf )
-            return X264_PARAM_BAD_NAME;
-        while( (c = strchr( name_buf, '_' )) )
-            *c = '-';
-        name = name_buf;
-    }
-
-    if( !strncmp( name, "no", 2 ) )
-    {
-        name += 2;
-        if( name[0] == '-' )
-            name++;
-        value = atobool(value) ? "false" : "true";
-    }
-    name_was_bool = 0;
-
-#define OPT(STR) else if( !strcmp( name, STR ) )
-#define OPT2(STR0, STR1) else if( !strcmp( name, STR0 ) || !strcmp( name, STR1 ) )
-    if(0);
-    OPT("asm")
-    {
-        p->cpu = isdigit(value[0]) ? atoi(value) :
-                 !strcasecmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0;
-        if( b_error )
-        {
-            char *buf = strdup( value );
-            if( buf )
-            {
-                char *tok, UNUSED *saveptr=NULL, *init;
-                b_error = 0;
-                p->cpu = 0;
-                for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL )
-                {
-                    int i = 0;
-                    while( x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name) )
-                        i++;
-                    p->cpu |= x264_cpu_names[i].flags;
-                    if( !x264_cpu_names[i].flags )
-                        b_error = 1;
-                }
-                free( buf );
-                if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) )
-                    p->cpu |= X264_CPU_SSE2_IS_FAST;
-            }
-        }
-    }
-    OPT("threads")
-    {
-        if( !strcasecmp(value, "auto") )
-            p->i_threads = X264_THREADS_AUTO;
-        else
-            p->i_threads = atoi(value);
-    }
-    OPT("lookahead-threads")
-    {
-        if( !strcasecmp(value, "auto") )
-            p->i_lookahead_threads = X264_THREADS_AUTO;
-        else
-            p->i_lookahead_threads = atoi(value);
-    }
-    OPT("sliced-threads")
-        p->b_sliced_threads = atobool(value);
-    OPT("sync-lookahead")
-    {
-        if( !strcasecmp(value, "auto") )
-            p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
-        else
-            p->i_sync_lookahead = atoi(value);
-    }
-    OPT2("deterministic", "n-deterministic")
-        p->b_deterministic = atobool(value);
-    OPT("cpu-independent")
-        p->b_cpu_independent = atobool(value);
-    OPT2("level", "level-idc")
-    {
-        if( !strcmp(value, "1b") )
-            p->i_level_idc = 9;
-        else if( atof(value) < 6 )
-            p->i_level_idc = (int)(10*atof(value)+.5);
-        else
-            p->i_level_idc = atoi(value);
-    }
-    OPT("bluray-compat")
-        p->b_bluray_compat = atobool(value);
-    OPT("avcintra-class")
-        p->i_avcintra_class = atoi(value);
-    OPT("sar")
-    {
-        b_error = ( 2 != sscanf( value, "%d:%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) &&
-                    2 != sscanf( value, "%d/%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) );
-    }
-    OPT("overscan")
-        b_error |= parse_enum( value, x264_overscan_names, &p->vui.i_overscan );
-    OPT("videoformat")
-        b_error |= parse_enum( value, x264_vidformat_names, &p->vui.i_vidformat );
-    OPT("fullrange")
-        b_error |= parse_enum( value, x264_fullrange_names, &p->vui.b_fullrange );
-    OPT("colorprim")
-        b_error |= parse_enum( value, x264_colorprim_names, &p->vui.i_colorprim );
-    OPT("transfer")
-        b_error |= parse_enum( value, x264_transfer_names, &p->vui.i_transfer );
-    OPT("colormatrix")
-        b_error |= parse_enum( value, x264_colmatrix_names, &p->vui.i_colmatrix );
-    OPT("chromaloc")
-    {
-        p->vui.i_chroma_loc = atoi(value);
-        b_error = ( p->vui.i_chroma_loc < 0 || p->vui.i_chroma_loc > 5 );
-    }
-    OPT("fps")
-    {
-        if( sscanf( value, "%u/%u", &p->i_fps_num, &p->i_fps_den ) != 2 )
-        {
-            double fps = atof(value);
-            if( fps > 0.0 && fps <= INT_MAX/1000.0 )
-            {
-                p->i_fps_num = (int)(fps * 1000.0 + .5);
-                p->i_fps_den = 1000;
-            }
-            else
-            {
-                p->i_fps_num = atoi(value);
-                p->i_fps_den = 1;
-            }
-        }
-    }
-    OPT2("ref", "frameref")
-        p->i_frame_reference = atoi(value);
-    OPT("dpb-size")
-        p->i_dpb_size = atoi(value);
-    OPT("keyint")
-    {
-        if( strstr( value, "infinite" ) )
-            p->i_keyint_max = X264_KEYINT_MAX_INFINITE;
-        else
-            p->i_keyint_max = atoi(value);
-    }
-    OPT2("min-keyint", "keyint-min")
-    {
-        p->i_keyint_min = atoi(value);
-        if( p->i_keyint_max < p->i_keyint_min )
-            p->i_keyint_max = p->i_keyint_min;
-    }
-    OPT("scenecut")
-    {
-        p->i_scenecut_threshold = atobool(value);
-        if( b_error || p->i_scenecut_threshold )
-        {
-            b_error = 0;
-            p->i_scenecut_threshold = atoi(value);
-        }
-    }
-    OPT("intra-refresh")
-        p->b_intra_refresh = atobool(value);
-    OPT("bframes")
-        p->i_bframe = atoi(value);
-    OPT("b-adapt")
-    {
-        p->i_bframe_adaptive = atobool(value);
-        if( b_error )
-        {
-            b_error = 0;
-            p->i_bframe_adaptive = atoi(value);
-        }
-    }
-    OPT("b-bias")
-        p->i_bframe_bias = atoi(value);
-    OPT("b-pyramid")
-    {
-        b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid );
-        if( b_error )
-        {
-            b_error = 0;
-            p->i_bframe_pyramid = atoi(value);
-        }
-    }
-    OPT("open-gop")
-        p->b_open_gop = atobool(value);
-    OPT("nf")
-        p->b_deblocking_filter = !atobool(value);
-    OPT2("filter", "deblock")
-    {
-        if( 2 == sscanf( value, "%d:%d", &p->i_deblocking_filter_alphac0, &p->i_deblocking_filter_beta ) ||
-            2 == sscanf( value, "%d,%d", &p->i_deblocking_filter_alphac0, &p->i_deblocking_filter_beta ) )
-        {
-            p->b_deblocking_filter = 1;
-        }
-        else if( sscanf( value, "%d", &p->i_deblocking_filter_alphac0 ) )
-        {
-            p->b_deblocking_filter = 1;
-            p->i_deblocking_filter_beta = p->i_deblocking_filter_alphac0;
-        }
-        else
-            p->b_deblocking_filter = atobool(value);
-    }
-    OPT("slice-max-size")
-        p->i_slice_max_size = atoi(value);
-    OPT("slice-max-mbs")
-        p->i_slice_max_mbs = atoi(value);
-    OPT("slice-min-mbs")
-        p->i_slice_min_mbs = atoi(value);
-    OPT("slices")
-        p->i_slice_count = atoi(value);
-    OPT("slices-max")
-        p->i_slice_count_max = atoi(value);
-    OPT("cabac")
-        p->b_cabac = atobool(value);
-    OPT("cabac-idc")
-        p->i_cabac_init_idc = atoi(value);
-    OPT("interlaced")
-        p->b_interlaced = atobool(value);
-    OPT("tff")
-        p->b_interlaced = p->b_tff = atobool(value);
-    OPT("bff")
-    {
-        p->b_interlaced = atobool(value);
-        p->b_tff = !p->b_interlaced;
-    }
-    OPT("constrained-intra")
-        p->b_constrained_intra = atobool(value);
-    OPT("cqm")
-    {
-        if( strstr( value, "flat" ) )
-            p->i_cqm_preset = X264_CQM_FLAT;
-        else if( strstr( value, "jvt" ) )
-            p->i_cqm_preset = X264_CQM_JVT;
-        else
-            p->psz_cqm_file = strdup(value);
-    }
-    OPT("cqmfile")
-        p->psz_cqm_file = strdup(value);
-    OPT("cqm4")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4iy, 16 );
-        b_error |= parse_cqm( value, p->cqm_4py, 16 );
-        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
-        b_error |= parse_cqm( value, p->cqm_4pc, 16 );
-    }
-    OPT("cqm8")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_8iy, 64 );
-        b_error |= parse_cqm( value, p->cqm_8py, 64 );
-        b_error |= parse_cqm( value, p->cqm_8ic, 64 );
-        b_error |= parse_cqm( value, p->cqm_8pc, 64 );
-    }
-    OPT("cqm4i")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4iy, 16 );
-        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
-    }
-    OPT("cqm4p")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4py, 16 );
-        b_error |= parse_cqm( value, p->cqm_4pc, 16 );
-    }
-    OPT("cqm4iy")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4iy, 16 );
-    }
-    OPT("cqm4ic")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
-    }
-    OPT("cqm4py")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4py, 16 );
-    }
-    OPT("cqm4pc")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_4pc, 16 );
-    }
-    OPT("cqm8i")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_8iy, 64 );
-        b_error |= parse_cqm( value, p->cqm_8ic, 64 );
-    }
-    OPT("cqm8p")
-    {
-        p->i_cqm_preset = X264_CQM_CUSTOM;
-        b_error |= parse_cqm( value, p->cqm_8py, 64 );
-        b_error |= parse_cqm( value, p->cqm_8pc, 64 );
-    }
-    OPT("log")
-        p->i_log_level = atoi(value);
-    OPT("dump-yuv")
-        p->psz_dump_yuv = strdup(value);
-    OPT2("analyse", "partitions")
-    {
-        p->analyse.inter = 0;
-        if( strstr( value, "none" ) )  p->analyse.inter =  0;
-        if( strstr( value, "all" ) )   p->analyse.inter = ~0;
-
-        if( strstr( value, "i4x4" ) )  p->analyse.inter |= X264_ANALYSE_I4x4;
-        if( strstr( value, "i8x8" ) )  p->analyse.inter |= X264_ANALYSE_I8x8;
-        if( strstr( value, "p8x8" ) )  p->analyse.inter |= X264_ANALYSE_PSUB16x16;
-        if( strstr( value, "p4x4" ) )  p->analyse.inter |= X264_ANALYSE_PSUB8x8;
-        if( strstr( value, "b8x8" ) )  p->analyse.inter |= X264_ANALYSE_BSUB16x16;
-    }
-    OPT("8x8dct")
-        p->analyse.b_transform_8x8 = atobool(value);
-    OPT2("weightb", "weight-b")
-        p->analyse.b_weighted_bipred = atobool(value);
-    OPT("weightp")
-        p->analyse.i_weighted_pred = atoi(value);
-    OPT2("direct", "direct-pred")
-        b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
-    OPT("chroma-qp-offset")
-        p->analyse.i_chroma_qp_offset = atoi(value);
-    OPT("me")
-        b_error |= parse_enum( value, x264_motion_est_names, &p->analyse.i_me_method );
-    OPT2("merange", "me-range")
-        p->analyse.i_me_range = atoi(value);
-    OPT2("mvrange", "mv-range")
-        p->analyse.i_mv_range = atoi(value);
-    OPT2("mvrange-thread", "mv-range-thread")
-        p->analyse.i_mv_range_thread = atoi(value);
-    OPT2("subme", "subq")
-        p->analyse.i_subpel_refine = atoi(value);
-    OPT("psy-rd")
-    {
-        if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
-            2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
-            2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
-        { }
-        else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
-        {
-            p->analyse.f_psy_trellis = 0;
-        }
-        else
-        {
-            p->analyse.f_psy_rd = 0;
-            p->analyse.f_psy_trellis = 0;
-        }
-    }
-    OPT("psy")
-        p->analyse.b_psy = atobool(value);
-    OPT("chroma-me")
-        p->analyse.b_chroma_me = atobool(value);
-    OPT("mixed-refs")
-        p->analyse.b_mixed_references = atobool(value);
-    OPT("trellis")
-        p->analyse.i_trellis = atoi(value);
-    OPT("fast-pskip")
-        p->analyse.b_fast_pskip = atobool(value);
-    OPT("dct-decimate")
-        p->analyse.b_dct_decimate = atobool(value);
-    OPT("deadzone-inter")
-        p->analyse.i_luma_deadzone[0] = atoi(value);
-    OPT("deadzone-intra")
-        p->analyse.i_luma_deadzone[1] = atoi(value);
-    OPT("nr")
-        p->analyse.i_noise_reduction = atoi(value);
-    OPT("bitrate")
-    {
-        p->rc.i_bitrate = atoi(value);
-        p->rc.i_rc_method = X264_RC_ABR;
-    }
-    OPT2("qp", "qp_constant")
-    {
-        p->rc.i_qp_constant = atoi(value);
-        p->rc.i_rc_method = X264_RC_CQP;
-    }
-    OPT("crf")
-    {
-        p->rc.f_rf_constant = atof(value);
-        p->rc.i_rc_method = X264_RC_CRF;
-    }
-    OPT("crf-max")
-        p->rc.f_rf_constant_max = atof(value);
-    OPT("rc-lookahead")
-        p->rc.i_lookahead = atoi(value);
-    OPT2("qpmin", "qp-min")
-        p->rc.i_qp_min = atoi(value);
-    OPT2("qpmax", "qp-max")
-        p->rc.i_qp_max = atoi(value);
-    OPT2("qpstep", "qp-step")
-        p->rc.i_qp_step = atoi(value);
-    OPT("ratetol")
-        p->rc.f_rate_tolerance = !strncmp("inf", value, 3) ? 1e9 : atof(value);
-    OPT("vbv-maxrate")
-        p->rc.i_vbv_max_bitrate = atoi(value);
-    OPT("vbv-bufsize")
-        p->rc.i_vbv_buffer_size = atoi(value);
-    OPT("vbv-init")
-        p->rc.f_vbv_buffer_init = atof(value);
-    OPT2("ipratio", "ip-factor")
-        p->rc.f_ip_factor = atof(value);
-    OPT2("pbratio", "pb-factor")
-        p->rc.f_pb_factor = atof(value);
-    OPT("aq-mode")
-        p->rc.i_aq_mode = atoi(value);
-    OPT("aq-strength")
-        p->rc.f_aq_strength = atof(value);
-    OPT("pass")
-    {
-        int pass = x264_clip3( atoi(value), 0, 3 );
-        p->rc.b_stat_write = pass & 1;
-        p->rc.b_stat_read = pass & 2;
-    }
-    OPT("stats")
-    {
-        p->rc.psz_stat_in = strdup(value);
-        p->rc.psz_stat_out = strdup(value);
-    }
-    OPT("qcomp")
-        p->rc.f_qcompress = atof(value);
-    OPT("mbtree")
-        p->rc.b_mb_tree = atobool(value);
-    OPT("qblur")
-        p->rc.f_qblur = atof(value);
-    OPT2("cplxblur", "cplx-blur")
-        p->rc.f_complexity_blur = atof(value);
-    OPT("zones")
-        p->rc.psz_zones = strdup(value);
-    OPT("crop-rect")
-        b_error |= sscanf( value, "%u,%u,%u,%u", &p->crop_rect.i_left, &p->crop_rect.i_top,
-                                                 &p->crop_rect.i_right, &p->crop_rect.i_bottom ) != 4;
-    OPT("psnr")
-        p->analyse.b_psnr = atobool(value);
-    OPT("ssim")
-        p->analyse.b_ssim = atobool(value);
-    OPT("aud")
-        p->b_aud = atobool(value);
-    OPT("sps-id")
-        p->i_sps_id = atoi(value);
-    OPT("global-header")
-        p->b_repeat_headers = !atobool(value);
-    OPT("repeat-headers")
-        p->b_repeat_headers = atobool(value);
-    OPT("annexb")
-        p->b_annexb = atobool(value);
-    OPT("force-cfr")
-        p->b_vfr_input = !atobool(value);
-    OPT("nal-hrd")
-        b_error |= parse_enum( value, x264_nal_hrd_names, &p->i_nal_hrd );
-    OPT("filler")
-        p->rc.b_filler = atobool(value);
-    OPT("pic-struct")
-        p->b_pic_struct = atobool(value);
-    OPT("fake-interlaced")
-        p->b_fake_interlaced = atobool(value);
-    OPT("frame-packing")
-        p->i_frame_packing = atoi(value);
-    OPT("stitchable")
-        p->b_stitchable = atobool(value);
-    OPT("opencl")
-        p->b_opencl = atobool( value );
-    OPT("opencl-clbin")
-        p->psz_clbin_file = strdup( value );
-    OPT("opencl-device")
-        p->i_opencl_device = atoi( value );
-    else
-    {
-        b_error = 1;
-        errortype = X264_PARAM_BAD_NAME;
-    }
-#undef OPT
-#undef OPT2
-#undef atobool
-#undef atoi
-#undef atof
-
-    if( name_buf )
-        free( name_buf );
-
-    b_error |= value_was_null && !name_was_bool;
-    return b_error ? errortype : 0;
-}
-
-/****************************************************************************
- * x264_log:
- ****************************************************************************/
-void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
-{
-    if( !h || i_level <= h->param.i_log_level )
-    {
-        va_list arg;
-        va_start( arg, psz_fmt );
-        if( !h )
-            x264_log_default( NULL, i_level, psz_fmt, arg );
-        else
-            h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
-        va_end( arg );
-    }
-}
-
-static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg )
-{
-    char *psz_prefix;
-    switch( i_level )
-    {
-        case X264_LOG_ERROR:
-            psz_prefix = "error";
-            break;
-        case X264_LOG_WARNING:
-            psz_prefix = "warning";
-            break;
-        case X264_LOG_INFO:
-            psz_prefix = "info";
-            break;
-        case X264_LOG_DEBUG:
-            psz_prefix = "debug";
-            break;
-        default:
-            psz_prefix = "unknown";
-            break;
-    }
-    fprintf( stderr, "x264 [%s]: ", psz_prefix );
-    x264_vfprintf( stderr, psz_fmt, arg );
-}
-
-/****************************************************************************
- * x264_picture_init:
- ****************************************************************************/
-void x264_picture_init( x264_picture_t *pic )
-{
-    memset( pic, 0, sizeof( x264_picture_t ) );
-    pic->i_type = X264_TYPE_AUTO;
-    pic->i_qpplus1 = X264_QP_AUTO;
-    pic->i_pic_struct = PIC_STRUCT_AUTO;
-}
-
-/****************************************************************************
- * x264_picture_alloc:
- ****************************************************************************/
-int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
-{
-    typedef struct
-    {
-        int planes;
-        int width_fix8[3];
-        int height_fix8[3];
-    } x264_csp_tab_t;
-
-    static const x264_csp_tab_t x264_csp_tab[] =
-    {
-        [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
-        [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
-        [X264_CSP_NV12] = { 2, { 256*1, 256*1 },        { 256*1, 256/2 },       },
-        [X264_CSP_NV21] = { 2, { 256*1, 256*1 },        { 256*1, 256/2 },       },
-        [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
-        [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
-        [X264_CSP_NV16] = { 2, { 256*1, 256*1 },        { 256*1, 256*1 },       },
-        [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
-        [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
-        [X264_CSP_BGR]  = { 1, { 256*3 },               { 256*1 },              },
-        [X264_CSP_BGRA] = { 1, { 256*4 },               { 256*1 },              },
-        [X264_CSP_RGB]  = { 1, { 256*3 },               { 256*1 },              },
-    };
-
-    int csp = i_csp & X264_CSP_MASK;
-    if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 )
-        return -1;
-    x264_picture_init( pic );
-    pic->img.i_csp = i_csp;
-    pic->img.i_plane = x264_csp_tab[csp].planes;
-    int depth_factor = i_csp & X264_CSP_HIGH_DEPTH ? 2 : 1;
-    int plane_offset[3] = {0};
-    int frame_size = 0;
-    for( int i = 0; i < pic->img.i_plane; i++ )
-    {
-        int stride = (((int64_t)i_width * x264_csp_tab[csp].width_fix8[i]) >> 8) * depth_factor;
-        int plane_size = (((int64_t)i_height * x264_csp_tab[csp].height_fix8[i]) >> 8) * stride;
-        pic->img.i_stride[i] = stride;
-        plane_offset[i] = frame_size;
-        frame_size += plane_size;
-    }
-    pic->img.plane[0] = x264_malloc( frame_size );
-    if( !pic->img.plane[0] )
-        return -1;
-    for( int i = 1; i < pic->img.i_plane; i++ )
-        pic->img.plane[i] = pic->img.plane[0] + plane_offset[i];
-    return 0;
-}
-
-/****************************************************************************
- * x264_picture_clean:
- ****************************************************************************/
-void x264_picture_clean( x264_picture_t *pic )
-{
-    x264_free( pic->img.plane[0] );
-
-    /* just to be safe */
-    memset( pic, 0, sizeof( x264_picture_t ) );
-}
-
-/****************************************************************************
- * x264_malloc:
- ****************************************************************************/
-void *x264_malloc( int i_size )
-{
-    uint8_t *align_buf = NULL;
-#if HAVE_MALLOC_H
-#if HAVE_THP
-#define HUGE_PAGE_SIZE 2*1024*1024
-#define HUGE_PAGE_THRESHOLD HUGE_PAGE_SIZE*7/8 /* FIXME: Is this optimal? */
-    /* Attempt to allocate huge pages to reduce TLB misses. */
-    if( i_size >= HUGE_PAGE_THRESHOLD )
-    {
-        align_buf = memalign( HUGE_PAGE_SIZE, i_size );
-        if( align_buf )
-        {
-            /* Round up to the next huge page boundary if we are close enough. */
-            size_t madv_size = (i_size + HUGE_PAGE_SIZE - HUGE_PAGE_THRESHOLD) & ~(HUGE_PAGE_SIZE-1);
-            madvise( align_buf, madv_size, MADV_HUGEPAGE );
-        }
-    }
-    else
-#undef HUGE_PAGE_SIZE
-#undef HUGE_PAGE_THRESHOLD
-#endif
-        align_buf = memalign( NATIVE_ALIGN, i_size );
-#else
-    uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
-    if( buf )
-    {
-        align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **);
-        align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1);
-        *( (void **) ( align_buf - sizeof(void **) ) ) = buf;
-    }
-#endif
-    if( !align_buf )
-        x264_log( NULL, X264_LOG_ERROR, "malloc of size %d failed\n", i_size );
-    return align_buf;
-}
-
-/****************************************************************************
- * x264_free:
- ****************************************************************************/
-void x264_free( void *p )
-{
-    if( p )
-    {
-#if HAVE_MALLOC_H
-        free( p );
-#else
-        free( *( ( ( void **) p ) - 1 ) );
-#endif
-    }
-}
-
-/****************************************************************************
- * x264_reduce_fraction:
- ****************************************************************************/
-#define REDUCE_FRACTION( name, type )\
-void name( type *n, type *d )\
-{                   \
-    type a = *n;    \
-    type b = *d;    \
-    type c;         \
-    if( !a || !b )  \
-        return;     \
-    c = a % b;      \
-    while( c )      \
-    {               \
-        a = b;      \
-        b = c;      \
-        c = a % b;  \
-    }               \
-    *n /= b;        \
-    *d /= b;        \
-}
-
-REDUCE_FRACTION( x264_reduce_fraction  , uint32_t )
-REDUCE_FRACTION( x264_reduce_fraction64, uint64_t )
-
-/****************************************************************************
- * x264_slurp_file:
- ****************************************************************************/
-char *x264_slurp_file( const char *filename )
-{
-    int b_error = 0;
-    int64_t i_size;
-    char *buf;
-    FILE *fh = x264_fopen( filename, "rb" );
-    if( !fh )
-        return NULL;
-
-    b_error |= fseek( fh, 0, SEEK_END ) < 0;
-    b_error |= ( i_size = ftell( fh ) ) <= 0;
-    if( WORD_SIZE == 4 )
-        b_error |= i_size > INT32_MAX;
-    b_error |= fseek( fh, 0, SEEK_SET ) < 0;
-    if( b_error )
-        goto error;
-
-    buf = x264_malloc( i_size+2 );
-    if( !buf )
-        goto error;
-
-    b_error |= fread( buf, 1, i_size, fh ) != i_size;
-    fclose( fh );
-    if( b_error )
-    {
-        x264_free( buf );
-        return NULL;
-    }
-
-    if( buf[i_size-1] != '\n' )
-        buf[i_size++] = '\n';
-    buf[i_size] = '\0';
-
-    return buf;
-error:
-    fclose( fh );
-    return NULL;
-}
-
-/****************************************************************************
- * x264_param2string:
- ****************************************************************************/
-char *x264_param2string( x264_param_t *p, int b_res )
-{
-    int len = 1000;
-    char *buf, *s;
-    if( p->rc.psz_zones )
-        len += strlen(p->rc.psz_zones);
-    buf = s = x264_malloc( len );
-    if( !buf )
-        return NULL;
-
-    if( b_res )
-    {
-        s += sprintf( s, "%dx%d ", p->i_width, p->i_height );
-        s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den );
-        s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den );
-        s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
-    }
-
-    if( p->b_opencl )
-        s += sprintf( s, "opencl=%d ", p->b_opencl );
-    s += sprintf( s, "cabac=%d", p->b_cabac );
-    s += sprintf( s, " ref=%d", p->i_frame_reference );
-    s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
-                  p->i_deblocking_filter_alphac0, p->i_deblocking_filter_beta );
-    s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
-    s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
-    s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
-    s += sprintf( s, " psy=%d", p->analyse.b_psy );
-    if( p->analyse.b_psy )
-        s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
-    s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
-    s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
-    s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
-    s += sprintf( s, " trellis=%d", p->analyse.i_trellis );
-    s += sprintf( s, " 8x8dct=%d", p->analyse.b_transform_8x8 );
-    s += sprintf( s, " cqm=%d", p->i_cqm_preset );
-    s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
-    s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
-    s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
-    s += sprintf( s, " threads=%d", p->i_threads );
-    s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads );
-    s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
-    if( p->i_slice_count )
-        s += sprintf( s, " slices=%d", p->i_slice_count );
-    if( p->i_slice_count_max )
-        s += sprintf( s, " slices_max=%d", p->i_slice_count_max );
-    if( p->i_slice_max_size )
-        s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size );
-    if( p->i_slice_max_mbs )
-        s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs );
-    if( p->i_slice_min_mbs )
-        s += sprintf( s, " slice_min_mbs=%d", p->i_slice_min_mbs );
-    s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
-    s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
-    s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? "fake" : "0" );
-    s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat );
-    if( p->b_stitchable )
-        s += sprintf( s, " stitchable=%d", p->b_stitchable );
-
-    s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
-
-    s += sprintf( s, " bframes=%d", p->i_bframe );
-    if( p->i_bframe )
-    {
-        s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d weightb=%d open_gop=%d",
-                      p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
-                      p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred, p->b_open_gop );
-    }
-    s += sprintf( s, " weightp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
-
-    if( p->i_keyint_max == X264_KEYINT_MAX_INFINITE )
-        s += sprintf( s, " keyint=infinite" );
-    else
-        s += sprintf( s, " keyint=%d", p->i_keyint_max );
-    s += sprintf( s, " keyint_min=%d scenecut=%d intra_refresh=%d",
-                  p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh );
-
-    if( p->rc.b_mb_tree || p->rc.i_vbv_buffer_size )
-        s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );
-
-    s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
-                               ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" )
-                               : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
-    if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
-    {
-        if( p->rc.i_rc_method == X264_RC_CRF )
-            s += sprintf( s, " crf=%.1f", p->rc.f_rf_constant );
-        else
-            s += sprintf( s, " bitrate=%d ratetol=%.1f",
-                          p->rc.i_bitrate, p->rc.f_rate_tolerance );
-        s += sprintf( s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
-                      p->rc.f_qcompress, p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step );
-        if( p->rc.b_stat_read )
-            s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
-                          p->rc.f_complexity_blur, p->rc.f_qblur );
-        if( p->rc.i_vbv_buffer_size )
-        {
-            s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d",
-                          p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size );
-            if( p->rc.i_rc_method == X264_RC_CRF )
-                s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max );
-        }
-    }
-    else if( p->rc.i_rc_method == X264_RC_CQP )
-        s += sprintf( s, " qp=%d", p->rc.i_qp_constant );
-
-    if( p->rc.i_vbv_buffer_size )
-        s += sprintf( s, " nal_hrd=%s filler=%d", x264_nal_hrd_names[p->i_nal_hrd], p->rc.b_filler );
-    if( p->crop_rect.i_left | p->crop_rect.i_top | p->crop_rect.i_right | p->crop_rect.i_bottom )
-        s += sprintf( s, " crop_rect=%u,%u,%u,%u", p->crop_rect.i_left, p->crop_rect.i_top,
-                                                   p->crop_rect.i_right, p->crop_rect.i_bottom );
-    if( p->i_frame_packing >= 0 )
-        s += sprintf( s, " frame-packing=%d", p->i_frame_packing );
-
-    if( !(p->rc.i_rc_method == X264_RC_CQP && p->rc.i_qp_constant == 0) )
-    {
-        s += sprintf( s, " ip_ratio=%.2f", p->rc.f_ip_factor );
-        if( p->i_bframe && !p->rc.b_mb_tree )
-            s += sprintf( s, " pb_ratio=%.2f", p->rc.f_pb_factor );
-        s += sprintf( s, " aq=%d", p->rc.i_aq_mode );
-        if( p->rc.i_aq_mode )
-            s += sprintf( s, ":%.2f", p->rc.f_aq_strength );
-        if( p->rc.psz_zones )
-            s += sprintf( s, " zones=%s", p->rc.psz_zones );
-        else if( p->rc.i_zones )
-            s += sprintf( s, " zones" );
-    }
-
-    return buf;
-}
-
diff --git a/android/src/main/libenc/jni/libx264/common/common.h b/android/src/main/libenc/jni/libx264/common/common.h
deleted file mode 100755
index 3a74c9e..0000000
--- a/android/src/main/libenc/jni/libx264/common/common.h
+++ /dev/null
@@ -1,1025 +0,0 @@
-/*****************************************************************************
- * common.h: misc common functions
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_COMMON_H
-#define X264_COMMON_H
-
-/****************************************************************************
- * Macros
- ****************************************************************************/
-#define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )
-#define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
-#define X264_MIN3(a,b,c) X264_MIN((a),X264_MIN((b),(c)))
-#define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
-#define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
-#define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
-#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0)
-#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
-#define FIX8(f) ((int)(f*(1<<8)+.5))
-#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
-#define ARRAY_ELEMS(a) ((sizeof(a))/(sizeof(a[0])))
-
-#define CHECKED_MALLOC( var, size )\
-do {\
-    var = x264_malloc( size );\
-    if( !var )\
-        goto fail;\
-} while( 0 )
-#define CHECKED_MALLOCZERO( var, size )\
-do {\
-    CHECKED_MALLOC( var, size );\
-    memset( var, 0, size );\
-} while( 0 )
-
-/* Macros for merging multiple allocations into a single large malloc, for improved
- * use with huge pages. */
-
-/* Needs to be enough to contain any set of buffers that use combined allocations */
-#define PREALLOC_BUF_SIZE 1024
-
-#define PREALLOC_INIT\
-    int    prealloc_idx = 0;\
-    size_t prealloc_size = 0;\
-    uint8_t **preallocs[PREALLOC_BUF_SIZE];
-
-#define PREALLOC( var, size )\
-do {\
-    var = (void*)prealloc_size;\
-    preallocs[prealloc_idx++] = (uint8_t**)&var;\
-    prealloc_size += ALIGN(size, NATIVE_ALIGN);\
-} while(0)
-
-#define PREALLOC_END( ptr )\
-do {\
-    CHECKED_MALLOC( ptr, prealloc_size );\
-    while( prealloc_idx-- )\
-        *preallocs[prealloc_idx] += (intptr_t)ptr;\
-} while(0)
-
-#define ARRAY_SIZE(array)  (sizeof(array)/sizeof(array[0]))
-
-#define X264_BFRAME_MAX 16
-#define X264_REF_MAX 16
-#define X264_THREAD_MAX 128
-#define X264_LOOKAHEAD_THREAD_MAX 16
-#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
-#define X264_LOOKAHEAD_MAX 250
-#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
-#define QP_MAX_SPEC (51+QP_BD_OFFSET)
-#define QP_MAX (QP_MAX_SPEC+18)
-#define QP_MAX_MAX (51+2*6+18)
-#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
-// arbitrary, but low because SATD scores are 1/4 normal
-#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
-#define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC)
-
-// number of pixels (per thread) in progress at any given time.
-// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
-#define X264_THREAD_HEIGHT 24
-
-/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
- * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
- * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
- * real weights are being used. */
-
-#define X264_WEIGHTP_FAKE (-1)
-
-#define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
-#define FILLER_OVERHEAD (NALU_OVERHEAD+1)
-#define SEI_OVERHEAD (NALU_OVERHEAD - (h->param.b_annexb && !h->param.i_avcintra_class && (h->out.i_nal-1)))
-
-/****************************************************************************
- * Includes
- ****************************************************************************/
-#include "osdep.h"
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <limits.h>
-
-#if HAVE_INTERLACED
-#   define MB_INTERLACED h->mb.b_interlaced
-#   define SLICE_MBAFF h->sh.b_mbaff
-#   define PARAM_INTERLACED h->param.b_interlaced
-#else
-#   define MB_INTERLACED 0
-#   define SLICE_MBAFF 0
-#   define PARAM_INTERLACED 0
-#endif
-
-#ifdef CHROMA_FORMAT
-#    define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
-#    define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
-#else
-#    define CHROMA_FORMAT h->sps->i_chroma_format_idc
-#    define CHROMA_H_SHIFT h->mb.chroma_h_shift
-#    define CHROMA_V_SHIFT h->mb.chroma_v_shift
-#endif
-
-#define CHROMA_SIZE(s) ((s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT))
-#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
-#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
-
-/* Unions for type-punning.
- * Mn: load or store n bits, aligned, native-endian
- * CPn: copy n bits, aligned, native-endian
- * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
-typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
-typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
-typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
-typedef struct { uint64_t i[2]; } x264_uint128_t;
-typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
-#define M16(src) (((x264_union16_t*)(src))->i)
-#define M32(src) (((x264_union32_t*)(src))->i)
-#define M64(src) (((x264_union64_t*)(src))->i)
-#define M128(src) (((x264_union128_t*)(src))->i)
-#define M128_ZERO ((x264_uint128_t){{0,0}})
-#define CP16(dst,src) M16(dst) = M16(src)
-#define CP32(dst,src) M32(dst) = M32(src)
-#define CP64(dst,src) M64(dst) = M64(src)
-#define CP128(dst,src) M128(dst) = M128(src)
-
-#if HIGH_BIT_DEPTH
-    typedef uint16_t pixel;
-    typedef uint64_t pixel4;
-    typedef int32_t  dctcoef;
-    typedef uint32_t udctcoef;
-
-#   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
-#   define MPIXEL_X4(src) M64(src)
-#else
-    typedef uint8_t  pixel;
-    typedef uint32_t pixel4;
-    typedef int16_t  dctcoef;
-    typedef uint16_t udctcoef;
-
-#   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
-#   define MPIXEL_X4(src) M32(src)
-#endif
-
-#define BIT_DEPTH X264_BIT_DEPTH
-
-#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
-
-#define X264_SCAN8_LUMA_SIZE (5*8)
-#define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3)
-#define X264_SCAN8_0 (4+1*8)
-
-/* Scan8 organization:
- *    0 1 2 3 4 5 6 7
- * 0  DY    y y y y y
- * 1        y Y Y Y Y
- * 2        y Y Y Y Y
- * 3        y Y Y Y Y
- * 4        y Y Y Y Y
- * 5  DU    u u u u u
- * 6        u U U U U
- * 7        u U U U U
- * 8        u U U U U
- * 9        u U U U U
- * 10 DV    v v v v v
- * 11       v V V V V
- * 12       v V V V V
- * 13       v V V V V
- * 14       v V V V V
- * DY/DU/DV are for luma/chroma DC.
- */
-
-#define LUMA_DC   48
-#define CHROMA_DC 49
-
-static const uint8_t x264_scan8[16*3 + 3] =
-{
-    4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
-    6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
-    4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
-    6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
-    4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
-    6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
-    4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
-    6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
-    4+11*8, 5+11*8, 4+12*8, 5+12*8,
-    6+11*8, 7+11*8, 6+12*8, 7+12*8,
-    4+13*8, 5+13*8, 4+14*8, 5+14*8,
-    6+13*8, 7+13*8, 6+14*8, 7+14*8,
-    0+ 0*8, 0+ 5*8, 0+10*8
-};
-
-#include "x264.h"
-#if HAVE_OPENCL
-#include "opencl.h"
-#endif
-#include "cabac.h"
-#include "bitstream.h"
-#include "set.h"
-#include "predict.h"
-#include "pixel.h"
-#include "mc.h"
-#include "frame.h"
-#include "dct.h"
-#include "quant.h"
-#include "cpu.h"
-#include "threadpool.h"
-
-/****************************************************************************
- * General functions
- ****************************************************************************/
-/* x264_malloc : will do or emulate a memalign
- * you have to use x264_free for buffers allocated with x264_malloc */
-void *x264_malloc( int );
-void  x264_free( void * );
-
-/* x264_slurp_file: malloc space for the whole file and read it */
-char *x264_slurp_file( const char *filename );
-
-/* mdate: return the current date in microsecond */
-int64_t x264_mdate( void );
-
-/* x264_param2string: return a (malloced) string containing most of
- * the encoding options */
-char *x264_param2string( x264_param_t *p, int b_res );
-
-/* log */
-void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
-
-void x264_reduce_fraction( uint32_t *n, uint32_t *d );
-void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( x264_t *h );
-void x264_cabac_init( x264_t *h );
-
-static ALWAYS_INLINE pixel x264_clip_pixel( int x )
-{
-    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
-}
-
-static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
-{
-    return ( (v < i_min) ? i_min : (v > i_max) ? i_max : v );
-}
-
-static ALWAYS_INLINE double x264_clip3f( double v, double f_min, double f_max )
-{
-    return ( (v < f_min) ? f_min : (v > f_max) ? f_max : v );
-}
-
-static ALWAYS_INLINE int x264_median( int a, int b, int c )
-{
-    int t = (a-b)&((a-b)>>31);
-    a -= t;
-    b += t;
-    b -= (b-c)&((b-c)>>31);
-    b += (a-b)&((a-b)>>31);
-    return b;
-}
-
-static ALWAYS_INLINE void x264_median_mv( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
-{
-    dst[0] = x264_median( a[0], b[0], c[0] );
-    dst[1] = x264_median( a[1], b[1], c[1] );
-}
-
-static ALWAYS_INLINE int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
-{
-    int sum = 0;
-    for( int i = 0; i < i_mvc-1; i++ )
-    {
-        sum += abs( mvc[i][0] - mvc[i+1][0] )
-             + abs( mvc[i][1] - mvc[i+1][1] );
-    }
-    return sum;
-}
-
-static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
-{
-    int amvd0 = mvdleft[0] + mvdtop[0];
-    int amvd1 = mvdleft[1] + mvdtop[1];
-    amvd0 = (amvd0 > 2) + (amvd0 > 32);
-    amvd1 = (amvd1 > 2) + (amvd1 > 32);
-    return amvd0 + (amvd1<<8);
-}
-
-extern const uint8_t x264_exp2_lut[64];
-extern const float x264_log2_lut[128];
-extern const float x264_log2_lz_lut[32];
-
-/* Not a general-purpose function; multiplies input by -1/6 to convert
- * qp to qscale. */
-static ALWAYS_INLINE int x264_exp2fix8( float x )
-{
-    int i = x*(-64.f/6.f) + 512.5f;
-    if( i < 0 ) return 0;
-    if( i > 1023 ) return 0xffff;
-    return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8;
-}
-
-static ALWAYS_INLINE float x264_log2( uint32_t x )
-{
-    int lz = x264_clz( x );
-    return x264_log2_lut[(x<<lz>>24)&0x7f] + x264_log2_lz_lut[lz];
-}
-
-/****************************************************************************
- *
- ****************************************************************************/
-enum slice_type_e
-{
-    SLICE_TYPE_P  = 0,
-    SLICE_TYPE_B  = 1,
-    SLICE_TYPE_I  = 2,
-};
-
-static const char slice_type_to_char[] = { 'P', 'B', 'I' };
-
-enum sei_payload_type_e
-{
-    SEI_BUFFERING_PERIOD       = 0,
-    SEI_PIC_TIMING             = 1,
-    SEI_PAN_SCAN_RECT          = 2,
-    SEI_FILLER                 = 3,
-    SEI_USER_DATA_REGISTERED   = 4,
-    SEI_USER_DATA_UNREGISTERED = 5,
-    SEI_RECOVERY_POINT         = 6,
-    SEI_DEC_REF_PIC_MARKING    = 7,
-    SEI_FRAME_PACKING          = 45,
-};
-
-typedef struct
-{
-    x264_sps_t *sps;
-    x264_pps_t *pps;
-
-    int i_type;
-    int i_first_mb;
-    int i_last_mb;
-
-    int i_pps_id;
-
-    int i_frame_num;
-
-    int b_mbaff;
-    int b_field_pic;
-    int b_bottom_field;
-
-    int i_idr_pic_id;   /* -1 if nal_type != 5 */
-
-    int i_poc;
-    int i_delta_poc_bottom;
-
-    int i_delta_poc[2];
-    int i_redundant_pic_cnt;
-
-    int b_direct_spatial_mv_pred;
-
-    int b_num_ref_idx_override;
-    int i_num_ref_idx_l0_active;
-    int i_num_ref_idx_l1_active;
-
-    int b_ref_pic_list_reordering[2];
-    struct
-    {
-        int idc;
-        int arg;
-    } ref_pic_list_order[2][X264_REF_MAX];
-
-    /* P-frame weighting */
-    int b_weighted_pred;
-    x264_weight_t weight[X264_REF_MAX*2][3];
-
-    int i_mmco_remove_from_end;
-    int i_mmco_command_count;
-    struct /* struct for future expansion */
-    {
-        int i_difference_of_pic_nums;
-        int i_poc;
-    } mmco[X264_REF_MAX];
-
-    int i_cabac_init_idc;
-
-    int i_qp;
-    int i_qp_delta;
-    int b_sp_for_swidth;
-    int i_qs_delta;
-
-    /* deblocking filter */
-    int i_disable_deblocking_filter_idc;
-    int i_alpha_c0_offset;
-    int i_beta_offset;
-
-} x264_slice_header_t;
-
-typedef struct x264_lookahead_t
-{
-    volatile uint8_t              b_exit_thread;
-    uint8_t                       b_thread_active;
-    uint8_t                       b_analyse_keyframe;
-    int                           i_last_keyframe;
-    int                           i_slicetype_length;
-    x264_frame_t                  *last_nonb;
-    x264_pthread_t                thread_handle;
-    x264_sync_frame_list_t        ifbuf;
-    x264_sync_frame_list_t        next;
-    x264_sync_frame_list_t        ofbuf;
-} x264_lookahead_t;
-
-typedef struct x264_ratecontrol_t   x264_ratecontrol_t;
-
-typedef struct x264_left_table_t
-{
-    uint8_t intra[4];
-    uint8_t nnz[4];
-    uint8_t nnz_chroma[4];
-    uint8_t mv[4];
-    uint8_t ref[4];
-} x264_left_table_t;
-
-/* Current frame stats */
-typedef struct
-{
-    /* MV bits (MV+Ref+Block Type) */
-    int i_mv_bits;
-    /* Texture bits (DCT coefs) */
-    int i_tex_bits;
-    /* ? */
-    int i_misc_bits;
-    /* MB type counts */
-    int i_mb_count[19];
-    int i_mb_count_i;
-    int i_mb_count_p;
-    int i_mb_count_skip;
-    int i_mb_count_8x8dct[2];
-    int i_mb_count_ref[2][X264_REF_MAX*2];
-    int i_mb_partition[17];
-    int i_mb_cbp[6];
-    int i_mb_pred_mode[4][13];
-    int i_mb_field[3];
-    /* Adaptive direct mv pred */
-    int i_direct_score[2];
-    /* Metrics */
-    int64_t i_ssd[3];
-    double f_ssim;
-    int i_ssim_cnt;
-} x264_frame_stat_t;
-
-struct x264_t
-{
-    /* encoder parameters */
-    x264_param_t    param;
-
-    x264_t          *thread[X264_THREAD_MAX+1];
-    x264_t          *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
-    int             b_thread_active;
-    int             i_thread_phase; /* which thread to use for the next frame */
-    int             i_thread_idx;   /* which thread this is */
-    int             i_threadslice_start; /* first row in this thread slice */
-    int             i_threadslice_end; /* row after the end of this thread slice */
-    int             i_threadslice_pass; /* which pass of encoding we are on */
-    x264_threadpool_t *threadpool;
-    x264_threadpool_t *lookaheadpool;
-    x264_pthread_mutex_t mutex;
-    x264_pthread_cond_t cv;
-
-    /* bitstream output */
-    struct
-    {
-        int         i_nal;
-        int         i_nals_allocated;
-        x264_nal_t  *nal;
-        int         i_bitstream;    /* size of p_bitstream */
-        uint8_t     *p_bitstream;   /* will hold data for all nal */
-        bs_t        bs;
-    } out;
-
-    uint8_t *nal_buffer;
-    int      nal_buffer_size;
-
-    x264_t          *reconfig_h;
-    int             reconfig;
-
-    /**** thread synchronization starts here ****/
-
-    /* frame number/poc */
-    int             i_frame;
-    int             i_frame_num;
-
-    int             i_thread_frames; /* Number of different frames being encoded by threads;
-                                      * 1 when sliced-threads is on. */
-    int             i_nal_type;
-    int             i_nal_ref_idc;
-
-    int64_t         i_disp_fields;  /* Number of displayed fields (both coded and implied via pic_struct) */
-    int             i_disp_fields_last_frame;
-    int64_t         i_prev_duration; /* Duration of previous frame */
-    int64_t         i_coded_fields; /* Number of coded fields (both coded and implied via pic_struct) */
-    int64_t         i_cpb_delay;    /* Equal to number of fields preceding this field
-                                     * since last buffering_period SEI */
-    int64_t         i_coded_fields_lookahead; /* Use separate counters for lookahead */
-    int64_t         i_cpb_delay_lookahead;
-
-    int64_t         i_cpb_delay_pir_offset;
-    int64_t         i_cpb_delay_pir_offset_next;
-
-    int             b_queued_intra_refresh;
-    int64_t         i_last_idr_pts;
-
-    int             i_idr_pic_id;
-
-    /* quantization matrix for decoding, [cqm][qp%6][coef] */
-    int             (*dequant4_mf[4])[16];   /* [4][6][16] */
-    int             (*dequant8_mf[4])[64];   /* [4][6][64] */
-    /* quantization matrix for trellis, [cqm][qp][coef] */
-    int             (*unquant4_mf[4])[16];   /* [4][QP_MAX_SPEC+1][16] */
-    int             (*unquant8_mf[4])[64];   /* [4][QP_MAX_SPEC+1][64] */
-    /* quantization matrix for deadzone */
-    udctcoef        (*quant4_mf[4])[16];     /* [4][QP_MAX_SPEC+1][16] */
-    udctcoef        (*quant8_mf[4])[64];     /* [4][QP_MAX_SPEC+1][64] */
-    udctcoef        (*quant4_bias[4])[16];   /* [4][QP_MAX_SPEC+1][16] */
-    udctcoef        (*quant8_bias[4])[64];   /* [4][QP_MAX_SPEC+1][64] */
-    udctcoef        (*quant4_bias0[4])[16];  /* [4][QP_MAX_SPEC+1][16] */
-    udctcoef        (*quant8_bias0[4])[64];  /* [4][QP_MAX_SPEC+1][64] */
-    udctcoef        (*nr_offset_emergency)[4][64];
-
-    /* mv/ref cost arrays. */
-    uint16_t *cost_mv[QP_MAX+1];
-    uint16_t *cost_mv_fpel[QP_MAX+1][4];
-
-    const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
-
-    /* Slice header */
-    x264_slice_header_t sh;
-
-    /* SPS / PPS */
-    x264_sps_t      sps[1];
-    x264_pps_t      pps[1];
-
-    /* Slice header backup, for SEI_DEC_REF_PIC_MARKING */
-    int b_sh_backup;
-    x264_slice_header_t sh_backup;
-
-    /* cabac context */
-    x264_cabac_t    cabac;
-
-    struct
-    {
-        /* Frames to be encoded (whose types have been decided) */
-        x264_frame_t **current;
-        /* Unused frames: 0 = fenc, 1 = fdec */
-        x264_frame_t **unused[2];
-
-        /* Unused blank frames (for duplicates) */
-        x264_frame_t **blank_unused;
-
-        /* frames used for reference + sentinels */
-        x264_frame_t *reference[X264_REF_MAX+2];
-
-        int i_last_keyframe;       /* Frame number of the last keyframe */
-        int i_last_idr;            /* Frame number of the last IDR (not RP)*/
-        int i_poc_last_open_gop;   /* Poc of the I frame of the last open-gop. The value
-                                    * is only assigned during the period between that
-                                    * I frame and the next P or I frame, else -1 */
-
-        int i_input;    /* Number of input frames already accepted */
-
-        int i_max_dpb;  /* Number of frames allocated in the decoded picture buffer */
-        int i_max_ref0;
-        int i_max_ref1;
-        int i_delay;    /* Number of frames buffered for B reordering */
-        int     i_bframe_delay;
-        int64_t i_bframe_delay_time;
-        int64_t i_first_pts;
-        int64_t i_prev_reordered_pts[2];
-        int64_t i_largest_pts;
-        int64_t i_second_largest_pts;
-        int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
-        int b_have_sub8x8_esa;
-    } frames;
-
-    /* current frame being encoded */
-    x264_frame_t    *fenc;
-
-    /* frame being reconstructed */
-    x264_frame_t    *fdec;
-
-    /* references lists */
-    int             i_ref[2];
-    x264_frame_t    *fref[2][X264_REF_MAX+3];
-    x264_frame_t    *fref_nearest[2];
-    int             b_ref_reorder[2];
-
-    /* hrd */
-    int initial_cpb_removal_delay;
-    int initial_cpb_removal_delay_offset;
-    int64_t i_reordered_pts_delay;
-
-    /* Current MB DCT coeffs */
-    struct
-    {
-        ALIGNED_N( dctcoef luma16x16_dc[3][16] );
-        ALIGNED_16( dctcoef chroma_dc[2][8] );
-        // FIXME share memory?
-        ALIGNED_N( dctcoef luma8x8[12][64] );
-        ALIGNED_N( dctcoef luma4x4[16*3][16] );
-    } dct;
-
-    /* MB table and cache for current frame/mb */
-    struct
-    {
-        int     i_mb_width;
-        int     i_mb_height;
-        int     i_mb_count;                 /* number of mbs in a frame */
-
-        /* Chroma subsampling */
-        int     chroma_h_shift;
-        int     chroma_v_shift;
-
-        /* Strides */
-        int     i_mb_stride;
-        int     i_b8_stride;
-        int     i_b4_stride;
-        int     left_b8[2];
-        int     left_b4[2];
-
-        /* Current index */
-        int     i_mb_x;
-        int     i_mb_y;
-        int     i_mb_xy;
-        int     i_b8_xy;
-        int     i_b4_xy;
-
-        /* Search parameters */
-        int     i_me_method;
-        int     i_subpel_refine;
-        int     b_chroma_me;
-        int     b_trellis;
-        int     b_noise_reduction;
-        int     b_dct_decimate;
-        int     i_psy_rd; /* Psy RD strength--fixed point value*/
-        int     i_psy_trellis; /* Psy trellis strength--fixed point value*/
-
-        int     b_interlaced;
-        int     b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */
-
-        /* Allowed qpel MV range to stay within the picture + emulated edge pixels */
-        int     mv_min[2];
-        int     mv_max[2];
-        int     mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
-        int     mv_maxy_row[3];
-        /* Subpel MV range for motion search.
-         * same mv_min/max but includes levels' i_mv_range. */
-        int     mv_min_spel[2];
-        int     mv_max_spel[2];
-        int     mv_miny_spel_row[3];
-        int     mv_maxy_spel_row[3];
-        /* Fullpel MV range for motion search */
-        ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
-        int     mv_miny_fpel_row[3];
-        int     mv_maxy_fpel_row[3];
-
-        /* neighboring MBs */
-        unsigned int i_neighbour;
-        unsigned int i_neighbour8[4];       /* neighbours of each 8x8 or 4x4 block that are available */
-        unsigned int i_neighbour4[16];      /* at the time the block is coded */
-        unsigned int i_neighbour_intra;     /* for constrained intra pred */
-        unsigned int i_neighbour_frame;     /* ignoring slice boundaries */
-        int     i_mb_type_top;
-        int     i_mb_type_left[2];
-        int     i_mb_type_topleft;
-        int     i_mb_type_topright;
-        int     i_mb_prev_xy;
-        int     i_mb_left_xy[2];
-        int     i_mb_top_xy;
-        int     i_mb_topleft_xy;
-        int     i_mb_topright_xy;
-        int     i_mb_top_y;
-        int     i_mb_topleft_y;
-        int     i_mb_topright_y;
-        const x264_left_table_t *left_index_table;
-        int     i_mb_top_mbpair_xy;
-        int     topleft_partition;
-        int     b_allow_skip;
-        int     field_decoding_flag;
-
-        /**** thread synchronization ends here ****/
-        /* subsequent variables are either thread-local or constant,
-         * and won't be copied from one thread to another */
-
-        /* mb table */
-        uint8_t *base;                      /* base pointer for all malloced data in this mb */
-        int8_t  *type;                      /* mb type */
-        uint8_t *partition;                 /* mb partition */
-        int8_t  *qp;                        /* mb qp */
-        int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
-        int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
-                                            /* actually has only 7 entries; set to 8 for write-combining optimizations */
-        uint8_t (*non_zero_count)[16*3];    /* nzc. for I_PCM set to 16 */
-        int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
-        int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
-        uint8_t (*mvd[2])[8][2];            /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
-        int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
-        int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
-        int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
-        int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
-        uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
-                                             * NOTE: this will fail on resolutions above 2^16 MBs... */
-        uint8_t *field;
-
-         /* buffer for weighted versions of the reference frames */
-        pixel *p_weight_buf[X264_REF_MAX];
-
-        /* current value */
-        int     i_type;
-        int     i_partition;
-        ALIGNED_4( uint8_t i_sub_partition[4] );
-        int     b_transform_8x8;
-
-        int     i_cbp_luma;
-        int     i_cbp_chroma;
-
-        int     i_intra16x16_pred_mode;
-        int     i_chroma_pred_mode;
-
-        /* skip flags for i4x4 and i8x8
-         * 0 = encode as normal.
-         * 1 (non-RD only) = the DCT is still in h->dct, restore fdec and skip reconstruction.
-         * 2 (RD only) = the DCT has since been overwritten by RD; restore that too. */
-        int i_skip_intra;
-        /* skip flag for motion compensation */
-        /* if we've already done MC, we don't need to do it again */
-        int b_skip_mc;
-        /* set to true if we are re-encoding a macroblock. */
-        int b_reencode_mb;
-        int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
-        int b_deblock_rdo;
-        int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. */
-
-        struct
-        {
-            /* space for p_fenc and p_fdec */
-#define FENC_STRIDE 16
-#define FDEC_STRIDE 32
-            ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
-            ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
-
-            /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
-            ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
-            ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
-            ALIGNED_16( dctcoef i8x8_dct_buf[3][64] );
-            ALIGNED_16( dctcoef i4x4_dct_buf[15][16] );
-            uint32_t i4x4_nnz_buf[4];
-            uint32_t i8x8_nnz_buf[4];
-            int i4x4_cbp;
-            int i8x8_cbp;
-
-            /* Psy trellis DCT data */
-            ALIGNED_16( dctcoef fenc_dct8[4][64] );
-            ALIGNED_16( dctcoef fenc_dct4[16][16] );
-
-            /* Psy RD SATD/SA8D scores cache */
-            ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
-            ALIGNED_N( uint32_t fenc_satd_cache[32] );
-
-            /* pointer over mb of the frame to be compressed */
-            pixel *p_fenc[3]; /* y,u,v */
-            /* pointer to the actual source frame, not a block copy */
-            pixel *p_fenc_plane[3];
-
-            /* pointer over mb of the frame to be reconstructed  */
-            pixel *p_fdec[3];
-
-            /* pointer over mb of the references */
-            int i_fref[2];
-            /* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */
-            pixel *p_fref[2][X264_REF_MAX*2][12];
-            pixel *p_fref_w[X264_REF_MAX*2];  /* weighted fullpel luma */
-            uint16_t *p_integral[2][X264_REF_MAX];
-
-            /* fref stride */
-            int     i_stride[3];
-        } pic;
-
-        /* cache */
-        struct
-        {
-            /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
-            ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
-
-            /* i_non_zero_count if available else 0x80 */
-            ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );
-
-            /* -1 if unused, -2 if unavailable */
-            ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
-
-            /* 0 if not available */
-            ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] );
-            ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] );
-
-            /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
-            ALIGNED_4( int8_t skip[X264_SCAN8_LUMA_SIZE] );
-
-            ALIGNED_4( int16_t direct_mv[2][4][2] );
-            ALIGNED_4( int8_t  direct_ref[2][4] );
-            int     direct_partition;
-            ALIGNED_4( int16_t pskip_mv[2] );
-
-            /* number of neighbors (top and left) that used 8x8 dct */
-            int     i_neighbour_transform_size;
-            int     i_neighbour_skip;
-
-            /* neighbor CBPs */
-            int     i_cbp_top;
-            int     i_cbp_left;
-
-            /* extra data required for mbaff in mv prediction */
-            int16_t topright_mv[2][3][2];
-            int8_t  topright_ref[2][3];
-
-            /* current mb deblock strength */
-            uint8_t (*deblock_strength)[8][4];
-        } cache;
-
-        /* */
-        int     i_qp;       /* current qp */
-        int     i_chroma_qp;
-        int     i_last_qp;  /* last qp */
-        int     i_last_dqp; /* last delta qp */
-        int     b_variable_qp; /* whether qp is allowed to vary per macroblock */
-        int     b_lossless;
-        int     b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
-        int     b_direct_auto_write; /* analyse direct modes, to use and/or save */
-
-        /* lambda values */
-        int     i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
-        int     i_psy_rd_lambda;
-        int     i_chroma_lambda2_offset;
-
-        /* B_direct and weighted prediction */
-        int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
-        int16_t (*dist_scale_factor)[4];
-        int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
-        int8_t (*bipred_weight)[4];
-        /* maps fref1[0]'s ref indices into the current list0 */
-#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
-        int8_t  map_col_to_list0[X264_REF_MAX+2];
-        int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
-        int8_t deblock_ref_table[X264_REF_MAX*2+2];
-#define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2]
-    } mb;
-
-    /* rate control encoding only */
-    x264_ratecontrol_t *rc;
-
-    /* stats */
-    struct
-    {
-        /* Cumulated stats */
-
-        /* per slice info */
-        int     i_frame_count[3];
-        int64_t i_frame_size[3];
-        double  f_frame_qp[3];
-        int     i_consecutive_bframes[X264_BFRAME_MAX+1];
-        /* */
-        double  f_ssd_global[3];
-        double  f_psnr_average[3];
-        double  f_psnr_mean_y[3];
-        double  f_psnr_mean_u[3];
-        double  f_psnr_mean_v[3];
-        double  f_ssim_mean_y[3];
-        double  f_frame_duration[3];
-        /* */
-        int64_t i_mb_count[3][19];
-        int64_t i_mb_partition[2][17];
-        int64_t i_mb_count_8x8dct[2];
-        int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
-        int64_t i_mb_cbp[6];
-        int64_t i_mb_pred_mode[4][13];
-        int64_t i_mb_field[3];
-        /* */
-        int     i_direct_score[2];
-        int     i_direct_frames[2];
-        /* num p-frames weighted */
-        int     i_wpred[2];
-
-        /* Current frame stats */
-        x264_frame_stat_t frame;
-    } stat;
-
-    /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
-    udctcoef (*nr_offset)[64];
-    uint32_t (*nr_residual_sum)[64];
-    uint32_t *nr_count;
-
-    ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
-    ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
-    uint32_t nr_count_buf[2][4];
-
-    uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
-
-    /* Buffers that are allocated per-thread even in sliced threads. */
-    void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
-    void *scratch_buffer2; /* if the first one's already in use */
-    pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
-    /* Deblock strength values are stored for each 4x4 partition. In MBAFF
-     * there are four extra values that need to be stored, located in [4][i]. */
-    uint8_t (*deblock_strength[2])[2][8][4];
-
-    /* CPU functions dependents */
-    x264_predict_t      predict_16x16[4+3];
-    x264_predict8x8_t   predict_8x8[9+3];
-    x264_predict_t      predict_4x4[9+3];
-    x264_predict_t      predict_chroma[4+3];
-    x264_predict_t      predict_8x8c[4+3];
-    x264_predict_t      predict_8x16c[4+3];
-    x264_predict_8x8_filter_t predict_8x8_filter;
-
-    x264_pixel_function_t pixf;
-    x264_mc_functions_t   mc;
-    x264_dct_function_t   dctf;
-    x264_zigzag_function_t zigzagf;
-    x264_zigzag_function_t zigzagf_interlaced;
-    x264_zigzag_function_t zigzagf_progressive;
-    x264_quant_function_t quantf;
-    x264_deblock_function_t loopf;
-    x264_bitstream_function_t bsf;
-
-    x264_lookahead_t *lookahead;
-
-#if HAVE_OPENCL
-    x264_opencl_t opencl;
-#endif
-};
-
-typedef struct
-{
-    int sad;
-    int16_t mv[2];
-} mvsad_t;
-
-// included at the end because it needs x264_t
-#include "macroblock.h"
-
-static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
-{
-    int cnt = 0;
-    for( int i = 0; i < i_mvc; i++ )
-    {
-        int mx = (mvc[i][0] + 2) >> 2;
-        int my = (mvc[i][1] + 2) >> 2;
-        uint32_t mv = pack16to32_mask(mx, my);
-        if( !mv || mv == pmv ) continue;
-        dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] );
-        dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] );
-        cnt++;
-    }
-    return cnt;
-}
-
-static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
-{
-    int cnt = 0;
-    int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2};
-    for( int i = 0; i < i_mvc; i++ )
-    {
-        uint32_t mv = M32( mvc[i] );
-        int mx = mvc[i][0];
-        int my = mvc[i][1];
-        if( !mv || mv == pmv ) continue;
-        dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
-        dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
-        cnt++;
-    }
-    return cnt;
-}
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/util.h"
-#endif
-
-#include "rectangle.h"
-
-#endif
-
diff --git a/android/src/main/libenc/jni/libx264/common/cpu.c b/android/src/main/libenc/jni/libx264/common/cpu.c
deleted file mode 100755
index 135bb5e..0000000
--- a/android/src/main/libenc/jni/libx264/common/cpu.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*****************************************************************************
- * cpu.c: cpu detection
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#if HAVE_POSIXTHREAD && SYS_LINUX
-#include <sched.h>
-#endif
-#if SYS_BEOS
-#include <kernel/OS.h>
-#endif
-#if SYS_MACOSX || SYS_FREEBSD
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-#if SYS_OPENBSD
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-#endif
-
-const x264_cpu_name_t x264_cpu_names[] =
-{
-#if HAVE_MMX
-//  {"MMX",         X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
-//  {"CMOV",        X264_CPU_CMOV}, // we require this unconditionally, so don't print it
-#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
-    {"MMX2",        MMX2},
-    {"MMXEXT",      MMX2},
-    {"SSE",         MMX2|X264_CPU_SSE},
-#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
-    {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
-    {"SSE2",        SSE2},
-    {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
-    {"SSE3",        SSE2|X264_CPU_SSE3},
-    {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
-    {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
-    {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
-#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
-    {"AVX",         AVX},
-    {"XOP",         AVX|X264_CPU_XOP},
-    {"FMA4",        AVX|X264_CPU_FMA4},
-    {"FMA3",        AVX|X264_CPU_FMA3},
-    {"AVX2",        AVX|X264_CPU_FMA3|X264_CPU_AVX2},
-#undef AVX
-#undef SSE2
-#undef MMX2
-    {"Cache32",         X264_CPU_CACHELINE_32},
-    {"Cache64",         X264_CPU_CACHELINE_64},
-    {"LZCNT",           X264_CPU_LZCNT},
-    {"BMI1",            X264_CPU_BMI1},
-    {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
-    {"SlowCTZ",         X264_CPU_SLOW_CTZ},
-    {"SlowAtom",        X264_CPU_SLOW_ATOM},
-    {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
-    {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
-    {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
-    {"UnalignedStack",  X264_CPU_STACK_MOD4},
-#elif ARCH_PPC
-    {"Altivec",         X264_CPU_ALTIVEC},
-#elif ARCH_ARM
-    {"ARMv6",           X264_CPU_ARMV6},
-    {"NEON",            X264_CPU_NEON},
-    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
-#elif ARCH_AARCH64
-    {"ARMv8",           X264_CPU_ARMV8},
-    {"NEON",            X264_CPU_NEON},
-#elif ARCH_MIPS
-    {"MSA",             X264_CPU_MSA},
-#endif
-    {"", 0},
-};
-
-#if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON)
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
-    if( !canjump )
-    {
-        signal( sig, SIG_DFL );
-        raise( sig );
-    }
-
-    canjump = 0;
-    siglongjmp( jmpbuf, 1 );
-}
-#endif
-
-#if HAVE_MMX
-int x264_cpu_cpuid_test( void );
-void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
-void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
-
-uint32_t x264_cpu_detect( void )
-{
-    uint32_t cpu = 0;
-    uint32_t eax, ebx, ecx, edx;
-    uint32_t vendor[4] = {0};
-    uint32_t max_extended_cap, max_basic_cap;
-    int cache;
-
-#if !ARCH_X86_64
-    if( !x264_cpu_cpuid_test() )
-        return 0;
-#endif
-
-    x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
-    max_basic_cap = eax;
-    if( max_basic_cap == 0 )
-        return 0;
-
-    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-    if( edx&0x00800000 )
-        cpu |= X264_CPU_MMX;
-    else
-        return cpu;
-    if( edx&0x02000000 )
-        cpu |= X264_CPU_MMX2|X264_CPU_SSE;
-    if( edx&0x00008000 )
-        cpu |= X264_CPU_CMOV;
-    else
-        return cpu;
-    if( edx&0x04000000 )
-        cpu |= X264_CPU_SSE2;
-    if( ecx&0x00000001 )
-        cpu |= X264_CPU_SSE3;
-    if( ecx&0x00000200 )
-        cpu |= X264_CPU_SSSE3;
-    if( ecx&0x00080000 )
-        cpu |= X264_CPU_SSE4;
-    if( ecx&0x00100000 )
-        cpu |= X264_CPU_SSE42;
-    /* Check OXSAVE and AVX bits */
-    if( (ecx&0x18000000) == 0x18000000 )
-    {
-        /* Check for OS support */
-        x264_cpu_xgetbv( 0, &eax, &edx );
-        if( (eax&0x6) == 0x6 )
-        {
-            cpu |= X264_CPU_AVX;
-            if( ecx&0x00001000 )
-                cpu |= X264_CPU_FMA3;
-        }
-    }
-
-    if( max_basic_cap >= 7 )
-    {
-        x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
-        /* AVX2 requires OS support, but BMI1/2 don't. */
-        if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
-            cpu |= X264_CPU_AVX2;
-        if( ebx&0x00000008 )
-        {
-            cpu |= X264_CPU_BMI1;
-            if( ebx&0x00000100 )
-                cpu |= X264_CPU_BMI2;
-        }
-    }
-
-    if( cpu & X264_CPU_SSSE3 )
-        cpu |= X264_CPU_SSE2_IS_FAST;
-
-    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
-    max_extended_cap = eax;
-
-    if( max_extended_cap >= 0x80000001 )
-    {
-        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
-
-        if( ecx&0x00000020 )
-            cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
-        if( ecx&0x00000040 ) /* SSE4a, AMD only */
-        {
-            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
-            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
-            if( family == 0x14 )
-            {
-                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
-                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
-                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
-            }
-            if( family == 0x16 )
-            {
-                cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
-                                                * compared to alternate instruction sequences that this
-                                                * is equal or faster on almost all such functions. */
-            }
-        }
-
-        if( cpu & X264_CPU_AVX )
-        {
-            if( ecx&0x00000800 ) /* XOP */
-                cpu |= X264_CPU_XOP;
-            if( ecx&0x00010000 ) /* FMA4 */
-                cpu |= X264_CPU_FMA4;
-        }
-
-        if( !strcmp((char*)vendor, "AuthenticAMD") )
-        {
-            if( edx&0x00400000 )
-                cpu |= X264_CPU_MMX2;
-            if( !(cpu&X264_CPU_LZCNT) )
-                cpu |= X264_CPU_SLOW_CTZ;
-            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
-                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
-        }
-    }
-
-    if( !strcmp((char*)vendor, "GenuineIntel") )
-    {
-        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
-        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
-        if( family == 6 )
-        {
-            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
-             * theoretically support sse2, but it's significantly slower than mmx for
-             * almost all of x264's functions, so let's just pretend they don't. */
-            if( model == 9 || model == 13 || model == 14 )
-            {
-                cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
-                assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
-            }
-            /* Detect Atom CPU */
-            else if( model == 28 )
-            {
-                cpu |= X264_CPU_SLOW_ATOM;
-                cpu |= X264_CPU_SLOW_CTZ;
-                cpu |= X264_CPU_SLOW_PSHUFB;
-            }
-            /* Conroe has a slow shuffle unit. Check the model number to make sure not
-             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
-            else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
-                cpu |= X264_CPU_SLOW_SHUFFLE;
-        }
-    }
-
-    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
-    {
-        /* cacheline size is specified in 3 places, any of which may be missing */
-        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-        cache = (ebx&0xff00)>>5; // cflush size
-        if( !cache && max_extended_cap >= 0x80000006 )
-        {
-            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
-            cache = ecx&0xff; // cacheline size
-        }
-        if( !cache && max_basic_cap >= 2 )
-        {
-            // Cache and TLB Information
-            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
-            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
-                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
-            uint32_t buf[4];
-            int max, i = 0;
-            do {
-                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
-                max = buf[0]&0xff;
-                buf[0] &= ~0xff;
-                for( int j = 0; j < 4; j++ )
-                    if( !(buf[j]>>31) )
-                        while( buf[j] )
-                        {
-                            if( strchr( cache32_ids, buf[j]&0xff ) )
-                                cache = 32;
-                            if( strchr( cache64_ids, buf[j]&0xff ) )
-                                cache = 64;
-                            buf[j] >>= 8;
-                        }
-            } while( ++i < max );
-        }
-
-        if( cache == 32 )
-            cpu |= X264_CPU_CACHELINE_32;
-        else if( cache == 64 )
-            cpu |= X264_CPU_CACHELINE_64;
-        else
-            x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
-    }
-
-#if STACK_ALIGNMENT < 16
-    cpu |= X264_CPU_STACK_MOD4;
-#endif
-
-    return cpu;
-}
-
-#elif ARCH_PPC && HAVE_ALTIVEC
-
-#if SYS_MACOSX || SYS_OPENBSD || SYS_FREEBSD
-#include <sys/sysctl.h>
-uint32_t x264_cpu_detect( void )
-{
-    /* Thank you VLC */
-    uint32_t cpu = 0;
-#if SYS_OPENBSD
-    int      selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
-#elif SYS_MACOSX
-    int      selectors[2] = { CTL_HW, HW_VECTORUNIT };
-#endif
-    int      has_altivec = 0;
-    size_t   length = sizeof( has_altivec );
-#if SYS_MACOSX || SYS_OPENBSD
-    int      error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
-#else
-    int      error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 );
-#endif
-
-    if( error == 0 && has_altivec != 0 )
-        cpu |= X264_CPU_ALTIVEC;
-
-    return cpu;
-}
-
-#elif SYS_LINUX
-
-uint32_t x264_cpu_detect( void )
-{
-#ifdef __NO_FPRS__
-    return 0;
-#else
-    static void (*oldsig)( int );
-
-    oldsig = signal( SIGILL, sigill_handler );
-    if( sigsetjmp( jmpbuf, 1 ) )
-    {
-        signal( SIGILL, oldsig );
-        return 0;
-    }
-
-    canjump = 1;
-    asm volatile( "mtspr 256, %0\n\t"
-                  "vand 0, 0, 0\n\t"
-                  :
-                  : "r"(-1) );
-    canjump = 0;
-
-    signal( SIGILL, oldsig );
-
-    return X264_CPU_ALTIVEC;
-#endif
-}
-#endif
-
-#elif ARCH_ARM
-
-void x264_cpu_neon_test( void );
-int x264_cpu_fast_neon_mrc_test( void );
-
-uint32_t x264_cpu_detect( void )
-{
-    int flags = 0;
-#if HAVE_ARMV6
-    flags |= X264_CPU_ARMV6;
-
-    // don't do this hack if compiled with -mfpu=neon
-#if !HAVE_NEON
-    static void (* oldsig)( int );
-    oldsig = signal( SIGILL, sigill_handler );
-    if( sigsetjmp( jmpbuf, 1 ) )
-    {
-        signal( SIGILL, oldsig );
-        return flags;
-    }
-
-    canjump = 1;
-    x264_cpu_neon_test();
-    canjump = 0;
-    signal( SIGILL, oldsig );
-#endif
-
-    flags |= X264_CPU_NEON;
-
-    // fast neon -> arm (Cortex-A9) detection relies on user access to the
-    // cycle counter; this assumes ARMv7 performance counters.
-    // NEON requires at least ARMv7, ARMv8 may require changes here, but
-    // hopefully this hacky detection method will have been replaced by then.
-    // Note that there is potential for a race condition if another program or
-    // x264 instance disables or reinits the counters while x264 is using them,
-    // which may result in incorrect detection and the counters stuck enabled.
-    // right now Apple does not seem to support performance counters for this test
-#ifndef __MACH__
-    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
-#endif
-    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
-#endif
-    return flags;
-}
-
-#elif ARCH_AARCH64
-
-uint32_t x264_cpu_detect( void )
-{
-    return X264_CPU_ARMV8 | X264_CPU_NEON;
-}
-
-#elif ARCH_MIPS
-
-uint32_t x264_cpu_detect( void )
-{
-    uint32_t flags = 0;
-#if HAVE_MSA
-    flags |= X264_CPU_MSA;
-#endif
-    return flags;
-}
-
-#else
-
-uint32_t x264_cpu_detect( void )
-{
-    return 0;
-}
-
-#endif
-
-int x264_cpu_num_processors( void )
-{
-#if !HAVE_THREAD
-    return 1;
-
-#elif SYS_WINDOWS
-    return x264_pthread_num_processors_np();
-
-#elif SYS_CYGWIN || SYS_SunOS
-    return sysconf( _SC_NPROCESSORS_ONLN );
-
-#elif SYS_LINUX
-#ifdef __ANDROID__
-    // Android NDK does not expose sched_getaffinity
-    return sysconf( _SC_NPROCESSORS_CONF );
-#else
-    cpu_set_t p_aff;
-    memset( &p_aff, 0, sizeof(p_aff) );
-    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
-        return 1;
-#if HAVE_CPU_COUNT
-    return CPU_COUNT(&p_aff);
-#else
-    int np = 0;
-    for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ )
-        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
-    return np;
-#endif
-#endif
-
-#elif SYS_BEOS
-    system_info info;
-    get_system_info( &info );
-    return info.cpu_count;
-
-#elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD
-    int ncpu;
-    size_t length = sizeof( ncpu );
-#if SYS_OPENBSD
-    int mib[2] = { CTL_HW, HW_NCPU };
-    if( sysctl(mib, 2, &ncpu, &length, NULL, 0) )
-#else
-    if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) )
-#endif
-    {
-        ncpu = 1;
-    }
-    return ncpu;
-
-#else
-    return 1;
-#endif
-}
diff --git a/android/src/main/libenc/jni/libx264/common/cpu.h b/android/src/main/libenc/jni/libx264/common/cpu.h
deleted file mode 100755
index 1381419..0000000
--- a/android/src/main/libenc/jni/libx264/common/cpu.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*****************************************************************************
- * cpu.h: cpu detection
- *****************************************************************************
- * Copyright (C) 2004-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_CPU_H
-#define X264_CPU_H
-
-uint32_t x264_cpu_detect( void );
-int      x264_cpu_num_processors( void );
-void     x264_cpu_emms( void );
-void     x264_cpu_sfence( void );
-#if HAVE_MMX
-/* There is no way to forbid the compiler from using float instructions
- * before the emms so miscompilation could theoretically occur in the
- * unlikely event that the compiler reorders emms and float instructions. */
-#if HAVE_X86_INLINE_ASM
-/* Clobbering memory makes the compiler less likely to reorder code. */
-#define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \
-                                  "st(3)","st(4)","st(5)","st(6)","st(7)" )
-#else
-#define x264_emms() x264_cpu_emms()
-#endif
-#else
-#define x264_emms()
-#endif
-#define x264_sfence x264_cpu_sfence
-
-/* kludge:
- * gcc can't give variables any greater alignment than the stack frame has.
- * We need 32 byte alignment for AVX2, so here we make sure that the stack is
- * aligned to 32 bytes.
- * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
- * problem, but I don't want to require such a new version.
- * aligning to 32 bytes only works if the compiler supports keeping that
- * alignment between functions (osdep.h handles manual alignment of arrays
- * if it doesn't).
- */
-#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
-intptr_t x264_stack_align( void (*func)(), ... );
-#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
-#else
-#define x264_stack_align(func,...) func(__VA_ARGS__)
-#endif
-
-typedef struct
-{
-    const char name[16];
-    uint32_t flags;
-} x264_cpu_name_t;
-extern const x264_cpu_name_t x264_cpu_names[];
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/dct.c b/android/src/main/libenc/jni/libx264/common/dct.c
deleted file mode 100755
index 7dfeea2..0000000
--- a/android/src/main/libenc/jni/libx264/common/dct.c
+++ /dev/null
@@ -1,1109 +0,0 @@
-/*****************************************************************************
- * dct.c: transform and zigzag
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-#if HAVE_MMX
-#   include "x86/dct.h"
-#endif
-#if ARCH_PPC
-#   include "ppc/dct.h"
-#endif
-#if ARCH_ARM
-#   include "arm/dct.h"
-#endif
-#if ARCH_AARCH64
-#   include "aarch64/dct.h"
-#endif
-#if ARCH_MIPS
-#   include "mips/dct.h"
-#endif
-
-/* the inverse of the scaling factors introduced by 8x8 fdct */
-/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
-#define W(i) (i==0 ? FIX8(1.0000) :\
-              i==1 ? FIX8(0.8859) :\
-              i==2 ? FIX8(1.6000) :\
-              i==3 ? FIX8(0.9415) :\
-              i==4 ? FIX8(1.2651) :\
-              i==5 ? FIX8(1.1910) :0)
-const uint32_t x264_dct8_weight_tab[64] = {
-    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
-    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
-
-    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
-    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
-};
-#undef W
-
-#define W(i) (i==0 ? FIX8(1.76777) :\
-              i==1 ? FIX8(1.11803) :\
-              i==2 ? FIX8(0.70711) :0)
-const uint32_t x264_dct4_weight_tab[16] = {
-    W(0), W(1), W(0), W(1),
-    W(1), W(2), W(1), W(2),
-    W(0), W(1), W(0), W(1),
-    W(1), W(2), W(1), W(2)
-};
-#undef W
-
-/* inverse squared */
-#define W(i) (i==0 ? FIX8(3.125) :\
-              i==1 ? FIX8(1.25) :\
-              i==2 ? FIX8(0.5) :0)
-const uint32_t x264_dct4_weight2_tab[16] = {
-    W(0), W(1), W(0), W(1),
-    W(1), W(2), W(1), W(2),
-    W(0), W(1), W(0), W(1),
-    W(1), W(2), W(1), W(2)
-};
-#undef W
-
-#define W(i) (i==0 ? FIX8(1.00000) :\
-              i==1 ? FIX8(0.78487) :\
-              i==2 ? FIX8(2.56132) :\
-              i==3 ? FIX8(0.88637) :\
-              i==4 ? FIX8(1.60040) :\
-              i==5 ? FIX8(1.41850) :0)
-const uint32_t x264_dct8_weight2_tab[64] = {
-    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
-    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
-
-    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
-    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
-    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
-};
-#undef W
-
-
-static void dct4x4dc( dctcoef d[16] )
-{
-    dctcoef tmp[16];
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s01 = d[i*4+0] + d[i*4+1];
-        int d01 = d[i*4+0] - d[i*4+1];
-        int s23 = d[i*4+2] + d[i*4+3];
-        int d23 = d[i*4+2] - d[i*4+3];
-
-        tmp[0*4+i] = s01 + s23;
-        tmp[1*4+i] = s01 - s23;
-        tmp[2*4+i] = d01 - d23;
-        tmp[3*4+i] = d01 + d23;
-    }
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s01 = tmp[i*4+0] + tmp[i*4+1];
-        int d01 = tmp[i*4+0] - tmp[i*4+1];
-        int s23 = tmp[i*4+2] + tmp[i*4+3];
-        int d23 = tmp[i*4+2] - tmp[i*4+3];
-
-        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
-        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
-        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
-        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
-    }
-}
-
-static void idct4x4dc( dctcoef d[16] )
-{
-    dctcoef tmp[16];
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s01 = d[i*4+0] + d[i*4+1];
-        int d01 = d[i*4+0] - d[i*4+1];
-        int s23 = d[i*4+2] + d[i*4+3];
-        int d23 = d[i*4+2] - d[i*4+3];
-
-        tmp[0*4+i] = s01 + s23;
-        tmp[1*4+i] = s01 - s23;
-        tmp[2*4+i] = d01 - d23;
-        tmp[3*4+i] = d01 + d23;
-    }
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s01 = tmp[i*4+0] + tmp[i*4+1];
-        int d01 = tmp[i*4+0] - tmp[i*4+1];
-        int s23 = tmp[i*4+2] + tmp[i*4+3];
-        int d23 = tmp[i*4+2] - tmp[i*4+3];
-
-        d[i*4+0] = s01 + s23;
-        d[i*4+1] = s01 - s23;
-        d[i*4+2] = d01 - d23;
-        d[i*4+3] = d01 + d23;
-    }
-}
-
-static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
-{
-    int a0 = dct4x4[0][0] + dct4x4[1][0];
-    int a1 = dct4x4[2][0] + dct4x4[3][0];
-    int a2 = dct4x4[4][0] + dct4x4[5][0];
-    int a3 = dct4x4[6][0] + dct4x4[7][0];
-    int a4 = dct4x4[0][0] - dct4x4[1][0];
-    int a5 = dct4x4[2][0] - dct4x4[3][0];
-    int a6 = dct4x4[4][0] - dct4x4[5][0];
-    int a7 = dct4x4[6][0] - dct4x4[7][0];
-    int b0 = a0 + a1;
-    int b1 = a2 + a3;
-    int b2 = a4 + a5;
-    int b3 = a6 + a7;
-    int b4 = a0 - a1;
-    int b5 = a2 - a3;
-    int b6 = a4 - a5;
-    int b7 = a6 - a7;
-    dct[0] = b0 + b1;
-    dct[1] = b2 + b3;
-    dct[2] = b0 - b1;
-    dct[3] = b2 - b3;
-    dct[4] = b4 - b5;
-    dct[5] = b6 - b7;
-    dct[6] = b4 + b5;
-    dct[7] = b6 + b7;
-    dct4x4[0][0] = 0;
-    dct4x4[1][0] = 0;
-    dct4x4[2][0] = 0;
-    dct4x4[3][0] = 0;
-    dct4x4[4][0] = 0;
-    dct4x4[5][0] = 0;
-    dct4x4[6][0] = 0;
-    dct4x4[7][0] = 0;
-}
-
-static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
-                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
-{
-    for( int y = 0; y < i_size; y++ )
-    {
-        for( int x = 0; x < i_size; x++ )
-            diff[x + y*i_size] = pix1[x] - pix2[x];
-        pix1 += i_pix1;
-        pix2 += i_pix2;
-    }
-}
-
-static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
-{
-    dctcoef d[16];
-    dctcoef tmp[16];
-
-    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s03 = d[i*4+0] + d[i*4+3];
-        int s12 = d[i*4+1] + d[i*4+2];
-        int d03 = d[i*4+0] - d[i*4+3];
-        int d12 = d[i*4+1] - d[i*4+2];
-
-        tmp[0*4+i] =   s03 +   s12;
-        tmp[1*4+i] = 2*d03 +   d12;
-        tmp[2*4+i] =   s03 -   s12;
-        tmp[3*4+i] =   d03 - 2*d12;
-    }
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s03 = tmp[i*4+0] + tmp[i*4+3];
-        int s12 = tmp[i*4+1] + tmp[i*4+2];
-        int d03 = tmp[i*4+0] - tmp[i*4+3];
-        int d12 = tmp[i*4+1] - tmp[i*4+2];
-
-        dct[i*4+0] =   s03 +   s12;
-        dct[i*4+1] = 2*d03 +   d12;
-        dct[i*4+2] =   s03 -   s12;
-        dct[i*4+3] =   d03 - 2*d12;
-    }
-}
-
-static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
-{
-    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
-    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
-    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
-    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
-}
-
-static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
-{
-    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
-    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
-    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
-    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
-}
-
-static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
-{
-    int sum = 0;
-    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
-        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
-             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
-    return sum;
-}
-
-static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
-{
-    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
-    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
-    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
-    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
-
-    /* 2x2 DC transform */
-    int d0 = dct[0] + dct[1];
-    int d1 = dct[2] + dct[3];
-    int d2 = dct[0] - dct[1];
-    int d3 = dct[2] - dct[3];
-    dct[0] = d0 + d1;
-    dct[1] = d0 - d1;
-    dct[2] = d2 + d3;
-    dct[3] = d2 - d3;
-}
-
-static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
-{
-    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
-    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
-    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
-    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
-    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
-    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
-    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
-    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
-
-    /* 2x4 DC transform */
-    int b0 = a0 + a1;
-    int b1 = a2 + a3;
-    int b2 = a4 + a5;
-    int b3 = a6 + a7;
-    int b4 = a0 - a1;
-    int b5 = a2 - a3;
-    int b6 = a4 - a5;
-    int b7 = a6 - a7;
-    a0 = b0 + b1;
-    a1 = b2 + b3;
-    a2 = b4 + b5;
-    a3 = b6 + b7;
-    a4 = b0 - b1;
-    a5 = b2 - b3;
-    a6 = b4 - b5;
-    a7 = b6 - b7;
-    dct[0] = a0 + a1;
-    dct[1] = a2 + a3;
-    dct[2] = a0 - a1;
-    dct[3] = a2 - a3;
-    dct[4] = a4 - a5;
-    dct[5] = a6 - a7;
-    dct[6] = a4 + a5;
-    dct[7] = a6 + a7;
-}
-
-static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
-{
-    dctcoef d[16];
-    dctcoef tmp[16];
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s02 =  dct[0*4+i]     +  dct[2*4+i];
-        int d02 =  dct[0*4+i]     -  dct[2*4+i];
-        int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
-        int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
-
-        tmp[i*4+0] = s02 + s13;
-        tmp[i*4+1] = d02 + d13;
-        tmp[i*4+2] = d02 - d13;
-        tmp[i*4+3] = s02 - s13;
-    }
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
-        int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
-        int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
-        int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
-
-        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
-        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
-        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
-        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
-    }
-
-
-    for( int y = 0; y < 4; y++ )
-    {
-        for( int x = 0; x < 4; x++ )
-            p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
-        p_dst += FDEC_STRIDE;
-    }
-}
-
-static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
-{
-    add4x4_idct( &p_dst[0],               dct[0] );
-    add4x4_idct( &p_dst[4],               dct[1] );
-    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
-    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
-}
-
-static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
-{
-    add8x8_idct( &p_dst[0],               &dct[0] );
-    add8x8_idct( &p_dst[8],               &dct[4] );
-    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
-    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
-}
-
-/****************************************************************************
- * 8x8 transform:
- ****************************************************************************/
-
-#define DCT8_1D {\
-    int s07 = SRC(0) + SRC(7);\
-    int s16 = SRC(1) + SRC(6);\
-    int s25 = SRC(2) + SRC(5);\
-    int s34 = SRC(3) + SRC(4);\
-    int a0 = s07 + s34;\
-    int a1 = s16 + s25;\
-    int a2 = s07 - s34;\
-    int a3 = s16 - s25;\
-    int d07 = SRC(0) - SRC(7);\
-    int d16 = SRC(1) - SRC(6);\
-    int d25 = SRC(2) - SRC(5);\
-    int d34 = SRC(3) - SRC(4);\
-    int a4 = d16 + d25 + (d07 + (d07>>1));\
-    int a5 = d07 - d34 - (d25 + (d25>>1));\
-    int a6 = d07 + d34 - (d16 + (d16>>1));\
-    int a7 = d16 - d25 + (d34 + (d34>>1));\
-    DST(0) =  a0 + a1     ;\
-    DST(1) =  a4 + (a7>>2);\
-    DST(2) =  a2 + (a3>>1);\
-    DST(3) =  a5 + (a6>>2);\
-    DST(4) =  a0 - a1     ;\
-    DST(5) =  a6 - (a5>>2);\
-    DST(6) = (a2>>1) - a3 ;\
-    DST(7) = (a4>>2) - a7 ;\
-}
-
-static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
-{
-    dctcoef tmp[64];
-
-    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
-
-#define SRC(x) tmp[x*8+i]
-#define DST(x) tmp[x*8+i]
-    for( int i = 0; i < 8; i++ )
-        DCT8_1D
-#undef SRC
-#undef DST
-
-#define SRC(x) tmp[i*8+x]
-#define DST(x) dct[x*8+i]
-    for( int i = 0; i < 8; i++ )
-        DCT8_1D
-#undef SRC
-#undef DST
-}
-
-static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
-{
-    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
-    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
-    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
-    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
-}
-
-#define IDCT8_1D {\
-    int a0 =  SRC(0) + SRC(4);\
-    int a2 =  SRC(0) - SRC(4);\
-    int a4 = (SRC(2)>>1) - SRC(6);\
-    int a6 = (SRC(6)>>1) + SRC(2);\
-    int b0 = a0 + a6;\
-    int b2 = a2 + a4;\
-    int b4 = a2 - a4;\
-    int b6 = a0 - a6;\
-    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
-    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
-    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
-    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
-    int b1 = (a7>>2) + a1;\
-    int b3 =  a3 + (a5>>2);\
-    int b5 = (a3>>2) - a5;\
-    int b7 =  a7 - (a1>>2);\
-    DST(0, b0 + b7);\
-    DST(1, b2 + b5);\
-    DST(2, b4 + b3);\
-    DST(3, b6 + b1);\
-    DST(4, b6 - b1);\
-    DST(5, b4 - b3);\
-    DST(6, b2 - b5);\
-    DST(7, b0 - b7);\
-}
-
-static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
-{
-    dct[0] += 32; // rounding for the >>6 at the end
-
-#define SRC(x)     dct[x*8+i]
-#define DST(x,rhs) dct[x*8+i] = (rhs)
-    for( int i = 0; i < 8; i++ )
-        IDCT8_1D
-#undef SRC
-#undef DST
-
-#define SRC(x)     dct[i*8+x]
-#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
-    for( int i = 0; i < 8; i++ )
-        IDCT8_1D
-#undef SRC
-#undef DST
-}
-
-static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
-{
-    add8x8_idct8( &dst[0],               dct[0] );
-    add8x8_idct8( &dst[8],               dct[1] );
-    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
-    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
-}
-
-static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
-{
-    dc = (dc + 32) >> 6;
-    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
-    {
-        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
-        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
-        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
-        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
-    }
-}
-
-static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
-{
-    add4x4_idct_dc( &p_dst[0],               dct[0] );
-    add4x4_idct_dc( &p_dst[4],               dct[1] );
-    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
-    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
-}
-
-static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
-{
-    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
-    {
-        add4x4_idct_dc( &p_dst[ 0], dct[0] );
-        add4x4_idct_dc( &p_dst[ 4], dct[1] );
-        add4x4_idct_dc( &p_dst[ 8], dct[2] );
-        add4x4_idct_dc( &p_dst[12], dct[3] );
-    }
-}
-
-
-/****************************************************************************
- * x264_dct_init:
- ****************************************************************************/
-void x264_dct_init( int cpu, x264_dct_function_t *dctf )
-{
-    dctf->sub4x4_dct    = sub4x4_dct;
-    dctf->add4x4_idct   = add4x4_idct;
-
-    dctf->sub8x8_dct    = sub8x8_dct;
-    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
-    dctf->add8x8_idct   = add8x8_idct;
-    dctf->add8x8_idct_dc = add8x8_idct_dc;
-
-    dctf->sub8x16_dct_dc = sub8x16_dct_dc;
-
-    dctf->sub16x16_dct  = sub16x16_dct;
-    dctf->add16x16_idct = add16x16_idct;
-    dctf->add16x16_idct_dc = add16x16_idct_dc;
-
-    dctf->sub8x8_dct8   = sub8x8_dct8;
-    dctf->add8x8_idct8  = add8x8_idct8;
-
-    dctf->sub16x16_dct8  = sub16x16_dct8;
-    dctf->add16x16_idct8 = add16x16_idct8;
-
-    dctf->dct4x4dc  = dct4x4dc;
-    dctf->idct4x4dc = idct4x4dc;
-
-    dctf->dct2x4dc = dct2x4dc;
-
-#if HIGH_BIT_DEPTH
-#if HAVE_MMX
-    if( cpu&X264_CPU_MMX )
-    {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
-    }
-    if( cpu&X264_CPU_SSE2 )
-    {
-        dctf->add4x4_idct     = x264_add4x4_idct_sse2;
-        dctf->dct4x4dc        = x264_dct4x4dc_sse2;
-        dctf->idct4x4dc       = x264_idct4x4dc_sse2;
-        dctf->dct2x4dc        = x264_dct2x4dc_sse2;
-        dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse2;
-        dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse2;
-        dctf->add8x8_idct     = x264_add8x8_idct_sse2;
-        dctf->add16x16_idct   = x264_add16x16_idct_sse2;
-        dctf->add8x8_idct8    = x264_add8x8_idct8_sse2;
-        dctf->add16x16_idct8    = x264_add16x16_idct8_sse2;
-        dctf->sub8x8_dct_dc   = x264_sub8x8_dct_dc_sse2;
-        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_sse2;
-        dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_sse2;
-        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
-    }
-    if( cpu&X264_CPU_SSE4 )
-    {
-        dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse4;
-        dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse4;
-    }
-    if( cpu&X264_CPU_AVX )
-    {
-        dctf->add4x4_idct     = x264_add4x4_idct_avx;
-        dctf->dct4x4dc        = x264_dct4x4dc_avx;
-        dctf->idct4x4dc       = x264_idct4x4dc_avx;
-        dctf->dct2x4dc        = x264_dct2x4dc_avx;
-        dctf->sub8x8_dct8     = x264_sub8x8_dct8_avx;
-        dctf->sub16x16_dct8   = x264_sub16x16_dct8_avx;
-        dctf->add8x8_idct     = x264_add8x8_idct_avx;
-        dctf->add16x16_idct   = x264_add16x16_idct_avx;
-        dctf->add8x8_idct8    = x264_add8x8_idct8_avx;
-        dctf->add16x16_idct8  = x264_add16x16_idct8_avx;
-        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_avx;
-        dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_avx;
-        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
-    }
-#endif // HAVE_MMX
-#else // !HIGH_BIT_DEPTH
-#if HAVE_MMX
-    if( cpu&X264_CPU_MMX )
-    {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
-        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
-        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
-        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
-
-#if !ARCH_X86_64
-        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
-        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
-        dctf->add16x16_idct = x264_add16x16_idct_mmx;
-
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
-        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
-        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
-#endif
-    }
-
-    if( cpu&X264_CPU_MMX2 )
-    {
-        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
-        dctf->dct2x4dc         = x264_dct2x4dc_mmx2;
-        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
-    }
-
-    if( cpu&X264_CPU_SSE2 )
-    {
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
-        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
-        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
-        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
-        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
-
-        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
-        {
-            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
-            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
-            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
-            dctf->add16x16_idct = x264_add16x16_idct_sse2;
-            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
-        }
-    }
-
-    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
-    {
-        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
-        if( !(cpu&X264_CPU_SLOW_ATOM) )
-        {
-            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
-            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
-            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
-            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
-            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
-            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
-            {
-                dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
-                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
-            }
-        }
-    }
-
-    if( cpu&X264_CPU_SSE4 )
-        dctf->add4x4_idct   = x264_add4x4_idct_sse4;
-
-    if( cpu&X264_CPU_AVX )
-    {
-        dctf->add4x4_idct      = x264_add4x4_idct_avx;
-        dctf->add8x8_idct      = x264_add8x8_idct_avx;
-        dctf->add16x16_idct    = x264_add16x16_idct_avx;
-        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
-        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
-        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
-        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
-        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
-        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
-    }
-
-    if( cpu&X264_CPU_XOP )
-    {
-        dctf->sub8x8_dct       = x264_sub8x8_dct_xop;
-        dctf->sub16x16_dct     = x264_sub16x16_dct_xop;
-    }
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
-        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
-        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
-        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
-#if ARCH_X86_64
-        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
-#endif
-    }
-#endif //HAVE_MMX
-
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-    {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
-
-        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
-        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
-        dctf->add16x16_idct = x264_add16x16_idct_altivec;
-
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
-
-        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
-        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
-    }
-#endif
-
-#if HAVE_ARMV6 || ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
-        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
-        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
-        dctf->dct4x4dc      = x264_dct4x4dc_neon;
-        dctf->idct4x4dc     = x264_idct4x4dc_neon;
-
-        dctf->add4x4_idct   = x264_add4x4_idct_neon;
-        dctf->add8x8_idct   = x264_add8x8_idct_neon;
-        dctf->add16x16_idct = x264_add16x16_idct_neon;
-
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
-
-        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
-        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
-        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
-    }
-#endif
-
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        dctf->sub4x4_dct       = x264_sub4x4_dct_msa;
-        dctf->sub8x8_dct       = x264_sub8x8_dct_msa;
-        dctf->sub16x16_dct     = x264_sub16x16_dct_msa;
-        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_msa;
-        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_msa;
-        dctf->dct4x4dc         = x264_dct4x4dc_msa;
-        dctf->idct4x4dc        = x264_idct4x4dc_msa;
-        dctf->add4x4_idct      = x264_add4x4_idct_msa;
-        dctf->add8x8_idct      = x264_add8x8_idct_msa;
-        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_msa;
-        dctf->add16x16_idct    = x264_add16x16_idct_msa;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
-        dctf->add8x8_idct8     = x264_add8x8_idct8_msa;
-        dctf->add16x16_idct8   = x264_add16x16_idct8_msa;
-    }
-#endif
-
-#endif // HIGH_BIT_DEPTH
-}
-
-
-#define ZIG(i,y,x) level[i] = dct[x*8+y];
-#define ZIGZAG8_FRAME\
-    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
-    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
-    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
-    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
-    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
-    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
-    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
-    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
-    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
-    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
-    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
-    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
-    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
-    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
-    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
-    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
-
-#define ZIGZAG8_FIELD\
-    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
-    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
-    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
-    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
-    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
-    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
-    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
-    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
-    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
-    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
-    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
-    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
-    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
-    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
-    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
-    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
-
-#define ZIGZAG4_FRAME\
-    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
-    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
-    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
-    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
-
-#define ZIGZAG4_FIELD\
-    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
-    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
-    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
-    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
-
-static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
-{
-    ZIGZAG8_FRAME
-}
-
-static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
-{
-    ZIGZAG8_FIELD
-}
-
-#undef ZIG
-#define ZIG(i,y,x) level[i] = dct[x*4+y];
-#define ZIGDC(i,y,x) ZIG(i,y,x)
-
-static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
-{
-    ZIGZAG4_FRAME
-}
-
-static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
-{
-    memcpy( level, dct, 2 * sizeof(dctcoef) );
-    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
-}
-
-#undef ZIG
-#define ZIG(i,y,x) {\
-    int oe = x+y*FENC_STRIDE;\
-    int od = x+y*FDEC_STRIDE;\
-    level[i] = p_src[oe] - p_dst[od];\
-    nz |= level[i];\
-}
-#define COPY4x4\
-    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
-    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
-    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
-    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
-#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
-#define COPY8x8\
-    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
-    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
-
-static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
-{
-    int nz = 0;
-    ZIGZAG4_FRAME
-    COPY4x4
-    return !!nz;
-}
-
-static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
-{
-    int nz = 0;
-    ZIGZAG4_FIELD
-    COPY4x4
-    return !!nz;
-}
-
-#undef ZIGDC
-#define ZIGDC(i,y,x) {\
-    int oe = x+y*FENC_STRIDE;\
-    int od = x+y*FDEC_STRIDE;\
-    *dc = p_src[oe] - p_dst[od];\
-    level[0] = 0;\
-}
-
-static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
-{
-    int nz = 0;
-    ZIGZAG4_FRAME
-    COPY4x4
-    return !!nz;
-}
-
-static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
-{
-    int nz = 0;
-    ZIGZAG4_FIELD
-    COPY4x4
-    return !!nz;
-}
-
-static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
-{
-    int nz = 0;
-    ZIGZAG8_FRAME
-    COPY8x8
-    return !!nz;
-}
-static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
-{
-    int nz = 0;
-    ZIGZAG8_FIELD
-    COPY8x8
-    return !!nz;
-}
-
-#undef ZIG
-#undef COPY4x4
-
-static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
-{
-    for( int i = 0; i < 4; i++ )
-    {
-        int nz = 0;
-        for( int j = 0; j < 16; j++ )
-        {
-            nz |= src[i+j*4];
-            dst[i*16+j] = src[i+j*4];
-        }
-        nnz[(i&1) + (i>>1)*8] = !!nz;
-    }
-}
-
-void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
-{
-    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
-    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
-    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
-    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
-    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
-    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
-    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
-    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
-    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
-    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
-
-#if HIGH_BIT_DEPTH
-#if HAVE_MMX
-    if( cpu&X264_CPU_SSE2 )
-    {
-        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
-    }
-    if( cpu&X264_CPU_SSE4 )
-        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
-    if( cpu&X264_CPU_AVX )
-        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
-#if ARCH_X86_64
-    if( cpu&X264_CPU_AVX )
-    {
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
-    }
-#endif // ARCH_X86_64
-#endif // HAVE_MMX
-#else
-#if HAVE_MMX
-    if( cpu&X264_CPU_MMX )
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
-    if( cpu&X264_CPU_MMX2 )
-    {
-        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
-    }
-    if( cpu&X264_CPU_SSE )
-        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse;
-    if( cpu&X264_CPU_SSE2_IS_FAST )
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
-    if( cpu&X264_CPU_SSSE3 )
-    {
-        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
-        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
-        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
-        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
-        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
-            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
-    }
-    if( cpu&X264_CPU_AVX )
-    {
-        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
-        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
-#if ARCH_X86_64
-        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
-        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
-#endif
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
-    }
-    if( cpu&X264_CPU_XOP )
-    {
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
-        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
-    }
-#endif // HAVE_MMX
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-    {
-        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
-        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
-    }
-#endif
-#if HAVE_ARMV6 || ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
-        pf_progressive->scan_4x4  = x264_zigzag_scan_4x4_frame_neon;
-#if ARCH_AARCH64
-        pf_interlaced->scan_4x4   = x264_zigzag_scan_4x4_field_neon;
-        pf_interlaced->scan_8x8   = x264_zigzag_scan_8x8_field_neon;
-        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_neon;
-        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_neon;
-        pf_interlaced->sub_8x8    = x264_zigzag_sub_8x8_field_neon;
-        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_neon;
-        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_neon;
-        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
-        pf_progressive->sub_8x8   = x264_zigzag_sub_8x8_frame_neon;
-#endif // ARCH_AARCH64
-    }
-#endif // HAVE_ARMV6 || ARCH_AARCH64
-#endif // HIGH_BIT_DEPTH
-
-    pf_interlaced->interleave_8x8_cavlc =
-    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
-#if HAVE_MMX
-#if HIGH_BIT_DEPTH
-    if( cpu&X264_CPU_SSE2 )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
-    }
-    if( cpu&X264_CPU_AVX )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
-    }
-#else
-    if( cpu&X264_CPU_MMX )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
-    }
-    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
-    }
-
-    if( cpu&X264_CPU_AVX )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
-    }
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
-    }
-#endif // HIGH_BIT_DEPTH
-#endif
-#if !HIGH_BIT_DEPTH
-#if ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
-        pf_interlaced->interleave_8x8_cavlc =
-        pf_progressive->interleave_8x8_cavlc =  x264_zigzag_interleave_8x8_cavlc_neon;
-    }
-#endif // ARCH_AARCH64
-#endif // !HIGH_BIT_DEPTH
-#if !HIGH_BIT_DEPTH
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        pf_progressive->scan_4x4  = x264_zigzag_scan_4x4_frame_msa;
-    }
-#endif
-#endif
-}
diff --git a/android/src/main/libenc/jni/libx264/common/dct.h b/android/src/main/libenc/jni/libx264/common/dct.h
deleted file mode 100755
index 91c343a..0000000
--- a/android/src/main/libenc/jni/libx264/common/dct.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*****************************************************************************
- * dct.h: transform and zigzag
- *****************************************************************************
- * Copyright (C) 2004-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_DCT_H
-#define X264_DCT_H
-
-extern const uint32_t x264_dct4_weight_tab[16];
-extern const uint32_t x264_dct8_weight_tab[64];
-extern const uint32_t x264_dct4_weight2_tab[16];
-extern const uint32_t x264_dct8_weight2_tab[64];
-
-typedef struct
-{
-    // pix1  stride = FENC_STRIDE
-    // pix2  stride = FDEC_STRIDE
-    // p_dst stride = FDEC_STRIDE
-    void (*sub4x4_dct)   ( dctcoef dct[16], pixel *pix1, pixel *pix2 );
-    void (*add4x4_idct)  ( pixel *p_dst, dctcoef dct[16] );
-
-    void (*sub8x8_dct)   ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 );
-    void (*sub8x8_dct_dc)( dctcoef dct[4], pixel *pix1, pixel *pix2 );
-    void (*add8x8_idct)  ( pixel *p_dst, dctcoef dct[4][16] );
-    void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] );
-
-    void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
-
-    void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
-    void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] );
-    void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] );
-
-    void (*sub8x8_dct8)  ( dctcoef dct[64], pixel *pix1, pixel *pix2 );
-    void (*add8x8_idct8) ( pixel *p_dst, dctcoef dct[64] );
-
-    void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
-    void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] );
-
-    void (*dct4x4dc) ( dctcoef d[16] );
-    void (*idct4x4dc)( dctcoef d[16] );
-
-    void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
-
-} x264_dct_function_t;
-
-typedef struct
-{
-    void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] );
-    void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] );
-    int  (*sub_8x8)  ( dctcoef level[64], const pixel *p_src, pixel *p_dst );
-    int  (*sub_4x4)  ( dctcoef level[16], const pixel *p_src, pixel *p_dst );
-    int  (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
-    void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-
-} x264_zigzag_function_t;
-
-void x264_dct_init( int cpu, x264_dct_function_t *dctf );
-void x264_dct_init_weights( void );
-void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/deblock.c b/android/src/main/libenc/jni/libx264/common/deblock.c
deleted file mode 100755
index 636bf3e..0000000
--- a/android/src/main/libenc/jni/libx264/common/deblock.c
+++ /dev/null
@@ -1,905 +0,0 @@
-/*****************************************************************************
- * deblock.c: deblocking
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-/* Deblocking filter */
-static const uint8_t i_alpha_table[52+12*3] =
-{
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
-     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
-    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
-    80, 90,101,113,127,144,162,182,203,226,
-   255,255,
-   255,255,255,255,255,255,255,255,255,255,255,255,
-};
-static const uint8_t i_beta_table[52+12*3] =
-{
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
-     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
-     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
-    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
-    18, 18,
-    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
-};
-static const int8_t i_tc0_table[52+12*3][4] =
-{
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
-    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
-    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
-    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
-    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
-    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
-    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
-    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
-};
-#define alpha_table(x) i_alpha_table[(x)+24]
-#define beta_table(x)  i_beta_table[(x)+24]
-#define tc0_table(x)   i_tc0_table[(x)+24]
-
-/* From ffmpeg */
-static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 )
-{
-    int p2 = pix[-3*xstride];
-    int p1 = pix[-2*xstride];
-    int p0 = pix[-1*xstride];
-    int q0 = pix[ 0*xstride];
-    int q1 = pix[ 1*xstride];
-    int q2 = pix[ 2*xstride];
-
-    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-    {
-        int tc = tc0;
-        int delta;
-        if( abs( p2 - p0 ) < beta )
-        {
-            if( tc0 )
-                pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0 );
-            tc++;
-        }
-        if( abs( q2 - q0 ) < beta )
-        {
-            if( tc0 )
-                pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0, tc0 );
-            tc++;
-        }
-
-        delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-        pix[-1*xstride] = x264_clip_pixel( p0 + delta );    /* p0' */
-        pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
-    }
-}
-static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
-{
-    for( int i = 0; i < 4; i++ )
-    {
-        if( tc0[i] < 0 )
-        {
-            pix += 4*ystride;
-            continue;
-        }
-        for( int d = 0; d < 4; d++, pix += ystride )
-            deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
-    }
-}
-static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    for( int d = 0; d < 8; d++, pix += stride )
-        deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
-}
-static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
-}
-static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
-}
-
-static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc )
-{
-    int p1 = pix[-2*xstride];
-    int p0 = pix[-1*xstride];
-    int q0 = pix[ 0*xstride];
-    int q1 = pix[ 1*xstride];
-
-    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-    {
-        int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-        pix[-1*xstride] = x264_clip_pixel( p0 + delta );    /* p0' */
-        pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
-    }
-}
-static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
-{
-    for( int i = 0; i < 4; i++ )
-    {
-        int tc = tc0[i];
-        if( tc <= 0 )
-        {
-            pix += height*ystride;
-            continue;
-        }
-        for( int d = 0; d < height; d++, pix += ystride-2 )
-            for( int e = 0; e < 2; e++, pix++ )
-                deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
-    }
-}
-static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 );
-}
-static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
-}
-static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
-}
-static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 );
-}
-
-static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
-{
-    int p2 = pix[-3*xstride];
-    int p1 = pix[-2*xstride];
-    int p0 = pix[-1*xstride];
-    int q0 = pix[ 0*xstride];
-    int q1 = pix[ 1*xstride];
-    int q2 = pix[ 2*xstride];
-
-    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-    {
-        if( abs( p0 - q0 ) < ((alpha >> 2) + 2) )
-        {
-            if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
-            {
-                const int p3 = pix[-4*xstride];
-                pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-                pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-                pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-            }
-            else /* p0' */
-                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-            if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
-            {
-                const int q3 = pix[3*xstride];
-                pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
-                pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
-                pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
-            }
-            else /* q0' */
-                pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-        }
-        else /* p0', q0' */
-        {
-            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-            pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-        }
-    }
-}
-static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta )
-{
-    for( int d = 0; d < 16; d++, pix += ystride )
-        deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
-}
-static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta )
-{
-    for( int d = 0; d < 8; d++, pix += ystride )
-        deblock_edge_luma_intra_c( pix, 1, alpha, beta );
-}
-static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
-{
-    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
-}
-static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
-{
-    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
-}
-
-static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
-{
-    int p1 = pix[-2*xstride];
-    int p0 = pix[-1*xstride];
-    int q0 = pix[ 0*xstride];
-    int q1 = pix[ 1*xstride];
-
-    if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-    {
-        pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
-        pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
-    }
-}
-static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta )
-{
-    for( int d = 0; d < height; d++, pix += ystride-2 )
-        for( int e = 0; e < width; e++, pix++ )
-            deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
-}
-static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta )
-{
-    deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta );
-}
-static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
-{
-    deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
-}
-static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
-{
-    deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
-}
-static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
-{
-    deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
-}
-
-static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
-                                int bframe )
-{
-    for( int dir = 0; dir < 2; dir++ )
-    {
-        int s1 = dir ? 1 : 8;
-        int s2 = dir ? 8 : 1;
-        for( int edge = 0; edge < 4; edge++ )
-            for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
-            {
-                int locn = loc - s2;
-                if( nnz[loc] || nnz[locn] )
-                    bs[dir][edge][i] = 2;
-                else if( ref[0][loc] != ref[0][locn] ||
-                         abs( mv[0][loc][0] - mv[0][locn][0] ) >= 4 ||
-                         abs( mv[0][loc][1] - mv[0][locn][1] ) >= mvy_limit ||
-                        (bframe && (ref[1][loc] != ref[1][locn] ||
-                         abs( mv[1][loc][0] - mv[1][locn][0] ) >= 4 ||
-                         abs( mv[1][loc][1] - mv[1][locn][1] ) >= mvy_limit )))
-                {
-                    bs[dir][edge][i] = 1;
-                }
-                else
-                    bs[dir][edge][i] = 0;
-            }
-    }
-}
-
-static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
-                                        int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
-{
-    int index_a = i_qp + a;
-    int index_b = i_qp + b;
-    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
-    int beta  = beta_table(index_b) << (BIT_DEPTH-8);
-    int8_t tc[4];
-
-    if( !M32(bS) || !alpha || !beta )
-        return;
-
-    tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma;
-    tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma;
-    tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
-    tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;
-
-    pf_inter( pix, i_stride, alpha, beta, tc );
-}
-
-static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
-                                              int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
-{
-    int index_a = i_qp + a;
-    int index_b = i_qp + b;
-    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
-    int beta  = beta_table(index_b) << (BIT_DEPTH-8);
-
-    if( !alpha || !beta )
-        return;
-
-    pf_intra( pix, i_stride, alpha, beta );
-}
-
-static ALWAYS_INLINE void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
-{
-    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
-
-    h->mb.i_neighbour = 0;
-    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
-    h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
-    h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
-    h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
-    h->mb.i_mb_left_xy[1] =
-    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
-    if( SLICE_MBAFF )
-    {
-        if( mb_y&1 )
-        {
-            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
-                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
-        }
-        else
-        {
-            if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
-            {
-                h->mb.i_mb_top_xy += h->mb.i_mb_stride;
-                h->mb.i_mb_top_y++;
-            }
-            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
-                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
-        }
-    }
-
-    if( mb_x > 0 && (deblock_on_slice_edges ||
-        h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
-        h->mb.i_neighbour |= MB_LEFT;
-    if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
-        || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
-        h->mb.i_neighbour |= MB_TOP;
-}
-
-void x264_frame_deblock_row( x264_t *h, int mb_y )
-{
-    int b_interlaced = SLICE_MBAFF;
-    int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
-    int b = h->sh.i_beta_offset - QP_BD_OFFSET;
-    int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
-    int stridey   = h->fdec->i_stride[0];
-    int strideuv  = h->fdec->i_stride[1];
-    int chroma444 = CHROMA444;
-    int chroma_height = 16 >> CHROMA_V_SHIFT;
-    intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
-
-    for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
-    {
-        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
-        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
-
-        int mb_xy = h->mb.i_mb_xy;
-        int transform_8x8 = h->mb.mb_transform_size[mb_xy];
-        int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
-        uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x];
-
-        pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
-        pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
-
-        if( mb_y & MB_INTERLACED )
-        {
-            pixy -= 15*stridey;
-            pixuv -= (chroma_height-1)*strideuv;
-        }
-
-        int stride2y  = stridey << MB_INTERLACED;
-        int stride2uv = strideuv << MB_INTERLACED;
-        int qp = h->mb.qp[mb_xy];
-        int qpc = h->chroma_qp_table[qp];
-        int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh;
-
-        #define FILTER( intra, dir, edge, qp, chroma_qp )\
-        do\
-        {\
-            if( !(edge & 1) || !transform_8x8 )\
-            {\
-                deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
-                                     stride2y, bs[dir][edge], qp, a, b, 0,\
-                                     h->loopf.deblock_luma##intra[dir] );\
-                if( CHROMA_FORMAT == CHROMA_444 )\
-                {\
-                    deblock_edge##intra( h, pixuv          + 4*edge*(dir?stride2uv:1),\
-                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
-                                         h->loopf.deblock_luma##intra[dir] );\
-                    deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
-                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
-                                         h->loopf.deblock_luma##intra[dir] );\
-                }\
-                else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\
-                {\
-                    deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
-                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
-                                         h->loopf.deblock_chroma##intra[dir] );\
-                }\
-            }\
-            if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\
-            {\
-                deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
-                                     h->loopf.deblock_chroma##intra[dir] );\
-            }\
-        } while(0)
-
-        if( h->mb.i_neighbour & MB_LEFT )
-        {
-            if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
-            {
-                int luma_qp[2];
-                int chroma_qp[2];
-                int left_qp[2];
-                x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
-                x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
-                x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
-                x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
-                int c = chroma444 ? 0 : 1;
-
-                left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
-                luma_qp[0] = (qp + left_qp[0] + 1) >> 1;
-                chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
-                if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) )
-                {
-                    deblock_edge_intra( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   a, b, 0, luma_intra_deblock );
-                    deblock_edge_intra( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
-                    if( chroma444 )
-                        deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
-                }
-                else
-                {
-                    deblock_edge( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   a, b, 0, luma_deblock );
-                    deblock_edge( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
-                    if( chroma444 )
-                        deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
-                }
-
-                int offy = MB_INTERLACED ? 4 : 0;
-                int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0;
-                left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
-                luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
-                chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
-                if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) )
-                {
-                    deblock_edge_intra( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   a, b, 0, luma_intra_deblock );
-                    deblock_edge_intra( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
-                    if( chroma444 )
-                        deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
-                }
-                else
-                {
-                    deblock_edge( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   a, b, 0, luma_deblock );
-                    deblock_edge( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
-                    if( chroma444 )
-                        deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
-                }
-            }
-            else
-            {
-                int qpl = h->mb.qp[h->mb.i_mb_xy-1];
-                int qp_left = (qp + qpl + 1) >> 1;
-                int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1;
-                int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );
-                int intra_deblock = intra_cur || intra_left;
-
-                /* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP.
-                 * But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there.
-                 * So reset their effective QP to max, to indicate that lack of guarantee. */
-                if( h->fdec->mb_info && M32( bs[0][0] ) )
-                {
-#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT);
-                    RESET_EFFECTIVE_QP(mb_xy);
-                    RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]);
-                }
-
-                if( intra_deblock )
-                    FILTER( _intra, 0, 0, qp_left, qpc_left );
-                else
-                    FILTER(       , 0, 0, qp_left, qpc_left );
-            }
-        }
-        if( !first_edge_only )
-        {
-            FILTER( , 0, 1, qp, qpc );
-            FILTER( , 0, 2, qp, qpc );
-            FILTER( , 0, 3, qp, qpc );
-        }
-
-        if( h->mb.i_neighbour & MB_TOP )
-        {
-            if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
-            {
-                int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
-
-                for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
-                {
-                    int qpt = h->mb.qp[mbn_xy];
-                    int qp_top = (qp + qpt + 1) >> 1;
-                    int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
-                    int intra_top = IS_INTRA( h->mb.type[mbn_xy] );
-                    if( intra_cur || intra_top )
-                        M32( bs[1][4*j] ) = 0x03030303;
-
-                    // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
-                    deblock_edge( h, pixy      + j*stridey,  2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] );
-                    if( chroma444 )
-                    {
-                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
-                        deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
-                    }
-                    else
-                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] );
-                }
-            }
-            else
-            {
-                int qpt = h->mb.qp[h->mb.i_mb_top_xy];
-                int qp_top = (qp + qpt + 1) >> 1;
-                int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
-                int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
-                int intra_deblock = intra_cur || intra_top;
-
-                /* This edge has been modified, reset effective qp to max. */
-                if( h->fdec->mb_info && M32( bs[1][0] ) )
-                {
-                    RESET_EFFECTIVE_QP(mb_xy);
-                    RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy);
-                }
-
-                if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) && intra_deblock )
-                {
-                    FILTER( _intra, 1, 0, qp_top, qpc_top );
-                }
-                else
-                {
-                    if( intra_deblock )
-                        M32( bs[1][0] ) = 0x03030303;
-                    FILTER(       , 1, 0, qp_top, qpc_top );
-                }
-            }
-        }
-
-        if( !first_edge_only )
-        {
-            FILTER( , 1, 1, qp, qpc );
-            FILTER( , 1, 2, qp, qpc );
-            FILTER( , 1, 3, qp, qpc );
-        }
-
-        #undef FILTER
-    }
-}
-
-/* For deblock-aware RD.
- * TODO:
- *  deblock macroblock edges
- *  support analysis partitions smaller than 16x16
- *  deblock chroma for 4:2:0/4:2:2
- *  handle duplicate refs correctly
- */
-void x264_macroblock_deblock( x264_t *h )
-{
-    int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
-    int b = h->sh.i_beta_offset - QP_BD_OFFSET;
-    int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
-    int intra_cur = IS_INTRA( h->mb.i_type );
-    int qp = h->mb.i_qp;
-    int qpc = h->mb.i_chroma_qp;
-    if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh )
-        return;
-
-    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
-    if( intra_cur )
-    {
-        M32( bs[0][1] ) = 0x03030303;
-        M64( bs[0][2] ) = 0x0303030303030303ULL;
-        M32( bs[1][1] ) = 0x03030303;
-        M64( bs[1][2] ) = 0x0303030303030303ULL;
-    }
-    else
-        h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
-                                   bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );
-
-    int transform_8x8 = h->mb.b_transform_8x8;
-
-    #define FILTER( dir, edge )\
-    do\
-    {\
-        deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\
-                      FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\
-                      h->loopf.deblock_luma[dir] );\
-        if( CHROMA444 )\
-        {\
-            deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\
-                          FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
-                          h->loopf.deblock_luma[dir] );\
-            deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\
-                          FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
-                          h->loopf.deblock_luma[dir] );\
-        }\
-    } while(0)
-
-    if( !transform_8x8 ) FILTER( 0, 1 );
-                         FILTER( 0, 2 );
-    if( !transform_8x8 ) FILTER( 0, 3 );
-
-    if( !transform_8x8 ) FILTER( 1, 1 );
-                         FILTER( 1, 2 );
-    if( !transform_8x8 ) FILTER( 1, 3 );
-
-    #undef FILTER
-}
-
-#if HAVE_MMX
-void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                  int mvy_limit, int bframe );
-void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                  int mvy_limit, int bframe );
-void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                  int mvy_limit, int bframe );
-void x264_deblock_strength_avx  ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                  int mvy_limit, int bframe );
-void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                  int mvy_limit, int bframe );
-
-void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-#if ARCH_X86
-void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-
-#if HIGH_BIT_DEPTH
-void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
-#else
-// FIXME this wrapper has a significant cpu cost
-static void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    x264_deblock_v8_luma_mmx2( pix,   stride, alpha, beta, tc0   );
-    x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 );
-}
-static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta )
-{
-    x264_deblock_v8_luma_intra_mmx2( pix,   stride, alpha, beta );
-    x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta );
-}
-#endif // HIGH_BIT_DEPTH
-#endif
-#endif
-
-#if ARCH_PPC
-void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-#endif // ARCH_PPC
-
-#if HAVE_ARMV6 || ARCH_AARCH64
-void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                 int mvy_limit, int bframe );
-void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
-#endif
-
-#if !HIGH_BIT_DEPTH
-#if HAVE_MSA
-void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
-                                int bframe );
-#endif
-#endif
-
-void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
-{
-    pf->deblock_luma[1] = deblock_v_luma_c;
-    pf->deblock_luma[0] = deblock_h_luma_c;
-    pf->deblock_chroma[1] = deblock_v_chroma_c;
-    pf->deblock_h_chroma_420 = deblock_h_chroma_c;
-    pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
-    pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
-    pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
-    pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
-    pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
-    pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
-    pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
-    pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
-    pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
-    pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
-    pf->deblock_strength = deblock_strength_c;
-
-#if HAVE_MMX
-    if( cpu&X264_CPU_MMX2 )
-    {
-#if ARCH_X86
-        pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
-        pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
-        pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
-        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
-        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
-        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
-        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
-        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
-        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
-        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
-        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
-        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
-#endif
-#if !HIGH_BIT_DEPTH
-        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
-#endif
-        pf->deblock_strength = x264_deblock_strength_mmx2;
-        if( cpu&X264_CPU_SSE2 )
-        {
-            pf->deblock_strength = x264_deblock_strength_sse2;
-            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
-            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
-            pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
-            pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
-            pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
-            pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
-            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
-            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
-            if( !(cpu&X264_CPU_STACK_MOD4) )
-            {
-                pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
-                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
-                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
-#if HIGH_BIT_DEPTH
-                pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2;
-#endif
-            }
-        }
-        if( cpu&X264_CPU_SSSE3 )
-            pf->deblock_strength = x264_deblock_strength_ssse3;
-        if( cpu&X264_CPU_AVX )
-        {
-            pf->deblock_strength = x264_deblock_strength_avx;
-            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
-            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
-            pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
-            pf->deblock_luma[1] = x264_deblock_v_luma_avx;
-            pf->deblock_luma[0] = x264_deblock_h_luma_avx;
-            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
-            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
-            if( !(cpu&X264_CPU_STACK_MOD4) )
-            {
-                pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
-                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
-                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
-#if HIGH_BIT_DEPTH
-                pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
-                pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
-#endif
-            }
-        }
-        if( cpu&X264_CPU_AVX2 )
-        {
-            pf->deblock_strength = x264_deblock_strength_avx2;
-        }
-    }
-#endif
-
-#if !HIGH_BIT_DEPTH
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-    {
-        pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
-        pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
-    }
-#endif // HAVE_ALTIVEC
-
-#if HAVE_ARMV6 || ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
-        pf->deblock_luma[1] = x264_deblock_v_luma_neon;
-        pf->deblock_luma[0] = x264_deblock_h_luma_neon;
-        pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
-        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
-        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
-        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
-        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
-        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
-        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
-        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
-        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
-        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
-        pf->deblock_strength     = x264_deblock_strength_neon;
-    }
-#endif
-
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        pf->deblock_luma[1] = x264_deblock_v_luma_msa;
-        pf->deblock_luma[0] = x264_deblock_h_luma_msa;
-        pf->deblock_chroma[1] = x264_deblock_v_chroma_msa;
-        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa;
-        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa;
-        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa;
-        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa;
-        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa;
-        pf->deblock_strength = x264_deblock_strength_msa;
-    }
-#endif
-#endif // !HIGH_BIT_DEPTH
-
-    /* These functions are equivalent, so don't duplicate them. */
-    pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
-    pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/frame.c b/android/src/main/libenc/jni/libx264/common/frame.c
deleted file mode 100755
index ab0c7b3..0000000
--- a/android/src/main/libenc/jni/libx264/common/frame.c
+++ /dev/null
@@ -1,889 +0,0 @@
-/*****************************************************************************
- * frame.c: frame handling
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-static int align_stride( int x, int align, int disalign )
-{
-    x = ALIGN( x, align );
-    if( !(x&(disalign-1)) )
-        x += align;
-    return x;
-}
-
-static int align_plane_size( int x, int disalign )
-{
-    if( !(x&(disalign-1)) )
-        x += 128;
-    return x;
-}
-
-static int x264_frame_internal_csp( int external_csp )
-{
-    switch( external_csp & X264_CSP_MASK )
-    {
-        case X264_CSP_NV12:
-        case X264_CSP_NV21:
-        case X264_CSP_I420:
-        case X264_CSP_YV12:
-            return X264_CSP_NV12;
-        case X264_CSP_NV16:
-        case X264_CSP_I422:
-        case X264_CSP_YV16:
-        case X264_CSP_V210:
-            return X264_CSP_NV16;
-        case X264_CSP_I444:
-        case X264_CSP_YV24:
-        case X264_CSP_BGR:
-        case X264_CSP_BGRA:
-        case X264_CSP_RGB:
-            return X264_CSP_I444;
-        default:
-            return X264_CSP_NONE;
-    }
-}
-
-static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
-{
-    x264_frame_t *frame;
-    int i_csp = x264_frame_internal_csp( h->param.i_csp );
-    int i_mb_count = h->mb.i_mb_count;
-    int i_stride, i_width, i_lines, luma_plane_count;
-    int i_padv = PADV << PARAM_INTERLACED;
-    int align = 16;
-#if ARCH_X86 || ARCH_X86_64
-    if( h->param.cpu&X264_CPU_CACHELINE_64 )
-        align = 64;
-    else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
-        align = 32;
-#endif
-#if ARCH_PPC
-    int disalign = 1<<9;
-#else
-    int disalign = 1<<10;
-#endif
-
-    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
-    PREALLOC_INIT
-
-    /* allocate frame data (+64 for extra data for me) */
-    i_width  = h->mb.i_mb_width*16;
-    i_lines  = h->mb.i_mb_height*16;
-    i_stride = align_stride( i_width + 2*PADH, align, disalign );
-
-    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
-    {
-        luma_plane_count = 1;
-        frame->i_plane = 2;
-        for( int i = 0; i < 2; i++ )
-        {
-            frame->i_width[i] = i_width >> i;
-            frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
-            frame->i_stride[i] = i_stride;
-        }
-    }
-    else if( i_csp == X264_CSP_I444 )
-    {
-        luma_plane_count = 3;
-        frame->i_plane = 3;
-        for( int i = 0; i < 3; i++ )
-        {
-            frame->i_width[i] = i_width;
-            frame->i_lines[i] = i_lines;
-            frame->i_stride[i] = i_stride;
-        }
-    }
-    else
-        goto fail;
-
-    frame->i_csp = i_csp;
-    frame->i_width_lowres = frame->i_width[0]/2;
-    frame->i_lines_lowres = frame->i_lines[0]/2;
-    frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );
-
-    for( int i = 0; i < h->param.i_bframe + 2; i++ )
-        for( int j = 0; j < h->param.i_bframe + 2; j++ )
-            PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
-
-    frame->i_poc = -1;
-    frame->i_type = X264_TYPE_AUTO;
-    frame->i_qpplus1 = X264_QP_AUTO;
-    frame->i_pts = -1;
-    frame->i_frame = -1;
-    frame->i_frame_num = -1;
-    frame->i_lines_completed = -1;
-    frame->b_fdec = b_fdec;
-    frame->i_pic_struct = PIC_STRUCT_AUTO;
-    frame->i_field_cnt = -1;
-    frame->i_duration =
-    frame->i_cpb_duration =
-    frame->i_dpb_output_delay =
-    frame->i_cpb_delay = 0;
-    frame->i_coded_fields_lookahead =
-    frame->i_cpb_delay_lookahead = -1;
-
-    frame->orig = frame;
-
-    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
-    {
-        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
-        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
-        PREALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
-        if( PARAM_INTERLACED )
-            PREALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
-    }
-
-    /* all 4 luma planes allocated together, since the cacheline split code
-     * requires them to be in-phase wrt cacheline alignment. */
-
-    for( int p = 0; p < luma_plane_count; p++ )
-    {
-        int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
-        if( h->param.analyse.i_subpel_refine && b_fdec )
-        {
-            /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-            PREALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
-            if( PARAM_INTERLACED )
-                PREALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
-        }
-        else
-        {
-            PREALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
-            if( PARAM_INTERLACED )
-                PREALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
-        }
-    }
-
-    frame->b_duplicate = 0;
-
-    if( b_fdec ) /* fdec frame */
-    {
-        PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
-        PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
-        PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-        PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
-        PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
-        if( h->param.i_bframe )
-        {
-            PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
-            PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
-        }
-        else
-        {
-            frame->mv[1]  = NULL;
-            frame->ref[1] = NULL;
-        }
-        PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-        PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
-        PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
-        if( h->param.analyse.i_me_method >= X264_ME_ESA )
-            PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
-        if( PARAM_INTERLACED )
-            PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
-        if( h->param.analyse.b_mb_info )
-            PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
-    }
-    else /* fenc frame */
-    {
-        if( h->frames.b_have_lowres )
-        {
-            int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
-
-            PREALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
-
-            for( int j = 0; j <= !!h->param.i_bframe; j++ )
-                for( int i = 0; i <= h->param.i_bframe; i++ )
-                {
-                    PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
-                    PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
-                }
-            PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
-            for( int j = 0; j <= h->param.i_bframe+1; j++ )
-                for( int i = 0; i <= h->param.i_bframe+1; i++ )
-                    PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-
-        }
-        if( h->param.rc.i_aq_mode )
-        {
-            PREALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
-            PREALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
-            if( h->frames.b_have_lowres )
-                PREALLOC( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
-        }
-    }
-
-    PREALLOC_END( frame->base );
-
-    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
-    {
-        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
-        if( PARAM_INTERLACED )
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
-    }
-
-    for( int p = 0; p < luma_plane_count; p++ )
-    {
-        int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
-        if( h->param.analyse.i_subpel_refine && b_fdec )
-        {
-            for( int i = 0; i < 4; i++ )
-            {
-                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-            }
-            frame->plane[p] = frame->filtered[p][0];
-            frame->plane_fld[p] = frame->filtered_fld[p][0];
-        }
-        else
-        {
-            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
-            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
-        }
-    }
-
-    if( b_fdec )
-    {
-        M32( frame->mv16x16[0] ) = 0;
-        frame->mv16x16++;
-
-        if( h->param.analyse.i_me_method >= X264_ME_ESA )
-            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
-    }
-    else
-    {
-        if( h->frames.b_have_lowres )
-        {
-            int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
-            for( int i = 0; i < 4; i++ )
-                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
-
-            for( int j = 0; j <= !!h->param.i_bframe; j++ )
-                for( int i = 0; i <= h->param.i_bframe; i++ )
-                    memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
-
-            frame->i_intra_cost = frame->lowres_costs[0][0];
-            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
-
-            if( h->param.rc.i_aq_mode )
-                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
-                memset( frame->i_inv_qscale_factor, 0, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
-        }
-    }
-
-    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
-        goto fail;
-    if( x264_pthread_cond_init( &frame->cv, NULL ) )
-        goto fail;
-
-#if HAVE_OPENCL
-    frame->opencl.ocl = h->opencl.ocl;
-#endif
-
-    return frame;
-
-fail:
-    x264_free( frame );
-    return NULL;
-}
-
-void x264_frame_delete( x264_frame_t *frame )
-{
-    /* Duplicate frames are blank copies of real frames (including pointers),
-     * so freeing those pointers would cause a double free later. */
-    if( !frame->b_duplicate )
-    {
-        x264_free( frame->base );
-
-        if( frame->param && frame->param->param_free )
-            frame->param->param_free( frame->param );
-        if( frame->mb_info_free )
-            frame->mb_info_free( frame->mb_info );
-        if( frame->extra_sei.sei_free )
-        {
-            for( int i = 0; i < frame->extra_sei.num_payloads; i++ )
-                frame->extra_sei.sei_free( frame->extra_sei.payloads[i].payload );
-            frame->extra_sei.sei_free( frame->extra_sei.payloads );
-        }
-        x264_pthread_mutex_destroy( &frame->mutex );
-        x264_pthread_cond_destroy( &frame->cv );
-#if HAVE_OPENCL
-        x264_opencl_frame_delete( frame );
-#endif
-    }
-    x264_free( frame );
-}
-
-static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
-{
-    int width = h->param.i_width >> xshift;
-    int height = h->param.i_height >> yshift;
-    *pix = src->img.plane[plane];
-    *stride = src->img.i_stride[plane];
-    if( src->img.i_csp & X264_CSP_VFLIP )
-    {
-        *pix += (height-1) * *stride;
-        *stride = -*stride;
-    }
-    if( width > abs(*stride) )
-    {
-        x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride );
-        return -1;
-    }
-    return 0;
-}
-
-#define get_plane_ptr(...) do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0)
-
-int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
-{
-    int i_csp = src->img.i_csp & X264_CSP_MASK;
-    if( dst->i_csp != x264_frame_internal_csp( i_csp ) )
-    {
-        x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
-        return -1;
-    }
-
-#if HIGH_BIT_DEPTH
-    if( !(src->img.i_csp & X264_CSP_HIGH_DEPTH) )
-    {
-        x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" );
-        return -1;
-    }
-#else
-    if( src->img.i_csp & X264_CSP_HIGH_DEPTH )
-    {
-        x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" );
-        return -1;
-    }
-#endif
-
-    if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
-    {
-        x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
-        return -1;
-    }
-
-    if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME )
-    {
-        x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input );
-        dst->i_forced_type = X264_TYPE_AUTO;
-    }
-    else
-        dst->i_forced_type = src->i_type;
-
-    dst->i_type     = dst->i_forced_type;
-    dst->i_qpplus1  = src->i_qpplus1;
-    dst->i_pts      = dst->i_reordered_pts = src->i_pts;
-    dst->param      = src->param;
-    dst->i_pic_struct = src->i_pic_struct;
-    dst->extra_sei  = src->extra_sei;
-    dst->opaque     = src->opaque;
-    dst->mb_info    = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL;
-    dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL;
-
-    uint8_t *pix[3];
-    int stride[3];
-    if( i_csp == X264_CSP_V210 )
-    {
-         stride[0] = src->img.i_stride[0];
-         pix[0] = src->img.plane[0];
-
-         h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
-                                             dst->plane[1], dst->i_stride[1],
-                                             (uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height );
-    }
-    else if( i_csp >= X264_CSP_BGR )
-    {
-         stride[0] = src->img.i_stride[0];
-         pix[0] = src->img.plane[0];
-         if( src->img.i_csp & X264_CSP_VFLIP )
-         {
-             pix[0] += (h->param.i_height-1) * stride[0];
-             stride[0] = -stride[0];
-         }
-         int b = i_csp==X264_CSP_RGB;
-         h->mc.plane_copy_deinterleave_rgb( dst->plane[1+b], dst->i_stride[1+b],
-                                            dst->plane[0], dst->i_stride[0],
-                                            dst->plane[2-b], dst->i_stride[2-b],
-                                            (pixel*)pix[0], stride[0]/sizeof(pixel), i_csp==X264_CSP_BGRA ? 4 : 3, h->param.i_width, h->param.i_height );
-    }
-    else
-    {
-        int v_shift = CHROMA_V_SHIFT;
-        get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
-        h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
-                          stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
-        if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
-        {
-            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
-            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
-                              stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift );
-        }
-        else if( i_csp == X264_CSP_NV21 )
-        {
-            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
-            h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
-                                   stride[1]/sizeof(pixel), h->param.i_width>>1, h->param.i_height>>v_shift );
-        }
-        else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
-        {
-            int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
-            get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
-            get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
-            h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
-                                         (pixel*)pix[1], stride[1]/sizeof(pixel),
-                                         (pixel*)pix[2], stride[2]/sizeof(pixel),
-                                         h->param.i_width>>1, h->param.i_height>>v_shift );
-        }
-        else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
-        {
-            get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 );
-            get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 );
-            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
-                              stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height );
-            h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
-                              stride[2]/sizeof(pixel), h->param.i_width, h->param.i_height );
-        }
-    }
-    return 0;
-}
-
-static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
-{
-    uint8_t *dstp = (uint8_t*)dst;
-    uint32_t v1 = *src;
-    uint32_t v2 = size == 1 ? v1 + (v1 <<  8) : M16( src );
-    uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
-    int i = 0;
-    len *= size;
-
-    /* Align the input pointer if it isn't already */
-    if( (intptr_t)dstp & (WORD_SIZE - 1) )
-    {
-        if( size <= 2 && ((intptr_t)dstp & 3) )
-        {
-            if( size == 1 && ((intptr_t)dstp & 1) )
-                dstp[i++] = v1;
-            if( (intptr_t)dstp & 2 )
-            {
-                M16( dstp+i ) = v2;
-                i += 2;
-            }
-        }
-        if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
-        {
-            M32( dstp+i ) = v4;
-            i += 4;
-        }
-    }
-
-    /* Main copy loop */
-    if( WORD_SIZE == 8 )
-    {
-        uint64_t v8 = v4 + ((uint64_t)v4<<32);
-        for( ; i < len - 7; i+=8 )
-            M64( dstp+i ) = v8;
-    }
-    for( ; i < len - 3; i+=4 )
-        M32( dstp+i ) = v4;
-
-    /* Finish up the last few bytes */
-    if( size <= 2 )
-    {
-        if( i < len - 1 )
-        {
-            M16( dstp+i ) = v2;
-            i += 2;
-        }
-        if( size == 1 && i != len )
-            dstp[i] = v1;
-    }
-}
-
-static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
-{
-#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
-    for( int y = 0; y < i_height; y++ )
-    {
-        /* left band */
-        pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
-        /* right band */
-        pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1-b_chroma, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
-    }
-    /* upper band */
-    if( b_pad_top )
-        for( int y = 0; y < i_padv; y++ )
-            memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), (i_width+2*i_padh) * sizeof(pixel) );
-    /* lower band */
-    if( b_pad_bottom )
-        for( int y = 0; y < i_padv; y++ )
-            memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), (i_width+2*i_padh) * sizeof(pixel) );
-#undef PPIXEL
-}
-
-void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
-{
-    int pad_top = mb_y == 0;
-    int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
-    int b_start = mb_y == h->i_threadslice_start;
-    int b_end   = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
-    if( mb_y & SLICE_MBAFF )
-        return;
-    for( int i = 0; i < frame->i_plane; i++ )
-    {
-        int h_shift = i && CHROMA_H_SHIFT;
-        int v_shift = i && CHROMA_V_SHIFT;
-        int stride = frame->i_stride[i];
-        int width = 16*h->mb.i_mb_width;
-        int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
-        int padh = PADH;
-        int padv = PADV >> v_shift;
-        // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
-        if( b_end && !b_start )
-            height += 4 >> (v_shift + SLICE_MBAFF);
-        pixel *pix;
-        int starty = 16*mb_y - 4*!b_start;
-        if( SLICE_MBAFF )
-        {
-            // border samples for each field are extended separately
-            pix = frame->plane_fld[i] + (starty*stride >> v_shift);
-            plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
-            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
-
-            height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
-            if( b_end && !b_start )
-                height += 4 >> v_shift;
-            pix = frame->plane[i] + (starty*stride >> v_shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
-        }
-        else
-        {
-            pix = frame->plane[i] + (starty*stride >> v_shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
-        }
-    }
-}
-
-void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
-{
-    /* during filtering, 8 extra pixels were filtered on each edge,
-     * but up to 3 of the horizontal ones may be wrong.
-       we want to expand border from the last filtered pixel */
-    int b_start = !mb_y;
-    int width = 16*h->mb.i_mb_width + 8;
-    int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
-    int padh = PADH - 4;
-    int padv = PADV - 8;
-    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
-        for( int i = 1; i < 4; i++ )
-        {
-            int stride = frame->i_stride[p];
-            // buffer: 8 luma, to match the hpel filter
-            pixel *pix;
-            if( SLICE_MBAFF )
-            {
-                pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
-                plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
-                plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
-            }
-
-            pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
-            plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
-        }
-}
-
-void x264_frame_expand_border_lowres( x264_frame_t *frame )
-{
-    for( int i = 0; i < 4; i++ )
-        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 );
-}
-
-void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
-{
-    int v_shift = CHROMA_V_SHIFT;
-    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
-                         PADH, PADV>>v_shift, 1, 1, CHROMA_H_SHIFT );
-}
-
-void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
-{
-    for( int i = 0; i < frame->i_plane; i++ )
-    {
-        int i_width = h->param.i_width;
-        int h_shift = i && CHROMA_H_SHIFT;
-        int v_shift = i && CHROMA_V_SHIFT;
-        int i_height = h->param.i_height >> v_shift;
-        int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
-        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
-
-        if( i_padx )
-        {
-            for( int y = 0; y < i_height; y++ )
-                pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
-                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
-                              i_padx>>h_shift, sizeof(pixel)<<h_shift );
-        }
-        if( i_pady )
-        {
-            for( int y = i_height; y < i_height + i_pady; y++ )
-                memcpy( &frame->plane[i][y*frame->i_stride[i]],
-                        &frame->plane[i][(i_height-(~y&PARAM_INTERLACED)-1)*frame->i_stride[i]],
-                        (i_width + i_padx) * sizeof(pixel) );
-        }
-    }
-}
-
-void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
-{
-    for( int i = 0; i < h->fenc->i_plane; i++ )
-    {
-        int v_shift = i && CHROMA_V_SHIFT;
-        int stride = h->fenc->i_stride[i];
-        int height = h->param.i_height >> v_shift;
-        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
-        pixel *fenc = h->fenc->plane[i] + 16*mb_x;
-        for( int y = height; y < height + pady; y++ )
-            memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
-    }
-}
-
-/* threading */
-void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
-{
-    x264_pthread_mutex_lock( &frame->mutex );
-    frame->i_lines_completed = i_lines_completed;
-    x264_pthread_cond_broadcast( &frame->cv );
-    x264_pthread_mutex_unlock( &frame->mutex );
-}
-
-void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
-{
-    x264_pthread_mutex_lock( &frame->mutex );
-    while( frame->i_lines_completed < i_lines_completed )
-        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
-    x264_pthread_mutex_unlock( &frame->mutex );
-}
-
-void x264_threadslice_cond_broadcast( x264_t *h, int pass )
-{
-    x264_pthread_mutex_lock( &h->mutex );
-    h->i_threadslice_pass = pass;
-    if( pass > 0 )
-        x264_pthread_cond_broadcast( &h->cv );
-    x264_pthread_mutex_unlock( &h->mutex );
-}
-
-void x264_threadslice_cond_wait( x264_t *h, int pass )
-{
-    x264_pthread_mutex_lock( &h->mutex );
-    while( h->i_threadslice_pass < pass )
-        x264_pthread_cond_wait( &h->cv, &h->mutex );
-    x264_pthread_mutex_unlock( &h->mutex );
-}
-
-int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
-{
-    if( h->param.i_slice_count_max )
-    {
-        int slice_count;
-        if( h->param.b_sliced_threads )
-            slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );
-        else
-            slice_count = frame->i_slice_count++;
-        if( slice_count >= h->param.i_slice_count_max )
-            return -1;
-    }
-    return 0;
-}
-
-/* list operators */
-
-void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
-{
-    int i = 0;
-    while( list[i] ) i++;
-    list[i] = frame;
-}
-
-x264_frame_t *x264_frame_pop( x264_frame_t **list )
-{
-    x264_frame_t *frame;
-    int i = 0;
-    assert( list[0] );
-    while( list[i+1] ) i++;
-    frame = list[i];
-    list[i] = NULL;
-    return frame;
-}
-
-void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
-{
-    int i = 0;
-    while( list[i] ) i++;
-    while( i-- )
-        list[i+1] = list[i];
-    list[0] = frame;
-}
-
-x264_frame_t *x264_frame_shift( x264_frame_t **list )
-{
-    x264_frame_t *frame = list[0];
-    int i;
-    for( i = 0; list[i]; i++ )
-        list[i] = list[i+1];
-    assert(frame);
-    return frame;
-}
-
-void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
-{
-    assert( frame->i_reference_count > 0 );
-    frame->i_reference_count--;
-    if( frame->i_reference_count == 0 )
-        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
-}
-
-x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
-{
-    x264_frame_t *frame;
-    if( h->frames.unused[b_fdec][0] )
-        frame = x264_frame_pop( h->frames.unused[b_fdec] );
-    else
-        frame = x264_frame_new( h, b_fdec );
-    if( !frame )
-        return NULL;
-    frame->b_last_minigop_bframe = 0;
-    frame->i_reference_count = 1;
-    frame->b_intra_calculated = 0;
-    frame->b_scenecut = 1;
-    frame->b_keyframe = 0;
-    frame->b_corrupt = 0;
-    frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1;
-
-    memset( frame->weight, 0, sizeof(frame->weight) );
-    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
-
-    return frame;
-}
-
-void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
-{
-    assert( frame->i_reference_count > 0 );
-    frame->i_reference_count--;
-    if( frame->i_reference_count == 0 )
-        x264_frame_push( h->frames.blank_unused, frame );
-}
-
-x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
-{
-    x264_frame_t *frame;
-    if( h->frames.blank_unused[0] )
-        frame = x264_frame_pop( h->frames.blank_unused );
-    else
-        frame = x264_malloc( sizeof(x264_frame_t) );
-    if( !frame )
-        return NULL;
-    frame->b_duplicate = 1;
-    frame->i_reference_count = 1;
-    return frame;
-}
-
-void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
-                              int i_width, int i_height, x264_weight_t *w )
-{
-    /* Weight horizontal strips of height 16. This was found to be the optimal height
-     * in terms of the cache loads. */
-    while( i_height > 0 )
-    {
-        int x;
-        for( x = 0; x < i_width-8; x += 16 )
-            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
-        if( x < i_width )
-            w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
-        i_height -= 16;
-        dst += 16 * i_dst_stride;
-        src += 16 * i_src_stride;
-    }
-}
-
-void x264_frame_delete_list( x264_frame_t **list )
-{
-    int i = 0;
-    if( !list )
-        return;
-    while( list[i] )
-        x264_frame_delete( list[i++] );
-    x264_free( list );
-}
-
-int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size )
-{
-    if( max_size < 0 )
-        return -1;
-    slist->i_max_size = max_size;
-    slist->i_size = 0;
-    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
-    if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
-        x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
-        x264_pthread_cond_init( &slist->cv_empty, NULL ) )
-        return -1;
-    return 0;
-fail:
-    return -1;
-}
-
-void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist )
-{
-    x264_pthread_mutex_destroy( &slist->mutex );
-    x264_pthread_cond_destroy( &slist->cv_fill );
-    x264_pthread_cond_destroy( &slist->cv_empty );
-    x264_frame_delete_list( slist->list );
-}
-
-void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame )
-{
-    x264_pthread_mutex_lock( &slist->mutex );
-    while( slist->i_size == slist->i_max_size )
-        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
-    slist->list[ slist->i_size++ ] = frame;
-    x264_pthread_mutex_unlock( &slist->mutex );
-    x264_pthread_cond_broadcast( &slist->cv_fill );
-}
-
-x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist )
-{
-    x264_frame_t *frame;
-    x264_pthread_mutex_lock( &slist->mutex );
-    while( !slist->i_size )
-        x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex );
-    frame = slist->list[ --slist->i_size ];
-    slist->list[ slist->i_size ] = NULL;
-    x264_pthread_cond_broadcast( &slist->cv_empty );
-    x264_pthread_mutex_unlock( &slist->mutex );
-    return frame;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/frame.h b/android/src/main/libenc/jni/libx264/common/frame.h
deleted file mode 100755
index b8ff911..0000000
--- a/android/src/main/libenc/jni/libx264/common/frame.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*****************************************************************************
- * frame.h: frame handling
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_FRAME_H
-#define X264_FRAME_H
-
-/* number of pixels past the edge of the frame, for motion estimation/compensation */
-#define PADH 32
-#define PADV 32
-
-typedef struct x264_frame
-{
-    /* */
-    uint8_t *base;       /* Base pointer for all malloced data in this frame. */
-    int     i_poc;
-    int     i_delta_poc[2];
-    int     i_type;
-    int     i_forced_type;
-    int     i_qpplus1;
-    int64_t i_pts;
-    int64_t i_dts;
-    int64_t i_reordered_pts;
-    int64_t i_duration;  /* in SPS time_scale units (i.e 2 * timebase units) used for vfr */
-    float   f_duration;  /* in seconds */
-    int64_t i_cpb_duration;
-    int64_t i_cpb_delay; /* in SPS time_scale units (i.e 2 * timebase units) */
-    int64_t i_dpb_output_delay;
-    x264_param_t *param;
-
-    int     i_frame;     /* Presentation frame number */
-    int     i_coded;     /* Coded frame number */
-    int64_t i_field_cnt; /* Presentation field count */
-    int     i_frame_num; /* 7.4.3 frame_num */
-    int     b_kept_as_ref;
-    int     i_pic_struct;
-    int     b_keyframe;
-    uint8_t b_fdec;
-    uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
-    uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
-    float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
-    float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
-    float   f_crf_avg;   /* Average effective CRF for this frame */
-    int     i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
-
-    /* YUV buffer */
-    int     i_csp; /* Internal csp */
-    int     i_plane;
-    int     i_stride[3];
-    int     i_width[3];
-    int     i_lines[3];
-    int     i_stride_lowres;
-    int     i_width_lowres;
-    int     i_lines_lowres;
-    pixel *plane[3];
-    pixel *plane_fld[3];
-    pixel *filtered[3][4]; /* plane[0], H, V, HV */
-    pixel *filtered_fld[3][4];
-    pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
-    uint16_t *integral;
-
-    /* for unrestricted mv we allocate more data than needed
-     * allocated data are stored in buffer */
-    pixel *buffer[4];
-    pixel *buffer_fld[4];
-    pixel *buffer_lowres[4];
-
-    x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
-    pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
-    int b_duplicate;
-    struct x264_frame *orig;
-
-    /* motion data */
-    int8_t  *mb_type;
-    uint8_t *mb_partition;
-    int16_t (*mv[2])[2];
-    int16_t (*mv16x16)[2];
-    int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
-    uint8_t *field;
-    uint8_t *effective_qp;
-
-    /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
-     * Doesn't need special addressing for intra cost because
-     * lists_used is guaranteed to be zero in that cast. */
-    uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
-    #define LOWRES_COST_MASK ((1<<14)-1)
-    #define LOWRES_COST_SHIFT 14
-
-    int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
-    int8_t  *ref[2];
-    int     i_ref[2];
-    int     ref_poc[2][X264_REF_MAX];
-    int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction
-
-    /* for adaptive B-frame decision.
-     * contains the SATD cost of the lowres frame encoded in various modes
-     * FIXME: how big an array do we need? */
-    int     i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
-    int     i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
-    int     i_satd; // the i_cost_est of the selected frametype
-    int     i_intra_mbs[X264_BFRAME_MAX+2];
-    int     *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
-    int     *i_row_satd;
-    int     *i_row_bits;
-    float   *f_row_qp;
-    float   *f_row_qscale;
-    float   *f_qp_offset;
-    float   *f_qp_offset_aq;
-    int     b_intra_calculated;
-    uint16_t *i_intra_cost;
-    uint16_t *i_propagate_cost;
-    uint16_t *i_inv_qscale_factor;
-    int     b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
-    float   f_weighted_cost_delta[X264_BFRAME_MAX+2];
-    uint32_t i_pixel_sum[3];
-    uint64_t i_pixel_ssd[3];
-
-    /* hrd */
-    x264_hrd_t hrd_timing;
-
-    /* vbv */
-    uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
-    int i_planned_satd[X264_LOOKAHEAD_MAX+1];
-    double f_planned_cpb_duration[X264_LOOKAHEAD_MAX+1];
-    int64_t i_coded_fields_lookahead;
-    int64_t i_cpb_delay_lookahead;
-
-    /* threading */
-    int     i_lines_completed; /* in pixels */
-    int     i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
-    int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
-    x264_pthread_mutex_t mutex;
-    x264_pthread_cond_t  cv;
-    int     i_slice_count; /* Atomically written to/read from with slice threads */
-
-    /* periodic intra refresh */
-    float   f_pir_position;
-    int     i_pir_start_col;
-    int     i_pir_end_col;
-    int     i_frames_since_pir;
-
-    /* interactive encoder control */
-    int     b_corrupt;
-
-    /* user sei */
-    x264_sei_t extra_sei;
-
-    /* user data */
-    void *opaque;
-
-    /* user frame properties */
-    uint8_t *mb_info;
-    void (*mb_info_free)( void* );
-
-#if HAVE_OPENCL
-    x264_frame_opencl_t opencl;
-#endif
-} x264_frame_t;
-
-/* synchronized frame list */
-typedef struct
-{
-   x264_frame_t **list;
-   int i_max_size;
-   int i_size;
-   x264_pthread_mutex_t     mutex;
-   x264_pthread_cond_t      cv_fill;  /* event signaling that the list became fuller */
-   x264_pthread_cond_t      cv_empty; /* event signaling that the list became emptier */
-} x264_sync_frame_list_t;
-
-typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta );
-typedef struct
-{
-    x264_deblock_inter_t deblock_luma[2];
-    x264_deblock_inter_t deblock_chroma[2];
-    x264_deblock_inter_t deblock_h_chroma_420;
-    x264_deblock_inter_t deblock_h_chroma_422;
-    x264_deblock_intra_t deblock_luma_intra[2];
-    x264_deblock_intra_t deblock_chroma_intra[2];
-    x264_deblock_intra_t deblock_h_chroma_420_intra;
-    x264_deblock_intra_t deblock_h_chroma_422_intra;
-    x264_deblock_inter_t deblock_luma_mbaff;
-    x264_deblock_inter_t deblock_chroma_mbaff;
-    x264_deblock_inter_t deblock_chroma_420_mbaff;
-    x264_deblock_inter_t deblock_chroma_422_mbaff;
-    x264_deblock_intra_t deblock_luma_intra_mbaff;
-    x264_deblock_intra_t deblock_chroma_intra_mbaff;
-    x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
-    x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
-    void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                               int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
-                               int bframe );
-} x264_deblock_function_t;
-
-void          x264_frame_delete( x264_frame_t *frame );
-
-int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
-
-void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y );
-void          x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
-void          x264_frame_expand_border_lowres( x264_frame_t *frame );
-void          x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
-void          x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
-void          x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y );
-
-void          x264_frame_deblock_row( x264_t *h, int mb_y );
-void          x264_macroblock_deblock( x264_t *h );
-
-void          x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
-void          x264_frame_init_lowres( x264_t *h, x264_frame_t *frame );
-
-void          x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff );
-
-void          x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
-void          x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
-int           x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
-
-void          x264_threadslice_cond_broadcast( x264_t *h, int pass );
-void          x264_threadslice_cond_wait( x264_t *h, int pass );
-
-void          x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
-x264_frame_t *x264_frame_pop( x264_frame_t **list );
-void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
-x264_frame_t *x264_frame_shift( x264_frame_t **list );
-void          x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
-void          x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
-x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
-void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
-                              int i_width, int i_height, x264_weight_t *w );
-x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
-void          x264_frame_delete_list( x264_frame_t **list );
-
-int           x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int nelem );
-void          x264_sync_frame_list_delete( x264_sync_frame_list_t *slist );
-void          x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame );
-x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/macroblock.c b/android/src/main/libenc/jni/libx264/common/macroblock.c
deleted file mode 100755
index 081683c..0000000
--- a/android/src/main/libenc/jni/libx264/common/macroblock.c
+++ /dev/null
@@ -1,1914 +0,0 @@
-/*****************************************************************************
- * macroblock.c: macroblock common functions
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Fiona Glaser <fiona@x264.com>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#define MC_LUMA(list,p) \
-    h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
-                   &h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
-                   mvx, mvy, 4*width, 4*height, \
-                   list ? x264_weight_none : &h->sh.weight[i_ref][p] );
-
-static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
-{
-    int i8    = x264_scan8[0]+x+8*y;
-    int i_ref = h->mb.cache.ref[0][i8];
-    int mvx   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
-    int mvy   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
-
-    MC_LUMA( 0, 0 );
-
-    if( CHROMA444 )
-    {
-        MC_LUMA( 0, 1 );
-        MC_LUMA( 0, 2 );
-    }
-    else
-    {
-        int v_shift = CHROMA_V_SHIFT;
-        // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
-        if( v_shift & MB_INTERLACED & i_ref )
-            mvy += (h->mb.i_mb_y & 1)*4 - 2;
-
-        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
-        height = 4*height >> v_shift;
-
-        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
-                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
-                         h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
-                         mvx, 2*mvy>>v_shift, 2*width, height );
-
-        if( h->sh.weight[i_ref][1].weightfn )
-            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
-                                                       &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
-                                                       &h->sh.weight[i_ref][1], height );
-        if( h->sh.weight[i_ref][2].weightfn )
-            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
-                                                       &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
-                                                       &h->sh.weight[i_ref][2], height );
-    }
-}
-static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
-{
-    int i8    = x264_scan8[0]+x+8*y;
-    int i_ref = h->mb.cache.ref[1][i8];
-    int mvx   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
-    int mvy   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
-
-    MC_LUMA( 1, 0 );
-
-    if( CHROMA444 )
-    {
-        MC_LUMA( 1, 1 );
-        MC_LUMA( 1, 2 );
-    }
-    else
-    {
-        int v_shift = CHROMA_V_SHIFT;
-        if( v_shift & MB_INTERLACED & i_ref )
-            mvy += (h->mb.i_mb_y & 1)*4 - 2;
-
-        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
-        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
-                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
-                         h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
-                         mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift );
-    }
-}
-
-#define MC_LUMA_BI(p) \
-    src0 = h->mc.get_ref( tmp0, &i_stride0, &h->mb.pic.p_fref[0][i_ref0][p*4], h->mb.pic.i_stride[p], \
-                          mvx0, mvy0, 4*width, 4*height, x264_weight_none ); \
-    src1 = h->mc.get_ref( tmp1, &i_stride1, &h->mb.pic.p_fref[1][i_ref1][p*4], h->mb.pic.i_stride[p], \
-                          mvx1, mvy1, 4*width, 4*height, x264_weight_none ); \
-    h->mc.avg[i_mode]( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
-                       src0, i_stride0, src1, i_stride1, weight );
-
-static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
-{
-    int i8 = x264_scan8[0]+x+8*y;
-    int i_ref0 = h->mb.cache.ref[0][i8];
-    int i_ref1 = h->mb.cache.ref[1][i8];
-    int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-    int mvx0   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
-    int mvx1   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
-    int mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
-    int mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
-    int i_mode = x264_size2pixel[height][width];
-    intptr_t i_stride0 = 16, i_stride1 = 16;
-    ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
-    ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
-    pixel *src0, *src1;
-
-    MC_LUMA_BI( 0 );
-
-    if( CHROMA444 )
-    {
-        MC_LUMA_BI( 1 );
-        MC_LUMA_BI( 2 );
-    }
-    else
-    {
-        int v_shift = CHROMA_V_SHIFT;
-        if( v_shift & MB_INTERLACED & i_ref0 )
-            mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
-        if( v_shift & MB_INTERLACED & i_ref1 )
-            mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
-
-        h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
-                         mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift );
-        h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
-                         mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift );
-
-        int chromapix = h->luma2chroma_pixel[i_mode];
-        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
-        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0,   16, tmp1,   16, weight );
-        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
-    }
-}
-
-#undef MC_LUMA
-#undef MC_LUMA_BI
-
-void x264_mb_mc_8x8( x264_t *h, int i8 )
-{
-    int x = 2*(i8&1);
-    int y = 2*(i8>>1);
-
-    if( h->sh.i_type == SLICE_TYPE_P )
-    {
-        switch( h->mb.i_sub_partition[i8] )
-        {
-            case D_L0_8x8:
-                x264_mb_mc_0xywh( h, x, y, 2, 2 );
-                break;
-            case D_L0_8x4:
-                x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
-                x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
-                break;
-            case D_L0_4x8:
-                x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
-                x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
-                break;
-            case D_L0_4x4:
-                x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
-                x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
-                x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
-                x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
-                break;
-        }
-    }
-    else
-    {
-        int scan8 = x264_scan8[0] + x + 8*y;
-
-        if( h->mb.cache.ref[0][scan8] >= 0 )
-            if( h->mb.cache.ref[1][scan8] >= 0 )
-                x264_mb_mc_01xywh( h, x, y, 2, 2 );
-            else
-                x264_mb_mc_0xywh( h, x, y, 2, 2 );
-        else
-            x264_mb_mc_1xywh( h, x, y, 2, 2 );
-    }
-}
-
-void x264_mb_mc( x264_t *h )
-{
-    if( h->mb.i_partition == D_8x8 )
-    {
-        for( int i = 0; i < 4; i++ )
-            x264_mb_mc_8x8( h, i );
-    }
-    else
-    {
-        int ref0a = h->mb.cache.ref[0][x264_scan8[ 0]];
-        int ref0b = h->mb.cache.ref[0][x264_scan8[12]];
-        int ref1a = h->mb.cache.ref[1][x264_scan8[ 0]];
-        int ref1b = h->mb.cache.ref[1][x264_scan8[12]];
-
-        if( h->mb.i_partition == D_16x16 )
-        {
-            if( ref0a >= 0 )
-                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
-                else             x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
-            else                 x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
-        }
-        else if( h->mb.i_partition == D_16x8 )
-        {
-            if( ref0a >= 0 )
-                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
-                else             x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
-            else                 x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );
-
-            if( ref0b >= 0 )
-                if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
-                else             x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
-            else                 x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
-        }
-        else if( h->mb.i_partition == D_8x16 )
-        {
-            if( ref0a >= 0 )
-                if( ref1a >= 0 ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
-                else             x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
-            else                 x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );
-
-            if( ref0b >= 0 )
-                if( ref1b >= 0 ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
-                else             x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
-            else                 x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
-        }
-    }
-}
-
-int x264_macroblock_cache_allocate( x264_t *h )
-{
-    int i_mb_count = h->mb.i_mb_count;
-
-    h->mb.i_mb_stride = h->mb.i_mb_width;
-    h->mb.i_b8_stride = h->mb.i_mb_width * 2;
-    h->mb.i_b4_stride = h->mb.i_mb_width * 4;
-
-    h->mb.b_interlaced = PARAM_INTERLACED;
-
-    PREALLOC_INIT
-
-    PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
-    PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
-    PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
-    PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
-
-    /* 0 -> 3 top(4), 4 -> 6 : left(3) */
-    PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
-
-    /* all coeffs */
-    PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
-
-    if( h->param.b_cabac )
-    {
-        PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
-        PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
-        PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
-        if( h->param.i_bframe )
-            PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
-    }
-
-    for( int i = 0; i < 2; i++ )
-    {
-        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
-            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
-
-        for( int j = !i; j < i_refs; j++ )
-            PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
-    }
-
-    if( h->param.analyse.i_weighted_pred )
-    {
-        int i_padv = PADV << PARAM_INTERLACED;
-        int luma_plane_size = 0;
-        int numweightbuf;
-
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
-        {
-            // only need buffer for lookahead
-            if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
-            {
-                // Fake analysis only works on lowres
-                luma_plane_size = h->fdec->i_stride_lowres * (h->mb.i_mb_height*8+2*i_padv);
-                // Only need 1 buffer for analysis
-                numweightbuf = 1;
-            }
-            else
-                numweightbuf = 0;
-        }
-        else
-        {
-            /* Both ref and fenc is stored for 4:2:0 and 4:2:2 which means that 4:2:0 and 4:4:4
-             * needs the same amount of space and 4:2:2 needs twice that much */
-            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv);
-
-            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
-                //smart can weight one ref and one offset -1 in 8-bit
-                numweightbuf = 1 + (BIT_DEPTH == 8);
-            else
-                //simple only has one weighted ref
-                numweightbuf = 1;
-        }
-
-        for( int i = 0; i < numweightbuf; i++ )
-            PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
-    }
-
-    PREALLOC_END( h->mb.base );
-
-    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
-
-    for( int i = 0; i < 2; i++ )
-    {
-        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
-            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
-
-        for( int j = !i; j < i_refs; j++ )
-        {
-            M32( h->mb.mvr[i][j][0] ) = 0;
-            h->mb.mvr[i][j]++;
-        }
-    }
-
-    return 0;
-fail:
-    return -1;
-}
-void x264_macroblock_cache_free( x264_t *h )
-{
-    x264_free( h->mb.base );
-}
-
-int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
-{
-    if( !b_lookahead )
-    {
-        for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
-            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
-            {
-                CHECKED_MALLOC( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
-                h->intra_border_backup[i][j] += 16;
-            }
-        for( int i = 0; i <= PARAM_INTERLACED; i++ )
-        {
-            if( h->param.b_sliced_threads )
-            {
-                /* Only allocate the first one, and allocate it for the whole frame, because we
-                 * won't be deblocking until after the frame is fully encoded. */
-                if( h == h->thread[0] && !i )
-                    CHECKED_MALLOC( h->deblock_strength[0], sizeof(**h->deblock_strength) * h->mb.i_mb_count );
-                else
-                    h->deblock_strength[i] = h->thread[0]->deblock_strength[0];
-            }
-            else
-                CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
-            h->deblock_strength[1] = h->deblock_strength[i];
-        }
-    }
-
-    /* Allocate scratch buffer */
-    int scratch_size = 0;
-    if( !b_lookahead )
-    {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
-        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
-        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
-        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
-            ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
-        scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
-    }
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
-    scratch_size = X264_MAX( scratch_size, buf_mbtree );
-    if( scratch_size )
-        CHECKED_MALLOC( h->scratch_buffer, scratch_size );
-    else
-        h->scratch_buffer = NULL;
-
-    int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
-    int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
-    scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
-    CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
-
-    return 0;
-fail:
-    return -1;
-}
-
-void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
-{
-    if( !b_lookahead )
-    {
-        for( int i = 0; i <= PARAM_INTERLACED; i++ )
-            if( !h->param.b_sliced_threads || (h == h->thread[0] && !i) )
-                x264_free( h->deblock_strength[i] );
-        for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
-            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
-                x264_free( h->intra_border_backup[i][j] - 16 );
-    }
-    x264_free( h->scratch_buffer );
-    x264_free( h->scratch_buffer2 );
-}
-
-void x264_macroblock_slice_init( x264_t *h )
-{
-    h->mb.mv[0] = h->fdec->mv[0];
-    h->mb.mv[1] = h->fdec->mv[1];
-    h->mb.mvr[0][0] = h->fdec->mv16x16;
-    h->mb.ref[0] = h->fdec->ref[0];
-    h->mb.ref[1] = h->fdec->ref[1];
-    h->mb.type = h->fdec->mb_type;
-    h->mb.partition = h->fdec->mb_partition;
-    h->mb.field = h->fdec->field;
-
-    h->fdec->i_ref[0] = h->i_ref[0];
-    h->fdec->i_ref[1] = h->i_ref[1];
-    for( int i = 0; i < h->i_ref[0]; i++ )
-        h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc;
-    if( h->sh.i_type == SLICE_TYPE_B )
-    {
-        for( int i = 0; i < h->i_ref[1]; i++ )
-            h->fdec->ref_poc[1][i] = h->fref[1][i]->i_poc;
-
-        map_col_to_list0(-1) = -1;
-        map_col_to_list0(-2) = -2;
-        for( int i = 0; i < h->fref[1][0]->i_ref[0]; i++ )
-        {
-            int poc = h->fref[1][0]->ref_poc[0][i];
-            map_col_to_list0(i) = -2;
-            for( int j = 0; j < h->i_ref[0]; j++ )
-                if( h->fref[0][j]->i_poc == poc )
-                {
-                    map_col_to_list0(i) = j;
-                    break;
-                }
-        }
-    }
-    else if( h->sh.i_type == SLICE_TYPE_P )
-    {
-        if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
-        {
-            deblock_ref_table(-2) = -2;
-            deblock_ref_table(-1) = -1;
-            for( int i = 0; i < h->i_ref[0] << SLICE_MBAFF; i++ )
-            {
-                /* Mask off high bits to avoid frame num collisions with -1/-2.
-                 * In current x264 frame num values don't cover a range of more
-                 * than 32, so 6 bits is enough for uniqueness. */
-                if( !MB_INTERLACED )
-                    deblock_ref_table(i) = h->fref[0][i]->i_frame_num&63;
-                else
-                    deblock_ref_table(i) = ((h->fref[0][i>>1]->i_frame_num&63)<<1) + (i&1);
-            }
-        }
-    }
-
-    /* init with not available (for top right idx=7,15) */
-    memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
-
-    if( h->i_ref[0] > 0 )
-        for( int field = 0; field <= SLICE_MBAFF; field++ )
-        {
-            int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
-            int refpoc = h->fref[0][0]->i_poc + h->fref[0][0]->i_delta_poc[field];
-            int delta = curpoc - refpoc;
-
-            h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
-        }
-
-    h->mb.i_neighbour4[6] =
-    h->mb.i_neighbour4[9] =
-    h->mb.i_neighbour4[12] =
-    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
-    h->mb.i_neighbour4[3] =
-    h->mb.i_neighbour4[7] =
-    h->mb.i_neighbour4[11] =
-    h->mb.i_neighbour4[13] =
-    h->mb.i_neighbour4[15] =
-    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
-}
-
-void x264_macroblock_thread_init( x264_t *h )
-{
-    h->mb.i_me_method = h->param.analyse.i_me_method;
-    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
-    if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
-        h->mb.i_subpel_refine--;
-    h->mb.b_chroma_me = h->param.analyse.b_chroma_me &&
-                        ((h->sh.i_type == SLICE_TYPE_P && h->mb.i_subpel_refine >= 5) ||
-                         (h->sh.i_type == SLICE_TYPE_B && h->mb.i_subpel_refine >= 9));
-    h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
-                          (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
-    h->mb.i_mb_prev_xy = -1;
-
-    /*          4:2:0                      4:2:2                      4:4:4
-     * fdec            fenc       fdec            fenc       fdec            fenc
-     * y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y
-     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
-     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
-     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
-     * y Y Y Y Y       U U V V    y Y Y Y Y       U U V V    y Y Y Y Y       U U U U
-     * u u u   v v v   U U V V    u u u   v v v   U U V V    u u u u u u u   U U U U
-     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
-     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
-     *                            u U U   v V V              u U U U U       V V V V
-     *                            u U U   v V V              u U U U U       V V V V
-     *                                                       v v v v v v v   V V V V
-     *                                                       v V V V V       V V V V
-     *                                                       v V V V V
-     *                                                       v V V V V
-     *                                                       v V V V V
-     */
-    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
-    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
-    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
-    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
-    if( CHROMA444 )
-    {
-        h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
-        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
-    }
-    else
-    {
-        h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-    }
-}
-
-void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
-{
-    int stride_y  = fenc->i_stride[0];
-    int stride_uv = fenc->i_stride[1];
-    int off_y  = 16 * i_mb_x + 16 * i_mb_y * stride_y;
-    int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> CHROMA_V_SHIFT);
-    h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
-                         fenc->plane[1]+off_uv, stride_uv, i_mb_x );
-}
-
-NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
-{
-    // input pointers are offset by 4 rows because that's faster (smaller instruction size on x86)
-    for( int i = -4; i < 4; i++ )
-        dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
-}
-
-static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
-{
-    int mb_interlaced = b_mbaff && MB_INTERLACED;
-    int height = b_chroma ? 16 >> CHROMA_V_SHIFT : 16;
-    int i_stride = h->fdec->i_stride[i];
-    int i_stride2 = i_stride << mb_interlaced;
-    int i_pix_offset = mb_interlaced
-                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
-                     : 16 * mb_x + height * mb_y * i_stride;
-    pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-    int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : !(mb_y&1);
-    pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
-    int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
-    /* ref_pix_offset[0] references the current field and [1] the opposite field. */
-    if( mb_interlaced )
-        ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
-    h->mb.pic.i_stride[i] = i_stride2;
-    h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
-    if( b_chroma )
-    {
-        h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
-        memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
-        memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
-        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
-        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
-    }
-    else
-    {
-        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
-        memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
-        h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
-    }
-    if( b_mbaff || h->mb.b_reencode_mb )
-    {
-        for( int j = 0; j < height; j++ )
-            if( b_chroma )
-            {
-                h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
-                h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
-            }
-            else
-                h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
-    }
-    pixel *plane_src, **filtered_src;
-    for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
-    {
-        // Interpolate between pixels in same field.
-        if( mb_interlaced )
-        {
-            plane_src = h->fref[0][j>>1]->plane_fld[i];
-            filtered_src = h->fref[0][j>>1]->filtered_fld[i];
-        }
-        else
-        {
-            plane_src = h->fref[0][j]->plane[i];
-            filtered_src = h->fref[0][j]->filtered[i];
-        }
-        h->mb.pic.p_fref[0][j][i*4] = plane_src + ref_pix_offset[j&1];
-
-        if( !b_chroma )
-        {
-            for( int k = 1; k < 4; k++ )
-                h->mb.pic.p_fref[0][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
-            if( !i )
-            {
-                if( h->sh.weight[j][0].weightfn )
-                    h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> mb_interlaced][ref_pix_offset[j&1]];
-                else
-                    h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
-            }
-        }
-    }
-    if( h->sh.i_type == SLICE_TYPE_B )
-        for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
-        {
-            if( mb_interlaced )
-            {
-                plane_src = h->fref[1][j>>1]->plane_fld[i];
-                filtered_src = h->fref[1][j>>1]->filtered_fld[i];
-            }
-            else
-            {
-                plane_src = h->fref[1][j]->plane[i];
-                filtered_src = h->fref[1][j]->filtered[i];
-            }
-            h->mb.pic.p_fref[1][j][i*4] = plane_src + ref_pix_offset[j&1];
-
-            if( !b_chroma )
-                for( int k = 1; k < 4; k++ )
-                    h->mb.pic.p_fref[1][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
-        }
-}
-
-static const x264_left_table_t left_indices[4] =
-{
-    /* Current is progressive */
-    {{ 4, 4, 5, 5}, { 3,  3,  7,  7}, {16+1, 16+1, 32+1, 32+1}, {0, 0, 1, 1}, {0, 0, 0, 0}},
-    {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+5, 16+5, 32+5, 32+5}, {2, 2, 3, 3}, {1, 1, 1, 1}},
-    /* Current is interlaced */
-    {{ 4, 6, 4, 6}, { 3, 11,  3, 11}, {16+1, 16+1, 32+1, 32+1}, {0, 2, 0, 2}, {0, 1, 0, 1}},
-    /* Both same */
-    {{ 4, 5, 6, 3}, { 3,  7, 11, 15}, {16+1, 16+5, 32+1, 32+5}, {0, 1, 2, 3}, {0, 0, 1, 1}}
-};
-
-static void ALWAYS_INLINE x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y, int b_interlaced )
-{
-    const int mb_interlaced = b_interlaced && MB_INTERLACED;
-    int top_y = mb_y - (1 << mb_interlaced);
-    int top = top_y * h->mb.i_mb_stride + mb_x;
-
-    h->mb.i_mb_x = mb_x;
-    h->mb.i_mb_y = mb_y;
-    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
-    h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x);
-    h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x);
-    h->mb.left_b8[0] =
-    h->mb.left_b8[1] = -1;
-    h->mb.left_b4[0] =
-    h->mb.left_b4[1] = -1;
-    h->mb.i_neighbour = 0;
-    h->mb.i_neighbour_intra = 0;
-    h->mb.i_neighbour_frame = 0;
-    h->mb.i_mb_top_xy = -1;
-    h->mb.i_mb_top_y = -1;
-    h->mb.i_mb_left_xy[0] = h->mb.i_mb_left_xy[1] = -1;
-    h->mb.i_mb_topleft_xy = -1;
-    h->mb.i_mb_topright_xy = -1;
-    h->mb.i_mb_type_top = -1;
-    h->mb.i_mb_type_left[0] = h->mb.i_mb_type_left[1] = -1;
-    h->mb.i_mb_type_topleft = -1;
-    h->mb.i_mb_type_topright = -1;
-    h->mb.left_index_table = &left_indices[3];
-    h->mb.topleft_partition = 0;
-
-    int topleft_y = top_y;
-    int topright_y = top_y;
-    int left[2];
-
-    left[0] = left[1] = h->mb.i_mb_xy - 1;
-    h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2;
-    h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4;
-
-    if( b_interlaced )
-    {
-        h->mb.i_mb_top_mbpair_xy = h->mb.i_mb_xy - 2*h->mb.i_mb_stride;
-        h->mb.i_mb_topleft_y = -1;
-        h->mb.i_mb_topright_y = -1;
-
-        if( mb_y&1 )
-        {
-            if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
-            {
-                left[0] = left[1] = h->mb.i_mb_xy - 1 - h->mb.i_mb_stride;
-                h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2 - 2*h->mb.i_b8_stride;
-                h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4 - 4*h->mb.i_b4_stride;
-
-                if( mb_interlaced )
-                {
-                    h->mb.left_index_table = &left_indices[2];
-                    left[1] += h->mb.i_mb_stride;
-                    h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
-                    h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
-                }
-                else
-                {
-                    h->mb.left_index_table = &left_indices[1];
-                    topleft_y++;
-                    h->mb.topleft_partition = 1;
-                }
-            }
-            if( !mb_interlaced )
-                topright_y = -1;
-        }
-        else
-        {
-            if( mb_interlaced && top >= 0 )
-            {
-                if( !h->mb.field[top] )
-                {
-                    top += h->mb.i_mb_stride;
-                    top_y++;
-                }
-                if( mb_x )
-                    topleft_y += !h->mb.field[h->mb.i_mb_stride*topleft_y + mb_x - 1];
-                if( mb_x < h->mb.i_mb_width-1 )
-                    topright_y += !h->mb.field[h->mb.i_mb_stride*topright_y + mb_x + 1];
-            }
-            if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] )
-            {
-                if( mb_interlaced )
-                {
-                    h->mb.left_index_table = &left_indices[2];
-                    left[1] += h->mb.i_mb_stride;
-                    h->mb.left_b8[1] += 2*h->mb.i_b8_stride;
-                    h->mb.left_b4[1] += 4*h->mb.i_b4_stride;
-                }
-                else
-                    h->mb.left_index_table = &left_indices[0];
-            }
-        }
-    }
-
-    if( mb_x > 0 )
-    {
-        h->mb.i_neighbour_frame |= MB_LEFT;
-        h->mb.i_mb_left_xy[0] = left[0];
-        h->mb.i_mb_left_xy[1] = left[1];
-        h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]];
-        h->mb.i_mb_type_left[1] = h->mb.type[h->mb.i_mb_left_xy[1]];
-        if( h->mb.slice_table[left[0]] == h->sh.i_first_mb )
-        {
-            h->mb.i_neighbour |= MB_LEFT;
-
-            // FIXME: We don't currently support constrained intra + mbaff.
-            if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) )
-                h->mb.i_neighbour_intra |= MB_LEFT;
-        }
-    }
-
-    /* We can't predict from the previous threadslice since it hasn't been encoded yet. */
-    if( (h->i_threadslice_start >> mb_interlaced) != (mb_y >> mb_interlaced) )
-    {
-        if( top >= 0 )
-        {
-            h->mb.i_neighbour_frame |= MB_TOP;
-            h->mb.i_mb_top_xy = top;
-            h->mb.i_mb_top_y = top_y;
-            h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy];
-            if( h->mb.slice_table[top] == h->sh.i_first_mb )
-            {
-                h->mb.i_neighbour |= MB_TOP;
-
-                if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
-                    h->mb.i_neighbour_intra |= MB_TOP;
-
-                /* We only need to prefetch the top blocks because the left was just written
-                 * to as part of the previous cache_save.  Since most target CPUs use write-allocate
-                 * caches, left blocks are near-guaranteed to be in L1 cache.  Top--not so much. */
-                x264_prefetch( &h->mb.cbp[top] );
-                x264_prefetch( h->mb.intra4x4_pred_mode[top] );
-                x264_prefetch( &h->mb.non_zero_count[top][12] );
-                /* These aren't always allocated, but prefetching an invalid address can't hurt. */
-                x264_prefetch( &h->mb.mb_transform_size[top] );
-                x264_prefetch( &h->mb.skipbp[top] );
-            }
-        }
-
-        if( mb_x > 0 && topleft_y >= 0  )
-        {
-            h->mb.i_neighbour_frame |= MB_TOPLEFT;
-            h->mb.i_mb_topleft_xy = h->mb.i_mb_stride*topleft_y + mb_x - 1;
-            h->mb.i_mb_topleft_y = topleft_y;
-            h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topleft_xy];
-            if( h->mb.slice_table[h->mb.i_mb_topleft_xy] == h->sh.i_first_mb )
-            {
-                h->mb.i_neighbour |= MB_TOPLEFT;
-
-                if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) )
-                    h->mb.i_neighbour_intra |= MB_TOPLEFT;
-            }
-        }
-
-        if( mb_x < h->mb.i_mb_width - 1 && topright_y >= 0 )
-        {
-            h->mb.i_neighbour_frame |= MB_TOPRIGHT;
-            h->mb.i_mb_topright_xy = h->mb.i_mb_stride*topright_y + mb_x + 1;
-            h->mb.i_mb_topright_y = topright_y;
-            h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topright_xy];
-            if( h->mb.slice_table[h->mb.i_mb_topright_xy] == h->sh.i_first_mb )
-            {
-                h->mb.i_neighbour |= MB_TOPRIGHT;
-
-                if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) )
-                    h->mb.i_neighbour_intra |= MB_TOPRIGHT;
-            }
-        }
-    }
-}
-
-#define LTOP 0
-#if HAVE_INTERLACED
-#   define LBOT 1
-#else
-#   define LBOT 0
-#endif
-
-static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_mbaff )
-{
-    x264_macroblock_cache_load_neighbours( h, mb_x, mb_y, b_mbaff );
-
-    int *left = h->mb.i_mb_left_xy;
-    int top  = h->mb.i_mb_top_xy;
-    int top_y = h->mb.i_mb_top_y;
-    int s8x8 = h->mb.i_b8_stride;
-    int s4x4 = h->mb.i_b4_stride;
-    int top_8x8 = (2*top_y+1) * s8x8 + 2*mb_x;
-    int top_4x4 = (4*top_y+3) * s4x4 + 4*mb_x;
-    int lists = (1 << h->sh.i_type) & 3;
-
-    /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
-    /* By only dereferencing them once, we avoid this issue. */
-    int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
-    uint8_t (*nnz)[48] = h->mb.non_zero_count;
-    int16_t *cbp = h->mb.cbp;
-
-    const x264_left_table_t *left_index_table = h->mb.left_index_table;
-
-    h->mb.cache.deblock_strength = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?h->mb.i_mb_xy:mb_x];
-
-    /* load cache */
-    if( h->mb.i_neighbour & MB_TOP )
-    {
-        h->mb.cache.i_cbp_top = cbp[top];
-        /* load intra4x4 */
-        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
-
-        /* load non_zero_count */
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>CHROMA_V_SHIFT)] );
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );
-
-        /* Finish the prefetching */
-        for( int l = 0; l < lists; l++ )
-        {
-            x264_prefetch( &h->mb.mv[l][top_4x4-1] );
-            /* Top right being not in the same cacheline as top left will happen
-             * once every 4 MBs, so one extra prefetch is worthwhile */
-            x264_prefetch( &h->mb.mv[l][top_4x4+4] );
-            x264_prefetch( &h->mb.ref[l][top_8x8-1] );
-            x264_prefetch( &h->mb.mvd[l][top] );
-        }
-    }
-    else
-    {
-        h->mb.cache.i_cbp_top = -1;
-
-        /* load intra4x4 */
-        M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
-
-        /* load non_zero_count */
-        M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8] ) = 0x80808080U;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8] ) = 0x80808080U;
-    }
-
-    if( h->mb.i_neighbour & MB_LEFT )
-    {
-        int ltop = left[LTOP];
-        int lbot = b_mbaff ? left[LBOT] : ltop;
-        if( b_mbaff )
-        {
-            const int16_t top_luma = (cbp[ltop] >> (left_index_table->mv[0]&(~1))) & 2;
-            const int16_t bot_luma = (cbp[lbot] >> (left_index_table->mv[2]&(~1))) & 2;
-            h->mb.cache.i_cbp_left = (cbp[ltop] & 0xfff0) | (bot_luma<<2) | top_luma;
-        }
-        else
-            h->mb.cache.i_cbp_left = cbp[ltop];
-
-        /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = i4x4[ltop][left_index_table->intra[0]];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = i4x4[ltop][left_index_table->intra[1]];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = i4x4[lbot][left_index_table->intra[2]];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[lbot][left_index_table->intra[3]];
-
-        /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = nnz[ltop][left_index_table->nnz[0]];
-        h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = nnz[ltop][left_index_table->nnz[1]];
-        h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]];
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]];
-
-        if( CHROMA_FORMAT >= CHROMA_422 )
-        {
-            int offset = (4>>CHROMA_H_SHIFT) - 4;
-            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset];
-            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset];
-            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset];
-        }
-        else
-        {
-            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[0]];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[1]];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[2]];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[3]];
-        }
-    }
-    else
-    {
-        h->mb.cache.i_cbp_left = -1;
-
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1;
-
-        /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[10] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80;
-        if( CHROMA_FORMAT >= CHROMA_422 )
-        {
-            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] =
-            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] =
-            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] =
-            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = 0x80;
-        }
-    }
-
-    if( h->pps->b_transform_8x8_mode )
-    {
-        h->mb.cache.i_neighbour_transform_size =
-            ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left[0]] )
-          + ( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top]  );
-    }
-
-    if( b_mbaff )
-    {
-        h->mb.pic.i_fref[0] = h->i_ref[0] << MB_INTERLACED;
-        h->mb.pic.i_fref[1] = h->i_ref[1] << MB_INTERLACED;
-    }
-
-    if( !b_mbaff )
-    {
-        x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
-        x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
-        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 0 );
-        if( CHROMA444 )
-        {
-            x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+ 4*FDEC_STRIDE );
-            x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+12*FDEC_STRIDE );
-            x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+ 4*FDEC_STRIDE );
-            x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+12*FDEC_STRIDE );
-            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 0 );
-            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 0 );
-        }
-        else
-        {
-            x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
-            x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
-            if( CHROMA_FORMAT == CHROMA_422 )
-            {
-                x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE );
-                x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE );
-            }
-            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 );
-        }
-    }
-    else
-    {
-        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 1 );
-        if( CHROMA444 )
-        {
-            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 1 );
-            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 1 );
-        }
-        else
-            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 1 );
-    }
-
-    if( h->fdec->integral )
-    {
-        int offset = 16 * (mb_x + mb_y * h->fdec->i_stride[0]);
-        for( int list = 0; list < 2; list++ )
-            for( int i = 0; i < h->mb.pic.i_fref[list]; i++ )
-                h->mb.pic.p_integral[list][i] = &h->fref[list][i]->integral[offset];
-    }
-
-    x264_prefetch_fenc( h, h->fenc, mb_x, mb_y );
-
-    /* load ref/mv/mvd */
-    for( int l = 0; l < lists; l++ )
-    {
-        int16_t (*mv)[2] = h->mb.mv[l];
-        int8_t *ref = h->mb.ref[l];
-
-        int i8 = x264_scan8[0] - 1 - 1*8;
-        if( h->mb.i_neighbour & MB_TOPLEFT )
-        {
-            int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topleft_y + mb_x-1)+1+s8x8 : top_8x8 - 1;
-            int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topleft_y + mb_x-1)+3+3*s4x4 : top_4x4 - 1;
-            if( b_mbaff && h->mb.topleft_partition )
-            {
-                /* Take motion vector from the middle of macroblock instead of
-                 * the bottom right as usual. */
-                iv -= 2*s4x4;
-                ir -= s8x8;
-            }
-            h->mb.cache.ref[l][i8] = ref[ir];
-            CP32( h->mb.cache.mv[l][i8], mv[iv] );
-        }
-        else
-        {
-            h->mb.cache.ref[l][i8] = -2;
-            M32( h->mb.cache.mv[l][i8] ) = 0;
-        }
-
-        i8 = x264_scan8[0] - 8;
-        if( h->mb.i_neighbour & MB_TOP )
-        {
-            h->mb.cache.ref[l][i8+0] =
-            h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
-            h->mb.cache.ref[l][i8+2] =
-            h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
-            CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
-        }
-        else
-        {
-            M128( h->mb.cache.mv[l][i8] ) = M128_ZERO;
-            M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
-        }
-
-        i8 = x264_scan8[0] + 4 - 1*8;
-        if( h->mb.i_neighbour & MB_TOPRIGHT )
-        {
-            int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topright_y + (mb_x+1))+s8x8 : top_8x8 + 2;
-            int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topright_y + (mb_x+1))+3*s4x4 : top_4x4 + 4;
-            h->mb.cache.ref[l][i8] = ref[ir];
-            CP32( h->mb.cache.mv[l][i8], mv[iv] );
-        }
-        else
-             h->mb.cache.ref[l][i8] = -2;
-
-        i8 = x264_scan8[0] - 1;
-        if( h->mb.i_neighbour & MB_LEFT )
-        {
-            if( b_mbaff )
-            {
-                h->mb.cache.ref[l][i8+0*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[0]];
-                h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[1]];
-                h->mb.cache.ref[l][i8+2*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[2]];
-                h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[3]];
-
-                CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[0]] );
-                CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[1]] );
-                CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[2]] );
-                CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[3]] );
-            }
-            else
-            {
-                const int ir = h->mb.i_b8_xy - 1;
-                const int iv = h->mb.i_b4_xy - 1;
-                h->mb.cache.ref[l][i8+0*8] =
-                h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
-                h->mb.cache.ref[l][i8+2*8] =
-                h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
-
-                CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
-                CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
-                CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
-                CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
-            }
-        }
-        else
-        {
-            for( int i = 0; i < 4; i++ )
-            {
-                h->mb.cache.ref[l][i8+i*8] = -2;
-                M32( h->mb.cache.mv[l][i8+i*8] ) = 0;
-            }
-        }
-
-        /* Extra logic for top right mv in mbaff.
-         * . . . d  . . a .
-         * . . . e  . . . .
-         * . . . f  b . c .
-         * . . . .  . . . .
-         *
-         * If the top right of the 4x4 partitions labeled a, b and c in the
-         * above diagram do not exist, but the entries d, e and f exist (in
-         * the macroblock to the left) then use those instead.
-         */
-        if( b_mbaff && (h->mb.i_neighbour & MB_LEFT) )
-        {
-            if( MB_INTERLACED && !h->mb.field[h->mb.i_mb_xy-1] )
-            {
-                h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*0];
-                h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*1];
-                h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[1] + 1 + s8x8*0];
-                CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[0]+1)] );
-                CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[1]+1)] );
-                CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[1] + 3 + s4x4*(left_index_table->mv[2]+1)] );
-            }
-            else if( !MB_INTERLACED && h->mb.field[h->mb.i_mb_xy-1] )
-            {
-                // Looking at the bottom field so always take the bottom macroblock of the pair.
-                h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]];
-                h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]];
-                h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]];
-                CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] );
-                CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] );
-                CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[2]] );
-            }
-        }
-
-        if( h->param.b_cabac )
-        {
-            uint8_t (*mvd)[8][2] = h->mb.mvd[l];
-            if( h->mb.i_neighbour & MB_TOP )
-                CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
-            else
-                M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;
-
-            if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1] >= 0) )
-            {
-                CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[LTOP]][left_index_table->intra[0]] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[LTOP]][left_index_table->intra[1]] );
-            }
-            else
-            {
-                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+0*8] ) = 0;
-                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+1*8] ) = 0;
-            }
-            if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1+2*8] >=0) )
-            {
-                CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[LBOT]][left_index_table->intra[2]] );
-                CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[LBOT]][left_index_table->intra[3]] );
-            }
-            else
-            {
-                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+2*8] ) = 0;
-                M16( h->mb.cache.mvd[l][x264_scan8[0]-1+3*8] ) = 0;
-            }
-        }
-
-        /* If motion vectors are cached from frame macroblocks but this
-         * macroblock is a field macroblock then the motion vector must be
-         * halved. Similarly, motion vectors from field macroblocks are doubled. */
-        if( b_mbaff )
-        {
-#define MAP_MVS\
-                if( FIELD_DIFFERENT(h->mb.i_mb_topleft_xy) )\
-                    MAP_F2F(mv, ref, x264_scan8[0] - 1 - 1*8)\
-                if( FIELD_DIFFERENT(top) )\
-                {\
-                    MAP_F2F(mv, ref, x264_scan8[0] + 0 - 1*8)\
-                    MAP_F2F(mv, ref, x264_scan8[0] + 1 - 1*8)\
-                    MAP_F2F(mv, ref, x264_scan8[0] + 2 - 1*8)\
-                    MAP_F2F(mv, ref, x264_scan8[0] + 3 - 1*8)\
-                }\
-                if( FIELD_DIFFERENT(h->mb.i_mb_topright_xy) )\
-                    MAP_F2F(mv, ref, x264_scan8[0] + 4 - 1*8)\
-                if( FIELD_DIFFERENT(left[0]) )\
-                {\
-                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 0*8)\
-                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 1*8)\
-                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 2*8)\
-                    MAP_F2F(mv, ref, x264_scan8[0] - 1 + 3*8)\
-                    MAP_F2F(topright_mv, topright_ref, 0)\
-                    MAP_F2F(topright_mv, topright_ref, 1)\
-                    MAP_F2F(topright_mv, topright_ref, 2)\
-                }
-
-            if( MB_INTERLACED )
-            {
-#define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && !h->mb.field[macroblock])
-#define MAP_F2F(varmv, varref, index)\
-                if( h->mb.cache.varref[l][index] >= 0 )\
-                {\
-                    h->mb.cache.varref[l][index] <<= 1;\
-                    h->mb.cache.varmv[l][index][1] /= 2;\
-                    h->mb.cache.mvd[l][index][1] >>= 1;\
-                }
-                MAP_MVS
-#undef MAP_F2F
-#undef FIELD_DIFFERENT
-            }
-            else
-            {
-#define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && h->mb.field[macroblock])
-#define MAP_F2F(varmv, varref, index)\
-                if( h->mb.cache.varref[l][index] >= 0 )\
-                {\
-                    h->mb.cache.varref[l][index] >>= 1;\
-                    h->mb.cache.varmv[l][index][1] <<= 1;\
-                    h->mb.cache.mvd[l][index][1] <<= 1;\
-                }
-                MAP_MVS
-#undef MAP_F2F
-#undef FIELD_DIFFERENT
-            }
-        }
-    }
-
-    if( b_mbaff && mb_x == 0 && !(mb_y&1) )
-    {
-        if( h->mb.i_mb_top_xy >= h->sh.i_first_mb )
-            h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_top_xy];
-        else
-            h->mb.field_decoding_flag = 0;
-    }
-
-    /* Check whether skip here would cause decoder to predict interlace mode incorrectly.
-     * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */
-    h->mb.b_allow_skip = 1;
-    if( b_mbaff )
-    {
-        if( MB_INTERLACED != h->mb.field_decoding_flag &&
-            (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
-            h->mb.b_allow_skip = 0;
-    }
-
-    if( h->param.b_cabac )
-    {
-        if( b_mbaff )
-        {
-            int left_xy, top_xy;
-            /* Neighbours here are calculated based on field_decoding_flag */
-            int mb_xy = mb_x + (mb_y&~1)*h->mb.i_mb_stride;
-            left_xy = mb_xy - 1;
-            if( (mb_y&1) && mb_x > 0 && h->mb.field_decoding_flag == h->mb.field[left_xy] )
-                left_xy += h->mb.i_mb_stride;
-            if( h->mb.field_decoding_flag )
-            {
-                top_xy = mb_xy - h->mb.i_mb_stride;
-                if( !(mb_y&1) && top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && h->mb.field[top_xy] )
-                    top_xy -= h->mb.i_mb_stride;
-            }
-            else
-                top_xy = mb_x + (mb_y-1)*h->mb.i_mb_stride;
-
-            h->mb.cache.i_neighbour_skip =   (mb_x >  0 && h->mb.slice_table[left_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[left_xy] ))
-                                         + (top_xy >= 0 && h->mb.slice_table[top_xy]  == h->sh.i_first_mb && !IS_SKIP( h->mb.type[top_xy] ));
-        }
-        else
-        {
-            h->mb.cache.i_neighbour_skip = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] ))
-                                         + ((h->mb.i_neighbour & MB_TOP)  && !IS_SKIP( h->mb.i_mb_type_top ));
-        }
-    }
-
-    /* load skip */
-    if( h->sh.i_type == SLICE_TYPE_B )
-    {
-        h->mb.bipred_weight = h->mb.bipred_weight_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)];
-        h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)];
-        if( h->param.b_cabac )
-        {
-            uint8_t skipbp;
-            x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
-            if( b_mbaff )
-            {
-                skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LTOP]] : 0;
-                h->mb.cache.skip[x264_scan8[0] - 1] = (skipbp >> (1+(left_index_table->mv[0]&~1))) & 1;
-                skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LBOT]] : 0;
-                h->mb.cache.skip[x264_scan8[8] - 1] = (skipbp >> (1+(left_index_table->mv[2]&~1))) & 1;
-            }
-            else
-            {
-                skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0;
-                h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
-                h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
-            }
-            skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0;
-            h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
-            h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
-        }
-    }
-
-    if( h->sh.i_type == SLICE_TYPE_P )
-        x264_mb_predict_mv_pskip( h, h->mb.cache.pskip_mv );
-
-    h->mb.i_neighbour4[0] =
-    h->mb.i_neighbour8[0] = (h->mb.i_neighbour_intra & (MB_TOP|MB_LEFT|MB_TOPLEFT))
-                            | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOPRIGHT : 0);
-    h->mb.i_neighbour4[4] =
-    h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour_intra & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0);
-    h->mb.i_neighbour4[2] =
-    h->mb.i_neighbour4[8] =
-    h->mb.i_neighbour4[10] =
-    h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour_intra & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
-    h->mb.i_neighbour4[5] =
-    h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour_intra & MB_TOPRIGHT)
-                            | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
-}
-
-void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y )
-{
-    x264_macroblock_cache_load( h, mb_x, mb_y, 0 );
-}
-
-void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y )
-{
-    x264_macroblock_cache_load( h, mb_x, mb_y, 1 );
-}
-
-static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][4] )
-{
-    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
-    {
-        static const uint8_t offset[2][2][8] =
-        {   {   { 0, 0, 0, 0, 1, 1, 1, 1 },
-                { 2, 2, 2, 2, 3, 3, 3, 3 }, },
-            {   { 0, 1, 2, 3, 0, 1, 2, 3 },
-                { 0, 1, 2, 3, 0, 1, 2, 3 }, }
-        };
-        ALIGNED_ARRAY_8( uint8_t, tmpbs, [8] );
-
-        const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1];
-        uint8_t (*nnz)[48] = h->mb.non_zero_count;
-
-        for( int i = 0; i < 8; i++ )
-        {
-            int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1];
-            int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)];
-            int nnz_left = nnz[left][3 + 4*off[i]];
-            if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
-            {
-                int j = off[i]&~1;
-                if( h->mb.mb_transform_size[left] )
-                    nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] ));
-            }
-            tmpbs[i] = (nnz_left || nnz_this) ? 2 : 1;
-        }
-
-        if( MB_INTERLACED )
-        {
-            CP32( bs[0][0], &tmpbs[0] );
-            CP32( bs[0][4], &tmpbs[4] );
-        }
-        else
-        {
-            for( int i = 0; i < 4; i++ ) bs[0][0][i] = tmpbs[2*i];
-            for( int i = 0; i < 4; i++ ) bs[0][4][i] = tmpbs[1+2*i];
-        }
-    }
-
-    if( (h->mb.i_neighbour & MB_TOP) && MB_INTERLACED != h->mb.field[h->mb.i_mb_top_xy] )
-    {
-        if( !(h->mb.i_mb_y&1) && !MB_INTERLACED )
-        {
-            /* Need to filter both fields (even for frame macroblocks).
-             * Filter top two rows using the top macroblock of the above
-             * pair and then the bottom one. */
-            int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride;
-            uint8_t *nnz_cur = &h->mb.cache.non_zero_count[x264_scan8[0]];
-
-            for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
-            {
-                uint8_t (*nnz)[48] = h->mb.non_zero_count;
-
-                ALIGNED_4( uint8_t nnz_top[4] );
-                CP32( nnz_top, &nnz[mbn_xy][3*4] );
-
-                if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && h->mb.mb_transform_size[mbn_xy] )
-                {
-                    nnz_top[0] = nnz_top[1] = M16( &nnz[mbn_xy][ 8] ) || M16( &nnz[mbn_xy][12] );
-                    nnz_top[2] = nnz_top[3] = M16( &nnz[mbn_xy][10] ) || M16( &nnz[mbn_xy][14] );
-                }
-
-                for( int i = 0; i < 4; i++ )
-                    bs[1][4*j][i] = (nnz_cur[i] || nnz_top[i]) ? 2 : 1;
-            }
-        }
-        else
-            for( int i = 0; i < 4; i++ )
-                bs[1][0][i] = X264_MAX( bs[1][0][i], 1 );
-    }
-}
-
-void x264_macroblock_deblock_strength( x264_t *h )
-{
-    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
-    if( IS_INTRA( h->mb.i_type ) )
-    {
-        M32( bs[0][1] ) = 0x03030303;
-        M64( bs[0][2] ) = 0x0303030303030303ULL;
-        M32( bs[1][1] ) = 0x03030303;
-        M64( bs[1][2] ) = 0x0303030303030303ULL;
-        return;
-    }
-
-    /* Early termination: in this case, nnz guarantees all edges use strength 2.*/
-    if( h->mb.b_transform_8x8 && !CHROMA444 )
-    {
-        int cbp_mask = 0xf >> CHROMA_V_SHIFT;
-        if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
-        {
-            M32( bs[0][0] ) = 0x02020202;
-            M32( bs[0][2] ) = 0x02020202;
-            M32( bs[0][4] ) = 0x02020202;
-            M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */
-            M64( bs[1][2] ) = 0x0202020202020202ULL;
-            M32( bs[1][4] ) = 0x02020202;
-            return;
-        }
-    }
-
-    int neighbour_changed = 0;
-    if( h->sh.i_disable_deblocking_filter_idc != 2 )
-    {
-        neighbour_changed = h->mb.i_neighbour_frame&~h->mb.i_neighbour;
-        h->mb.i_neighbour = h->mb.i_neighbour_frame;
-    }
-
-    /* MBAFF deblock uses different left neighbors from encoding */
-    if( SLICE_MBAFF && (h->mb.i_neighbour & MB_LEFT) && (h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED) )
-    {
-        h->mb.i_mb_left_xy[1] =
-        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
-        if( h->mb.i_mb_y&1 )
-            h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
-        else
-            h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
-    }
-
-    /* If we have multiple slices and we're deblocking on slice edges, we
-     * have to reload neighbour data. */
-    if( neighbour_changed )
-    {
-        int top_y = h->mb.i_mb_top_y;
-        int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*h->mb.i_mb_x;
-        int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*h->mb.i_mb_x;
-        int s8x8 = h->mb.i_b8_stride;
-        int s4x4 = h->mb.i_b4_stride;
-
-        uint8_t (*nnz)[48] = h->mb.non_zero_count;
-        const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];
-
-        if( neighbour_changed & MB_TOP )
-            CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
-
-        if( neighbour_changed & MB_LEFT )
-        {
-            int *left = h->mb.i_mb_left_xy;
-            h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]];
-            h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
-            h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
-            h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
-        }
-
-        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
-        {
-            int16_t (*mv)[2] = h->mb.mv[l];
-            int8_t *ref = h->mb.ref[l];
-
-            int i8 = x264_scan8[0] - 8;
-            if( neighbour_changed & MB_TOP )
-            {
-                h->mb.cache.ref[l][i8+0] =
-                h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
-                h->mb.cache.ref[l][i8+2] =
-                h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
-                CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
-            }
-
-            i8 = x264_scan8[0] - 1;
-            if( neighbour_changed & MB_LEFT )
-            {
-                h->mb.cache.ref[l][i8+0*8] =
-                h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]];
-                h->mb.cache.ref[l][i8+2*8] =
-                h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]];
-
-                CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] );
-                CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
-                CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
-                CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
-            }
-        }
-    }
-
-    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P )
-    {
-        /* Handle reference frame duplicates */
-        int i8 = x264_scan8[0] - 8;
-        h->mb.cache.ref[0][i8+0] =
-        h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
-        h->mb.cache.ref[0][i8+2] =
-        h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);
-
-        i8 = x264_scan8[0] - 1;
-        h->mb.cache.ref[0][i8+0*8] =
-        h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
-        h->mb.cache.ref[0][i8+2*8] =
-        h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);
-
-        int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]);
-        int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
-        int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
-        int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
-        uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
-        uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;
-
-        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop;
-        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
-        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
-        M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
-    }
-
-    /* Munge NNZ for cavlc + 8x8dct */
-    if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
-    {
-        uint8_t (*nnz)[48] = h->mb.non_zero_count;
-        int top = h->mb.i_mb_top_xy;
-        int *left = h->mb.i_mb_left_xy;
-
-        if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
-        {
-            int i8 = x264_scan8[0] - 8;
-            int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
-            int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
-            M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
-            M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
-        }
-
-        if( h->mb.i_neighbour & MB_LEFT )
-        {
-            int i8 = x264_scan8[0] - 1;
-            if( h->mb.mb_transform_size[left[0]] )
-            {
-                int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] );
-                h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
-                h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
-            }
-            if( h->mb.mb_transform_size[left[1]] )
-            {
-                int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] );
-                h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
-                h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
-            }
-        }
-
-        if( h->mb.b_transform_8x8 )
-        {
-            int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
-            int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
-            int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
-            int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
-            uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
-            uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
-
-            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
-            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
-            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
-            M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
-        }
-    }
-
-    h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
-                               bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );
-
-    if( SLICE_MBAFF )
-        x264_macroblock_deblock_strength_mbaff( h, bs );
-}
-
-static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
-{
-    int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
-    int i_stride = h->fdec->i_stride[i];
-    int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
-    int i_pix_offset = (b_mbaff && MB_INTERLACED)
-                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
-                     : 16 * mb_x + height * mb_y * i_stride;
-    if( b_chroma )
-        h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height );
-    else
-        h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 );
-}
-
-static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int mb_y, int b_mbaff )
-{
-    /* In MBAFF we store the last two rows in intra_border_backup[0] and [1].
-     * For progressive mbs this is the bottom two rows, and for interlaced the
-     * bottom row of each field. We also store samples needed for the next
-     * mbpair in intra_border_backup[2]. */
-    int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
-    memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
-    if( CHROMA444 )
-    {
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*sizeof(pixel) );
-        memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16  ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*sizeof(pixel) );
-    }
-    else
-    {
-        int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
-    }
-    if( b_mbaff )
-    {
-        if( mb_y&1 )
-        {
-            int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
-            backup_dst = MB_INTERLACED ? 2 : 0;
-            memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+backup_src, 16*sizeof(pixel) );
-            if( CHROMA444 )
-            {
-                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 16*sizeof(pixel) );
-                memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16  ], h->mb.pic.p_fdec[2]+backup_src, 16*sizeof(pixel) );
-            }
-            else
-            {
-                if( CHROMA_FORMAT == CHROMA_420 )
-                    backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
-                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src,  8*sizeof(pixel) );
-                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src,  8*sizeof(pixel) );
-            }
-        }
-    }
-}
-
-void x264_macroblock_cache_save( x264_t *h )
-{
-    const int i_mb_xy = h->mb.i_mb_xy;
-    const int i_mb_type = x264_mb_type_fix[h->mb.i_type];
-    const int s8x8 = h->mb.i_b8_stride;
-    const int s4x4 = h->mb.i_b4_stride;
-    const int i_mb_4x4 = h->mb.i_b4_xy;
-    const int i_mb_8x8 = h->mb.i_b8_xy;
-
-    /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing. */
-    /* By only dereferencing them once, we avoid this issue. */
-    int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
-    uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
-
-    if( SLICE_MBAFF )
-    {
-        x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 );
-        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 1 );
-        if( CHROMA444 )
-        {
-            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 1 );
-            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 1 );
-        }
-        else
-            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 1 );
-    }
-    else
-    {
-        x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 );
-        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 0 );
-        if( CHROMA444 )
-        {
-            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 0 );
-            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 0 );
-        }
-        else
-            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 0 );
-    }
-
-    x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
-
-    h->mb.type[i_mb_xy] = i_mb_type;
-    h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
-    h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
-    h->mb.i_mb_prev_xy = i_mb_xy;
-
-    /* save intra4x4 */
-    if( i_mb_type == I_4x4 )
-    {
-        CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
-        M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
-                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
-                                     h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
-    }
-    else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
-        M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
-    else
-        M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;
-
-
-    if( i_mb_type == I_PCM )
-    {
-        h->mb.qp[i_mb_xy] = 0;
-        h->mb.i_last_dqp = 0;
-        h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
-        h->mb.i_cbp_luma = 0xf;
-        h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700;
-        h->mb.b_transform_8x8 = 0;
-        for( int i = 0; i < 48; i++ )
-            h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;
-    }
-    else
-    {
-        if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
-            h->mb.i_qp = h->mb.i_last_qp;
-        h->mb.qp[i_mb_xy] = h->mb.i_qp;
-        h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
-        h->mb.i_last_qp = h->mb.i_qp;
-    }
-
-    /* save non zero count */
-    CP32( &nnz[ 0+0*4], &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
-    CP32( &nnz[ 0+1*4], &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
-    CP32( &nnz[ 0+2*4], &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
-    CP32( &nnz[ 0+3*4], &h->mb.cache.non_zero_count[x264_scan8[10]] );
-    CP32( &nnz[16+0*4], &h->mb.cache.non_zero_count[x264_scan8[16+0]] );
-    CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
-    CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
-    CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
-    if( CHROMA_FORMAT >= CHROMA_422 )
-    {
-        CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
-        CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
-        CP32( &nnz[32+2*4], &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] );
-        CP32( &nnz[32+3*4], &h->mb.cache.non_zero_count[x264_scan8[32+10]] );
-    }
-
-    if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
-        h->mb.b_transform_8x8 = 0;
-    h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8;
-
-    if( h->sh.i_type != SLICE_TYPE_I )
-    {
-        int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
-        int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
-        int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
-        int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
-        if( !IS_INTRA( i_mb_type ) )
-        {
-            ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
-            ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
-            ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
-            ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
-            CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
-            CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
-            CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
-            CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
-            if( h->sh.i_type == SLICE_TYPE_B )
-            {
-                ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
-                ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
-                ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
-                ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
-                CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
-                CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
-                CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
-                CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
-            }
-        }
-        else
-        {
-            M16( &ref0[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
-            M16( &ref0[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
-            M128( &mv0[0*s4x4] ) = M128_ZERO;
-            M128( &mv0[1*s4x4] ) = M128_ZERO;
-            M128( &mv0[2*s4x4] ) = M128_ZERO;
-            M128( &mv0[3*s4x4] ) = M128_ZERO;
-            if( h->sh.i_type == SLICE_TYPE_B )
-            {
-                M16( &ref1[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                M16( &ref1[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                M128( &mv1[0*s4x4] ) = M128_ZERO;
-                M128( &mv1[1*s4x4] ) = M128_ZERO;
-                M128( &mv1[2*s4x4] ) = M128_ZERO;
-                M128( &mv1[3*s4x4] ) = M128_ZERO;
-            }
-        }
-    }
-
-    if( h->param.b_cabac )
-    {
-        uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
-        uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
-        if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
-            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
-        else
-            h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
-
-        if( (0x3FF30 >> i_mb_type) & 1 ) /* !INTRA && !SKIP && !DIRECT */
-        {
-            CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
-            CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
-            CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
-            CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
-            if( h->sh.i_type == SLICE_TYPE_B )
-            {
-                CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
-                CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
-                CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
-                CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
-            }
-        }
-        else
-        {
-            M128( mvd0[0] ) = M128_ZERO;
-            if( h->sh.i_type == SLICE_TYPE_B )
-                M128( mvd1[0] ) = M128_ZERO;
-        }
-
-        if( h->sh.i_type == SLICE_TYPE_B )
-        {
-            if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
-                h->mb.skipbp[i_mb_xy] = 0xf;
-            else if( i_mb_type == B_8x8 )
-            {
-                int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0;
-                skipbp    |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1;
-                skipbp    |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2;
-                skipbp    |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3;
-                h->mb.skipbp[i_mb_xy] = skipbp;
-            }
-            else
-                h->mb.skipbp[i_mb_xy] = 0;
-        }
-    }
-}
-
-
-void x264_macroblock_bipred_init( x264_t *h )
-{
-    for( int mbfield = 0; mbfield <= SLICE_MBAFF; mbfield++ )
-        for( int field = 0; field <= SLICE_MBAFF; field++ )
-            for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<mbfield); i_ref0++ )
-            {
-                x264_frame_t *l0 = h->fref[0][i_ref0>>mbfield];
-                int poc0 = l0->i_poc + mbfield*l0->i_delta_poc[field^(i_ref0&1)];
-                for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<mbfield); i_ref1++ )
-                {
-                    int dist_scale_factor;
-                    x264_frame_t *l1 = h->fref[1][i_ref1>>mbfield];
-                    int cur_poc = h->fdec->i_poc + mbfield*h->fdec->i_delta_poc[field];
-                    int poc1 = l1->i_poc + mbfield*l1->i_delta_poc[field^(i_ref1&1)];
-                    int td = x264_clip3( poc1 - poc0, -128, 127 );
-                    if( td == 0 /* || pic0 is a long-term ref */ )
-                        dist_scale_factor = 256;
-                    else
-                    {
-                        int tb = x264_clip3( cur_poc - poc0, -128, 127 );
-                        int tx = (16384 + (abs(td) >> 1)) / td;
-                        dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
-                    }
-
-                    h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = dist_scale_factor;
-
-                    dist_scale_factor >>= 2;
-                    if( h->param.analyse.b_weighted_bipred
-                          && dist_scale_factor >= -64
-                          && dist_scale_factor <= 128 )
-                    {
-                        h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 64 - dist_scale_factor;
-                        // ssse3 implementation of biweight doesn't support the extrema.
-                        // if we ever generate them, we'll have to drop that optimization.
-                        assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
-                    }
-                    else
-                        h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32;
-                }
-            }
-}
-
diff --git a/android/src/main/libenc/jni/libx264/common/macroblock.h b/android/src/main/libenc/jni/libx264/common/macroblock.h
deleted file mode 100755
index 9a556ac..0000000
--- a/android/src/main/libenc/jni/libx264/common/macroblock.h
+++ /dev/null
@@ -1,444 +0,0 @@
-/*****************************************************************************
- * macroblock.h: macroblock common functions
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MACROBLOCK_H
-#define X264_MACROBLOCK_H
-
-enum macroblock_position_e
-{
-    MB_LEFT     = 0x01,
-    MB_TOP      = 0x02,
-    MB_TOPRIGHT = 0x04,
-    MB_TOPLEFT  = 0x08,
-
-    MB_PRIVATE  = 0x10,
-
-    ALL_NEIGHBORS = 0xf,
-};
-
-static const uint8_t x264_pred_i4x4_neighbors[12] =
-{
-    MB_TOP,                         // I_PRED_4x4_V
-    MB_LEFT,                        // I_PRED_4x4_H
-    MB_LEFT | MB_TOP,               // I_PRED_4x4_DC
-    MB_TOP  | MB_TOPRIGHT,          // I_PRED_4x4_DDL
-    MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_DDR
-    MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_VR
-    MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_HD
-    MB_TOP  | MB_TOPRIGHT,          // I_PRED_4x4_VL
-    MB_LEFT,                        // I_PRED_4x4_HU
-    MB_LEFT,                        // I_PRED_4x4_DC_LEFT
-    MB_TOP,                         // I_PRED_4x4_DC_TOP
-    0                               // I_PRED_4x4_DC_128
-};
-
-
-/* XXX mb_type isn't the one written in the bitstream -> only internal usage */
-#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 || (type) == I_PCM )
-#define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
-#define IS_DIRECT(type)  ( (type) == B_DIRECT )
-enum mb_class_e
-{
-    I_4x4           = 0,
-    I_8x8           = 1,
-    I_16x16         = 2,
-    I_PCM           = 3,
-
-    P_L0            = 4,
-    P_8x8           = 5,
-    P_SKIP          = 6,
-
-    B_DIRECT        = 7,
-    B_L0_L0         = 8,
-    B_L0_L1         = 9,
-    B_L0_BI         = 10,
-    B_L1_L0         = 11,
-    B_L1_L1         = 12,
-    B_L1_BI         = 13,
-    B_BI_L0         = 14,
-    B_BI_L1         = 15,
-    B_BI_BI         = 16,
-    B_8x8           = 17,
-    B_SKIP          = 18,
-
-    X264_MBTYPE_MAX = 19
-};
-static const uint8_t x264_mb_type_fix[X264_MBTYPE_MAX] =
-{
-    I_4x4, I_4x4, I_16x16, I_PCM,
-    P_L0, P_8x8, P_SKIP,
-    B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1,
-    B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP
-};
-static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] =
-{
-    {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, /* INTRA */
-    {{1,1},{0,0}},                                              /* P_L0 */
-    {{0,0},{0,0}},                                              /* P_8x8 */
-    {{1,1},{0,0}},                                              /* P_SKIP */
-    {{0,0},{0,0}},                                              /* B_DIRECT */
-    {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}},                /* B_L0_* */
-    {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}},                /* B_L1_* */
-    {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}},                /* B_BI_* */
-    {{0,0},{0,0}},                                              /* B_8x8 */
-    {{0,0},{0,0}}                                               /* B_SKIP */
-};
-
-#define IS_SUB4x4(type) ( (type ==D_L0_4x4)||(type ==D_L1_4x4)||(type ==D_BI_4x4))
-#define IS_SUB4x8(type) ( (type ==D_L0_4x8)||(type ==D_L1_4x8)||(type ==D_BI_4x8))
-#define IS_SUB8x4(type) ( (type ==D_L0_8x4)||(type ==D_L1_8x4)||(type ==D_BI_8x4))
-#define IS_SUB8x8(type) ( (type ==D_L0_8x8)||(type ==D_L1_8x8)||(type ==D_BI_8x8)||(type ==D_DIRECT_8x8))
-enum mb_partition_e
-{
-    /* sub partition type for P_8x8 and B_8x8 */
-    D_L0_4x4          = 0,
-    D_L0_8x4          = 1,
-    D_L0_4x8          = 2,
-    D_L0_8x8          = 3,
-
-    /* sub partition type for B_8x8 only */
-    D_L1_4x4          = 4,
-    D_L1_8x4          = 5,
-    D_L1_4x8          = 6,
-    D_L1_8x8          = 7,
-
-    D_BI_4x4          = 8,
-    D_BI_8x4          = 9,
-    D_BI_4x8          = 10,
-    D_BI_8x8          = 11,
-    D_DIRECT_8x8      = 12,
-
-    /* partition */
-    D_8x8             = 13,
-    D_16x8            = 14,
-    D_8x16            = 15,
-    D_16x16           = 16,
-    X264_PARTTYPE_MAX = 17,
-};
-
-static const uint8_t x264_mb_partition_listX_table[2][17] =
-{{
-    1, 1, 1, 1, /* D_L0_* */
-    0, 0, 0, 0, /* D_L1_* */
-    1, 1, 1, 1, /* D_BI_* */
-    0,          /* D_DIRECT_8x8 */
-    0, 0, 0, 0  /* 8x8 .. 16x16 */
-},
-{
-    0, 0, 0, 0, /* D_L0_* */
-    1, 1, 1, 1, /* D_L1_* */
-    1, 1, 1, 1, /* D_BI_* */
-    0,          /* D_DIRECT_8x8 */
-    0, 0, 0, 0  /* 8x8 .. 16x16 */
-}};
-static const uint8_t x264_mb_partition_count_table[17] =
-{
-    /* sub L0 */
-    4, 2, 2, 1,
-    /* sub L1 */
-    4, 2, 2, 1,
-    /* sub BI */
-    4, 2, 2, 1,
-    /* Direct */
-    1,
-    /* Partition */
-    4, 2, 2, 1
-};
-static const uint8_t x264_mb_partition_pixel_table[17] =
-{
-    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L0_* */
-    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L1_* */
-    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_BI_* */
-    PIXEL_8x8,                                      /* D_DIRECT_8x8 */
-    PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16, /* 8x8 .. 16x16 */
-};
-
-/* zigzags are transposed with respect to the tables in the standard */
-static const uint8_t x264_zigzag_scan4[2][16] =
-{{ // frame
-    0,  4,  1,  2,  5,  8, 12,  9,  6,  3,  7, 10, 13, 14, 11, 15
-},
-{  // field
-    0,  1,  4,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
-}};
-static const uint8_t x264_zigzag_scan8[2][64] =
-{{
-    0,  8,  1,  2,  9, 16, 24, 17, 10,  3,  4, 11, 18, 25, 32, 40,
-   33, 26, 19, 12,  5,  6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
-   28, 21, 14,  7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
-   23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
-},
-{
-    0,  1,  2,  8,  9,  3,  4, 10, 16, 11,  5,  6,  7, 12, 17, 24,
-   18, 13, 14, 15, 19, 25, 32, 26, 20, 21, 22, 23, 27, 33, 40, 34,
-   28, 29, 30, 31, 35, 41, 48, 42, 36, 37, 38, 39, 43, 49, 50, 44,
-   45, 46, 47, 51, 56, 57, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63
-}};
-
-static const uint8_t block_idx_x[16] =
-{
-    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
-};
-static const uint8_t block_idx_y[16] =
-{
-    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
-};
-static const uint8_t block_idx_xy[4][4] =
-{
-    { 0, 2, 8,  10 },
-    { 1, 3, 9,  11 },
-    { 4, 6, 12, 14 },
-    { 5, 7, 13, 15 }
-};
-static const uint8_t block_idx_xy_1d[16] =
-{
-    0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
-};
-static const uint8_t block_idx_yx_1d[16] =
-{
-    0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
-};
-static const uint8_t block_idx_xy_fenc[16] =
-{
-    0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE,
-    0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE,
-    2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE,
-    2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE,
-    0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE,
-    0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE,
-    2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE,
-    2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE
-};
-static const uint16_t block_idx_xy_fdec[16] =
-{
-    0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE,
-    0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE,
-    2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE,
-    2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE,
-    0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE,
-    0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE,
-    2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE,
-    2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
-};
-
-#define QP(qP) ( (qP)+QP_BD_OFFSET )
-static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
-{
-         0,      0,      0,      0,      0,      0,
-         0,      0,      0,      0,      0,      0,
-#if BIT_DEPTH > 9
-   QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
-#endif
-#if BIT_DEPTH > 8
-    QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
-#endif
-     QP(0),  QP(1),  QP(2),  QP(3),  QP(4),  QP(5),
-     QP(6),  QP(7),  QP(8),  QP(9), QP(10), QP(11),
-    QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
-    QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
-    QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
-    QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
-    QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
-    QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
-    QP(39), QP(39), QP(39), QP(39),
-    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
-    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
-};
-#undef QP
-
-enum cabac_ctx_block_cat_e
-{
-    DCT_LUMA_DC     = 0,
-    DCT_LUMA_AC     = 1,
-    DCT_LUMA_4x4    = 2,
-    DCT_CHROMA_DC   = 3,
-    DCT_CHROMA_AC   = 4,
-    DCT_LUMA_8x8    = 5,
-    DCT_CHROMAU_DC  = 6,
-    DCT_CHROMAU_AC  = 7,
-    DCT_CHROMAU_4x4 = 8,
-    DCT_CHROMAU_8x8 = 9,
-    DCT_CHROMAV_DC  = 10,
-    DCT_CHROMAV_AC  = 11,
-    DCT_CHROMAV_4x4 = 12,
-    DCT_CHROMAV_8x8 = 13,
-};
-
-static const uint8_t ctx_cat_plane[6][3] =
-{
-    { DCT_LUMA_DC,  DCT_CHROMAU_DC,  DCT_CHROMAV_DC},
-    { DCT_LUMA_AC,  DCT_CHROMAU_AC,  DCT_CHROMAV_AC},
-    {DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4},
-    {0},
-    {0},
-    {DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8}
-};
-
-/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
-int  x264_macroblock_cache_allocate( x264_t *h );
-void x264_macroblock_cache_free( x264_t *h );
-
-/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
-int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
-void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
-
-void x264_macroblock_slice_init( x264_t *h );
-void x264_macroblock_thread_init( x264_t *h );
-void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
-void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
-void x264_macroblock_deblock_strength( x264_t *h );
-void x264_macroblock_cache_save( x264_t *h );
-
-void x264_macroblock_bipred_init( x264_t *h );
-
-void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
-
-void x264_copy_column8( pixel *dst, pixel *src );
-
-/* x264_mb_predict_mv_16x16:
- *      set mvp with predicted mv for D_16x16 block
- *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
-/* x264_mb_predict_mv_pskip:
- *      set mvp with predicted mv for P_SKIP
- *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
-/* x264_mb_predict_mv:
- *      set mvp with predicted mv for all blocks except SKIP and DIRECT
- *      h->mb. need valid ref/partition/sub of current block to be valid
- *      and valid mv/ref from other blocks. */
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
-/* x264_mb_predict_mv_direct16x16:
- *      set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
- *      h->mb. need only valid values from other blocks.
- *      return 1 on success, 0 on failure.
- *      if b_changed != NULL, set it to whether refs or mvs differ from
- *      before this functioncall. */
-int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
-/* x264_mb_predict_mv_ref16x16:
- *      set mvc with D_16x16 prediction.
- *      uses all neighbors, even those that didn't end up using this ref.
- *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
-
-void x264_mb_mc( x264_t *h );
-void x264_mb_mc_8x8( x264_t *h, int i8 );
-
-static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b )
-{
-#if WORDS_BIGENDIAN
-   return b + (a<<16);
-#else
-   return a + (b<<16);
-#endif
-}
-static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b )
-{
-#if WORDS_BIGENDIAN
-   return b + (a<<8);
-#else
-   return a + (b<<8);
-#endif
-}
-static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
-{
-#if WORDS_BIGENDIAN
-   return d + (c<<8) + (b<<16) + (a<<24);
-#else
-   return a + (b<<8) + (c<<16) + (d<<24);
-#endif
-}
-static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
-{
-#if WORDS_BIGENDIAN
-   return (b&0xFFFF) + (a<<16);
-#else
-   return (a&0xFFFF) + (b<<16);
-#endif
-}
-static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
-{
-#if WORDS_BIGENDIAN
-   return b + ((uint64_t)a<<32);
-#else
-   return a + ((uint64_t)b<<32);
-#endif
-}
-
-#if HIGH_BIT_DEPTH
-#   define pack_pixel_1to2 pack16to32
-#   define pack_pixel_2to4 pack32to64
-#else
-#   define pack_pixel_1to2 pack8to16
-#   define pack_pixel_2to4 pack16to32
-#endif
-
-static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
-{
-    const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
-    const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
-    const int m  = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
-                             x264_mb_pred_mode4x4_fix(mb) );
-
-    if( m < 0 )
-        return I_PRED_4x4_DC;
-
-    return m;
-}
-static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx )
-{
-    const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
-    const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];
-
-    int i_ret = za + zb;
-
-    if( i_ret < 0x80 )
-        i_ret = ( i_ret + 1 ) >> 1;
-    return i_ret & 0x7f;
-}
-
-/* intra and skip are disallowed, p8x8 is conditional. */
-static const uint8_t x264_transform_allowed[X264_MBTYPE_MAX] =
-{
-    0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0
-};
-
-/* x264_mb_transform_8x8_allowed:
- *      check whether any partition is smaller than 8x8 (or at least
- *      might be, according to just partition type.)
- *      doesn't check for cbp */
-static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h )
-{
-    if( !h->pps->b_transform_8x8_mode )
-        return 0;
-    if( h->mb.i_type != P_8x8 )
-        return x264_transform_allowed[h->mb.i_type];
-    return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
-}
-
-#endif
-
diff --git a/android/src/main/libenc/jni/libx264/common/mc.c b/android/src/main/libenc/jni/libx264/common/mc.c
deleted file mode 100755
index dc39c5e..0000000
--- a/android/src/main/libenc/jni/libx264/common/mc.c
+++ /dev/null
@@ -1,770 +0,0 @@
-/*****************************************************************************
- * mc.c: motion compensation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#if HAVE_MMX
-#include "x86/mc.h"
-#endif
-#if ARCH_PPC
-#include "ppc/mc.h"
-#endif
-#if ARCH_ARM
-#include "arm/mc.h"
-#endif
-#if ARCH_AARCH64
-#include "aarch64/mc.h"
-#endif
-#if ARCH_MIPS
-#include "mips/mc.h"
-#endif
-
-
-static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
-                              pixel *src1, intptr_t i_src1_stride,
-                              pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
-{
-    for( int y = 0; y < i_height; y++ )
-    {
-        for( int x = 0; x < i_width; x++ )
-            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
-        dst  += i_dst_stride;
-        src1 += i_src1_stride;
-        src2 += i_src2_stride;
-    }
-}
-
-static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
-                                  pixel *src1, intptr_t i_src1,
-                                  pixel *src2, intptr_t i_src2, int width, int height )
-{
-    for( int y = 0; y < height; y++ )
-    {
-        for( int x = 0; x < width; x++ )
-            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
-        src1 += i_src1;
-        src2 += i_src2;
-        dst += i_dst;
-    }
-}
-
-/* Implicit weighted bipred only:
- * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-static inline void pixel_avg_weight_wxh( pixel *dst,  intptr_t i_dst,
-                                         pixel *src1, intptr_t i_src1,
-                                         pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
-{
-    int i_weight2 = 64 - i_weight1;
-    for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
-        for( int x = 0; x<width; x++ )
-            dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
-}
-#undef op_scale2
-
-#define PIXEL_AVG_C( name, width, height ) \
-static void name( pixel *pix1, intptr_t i_stride_pix1, \
-                  pixel *pix2, intptr_t i_stride_pix2, \
-                  pixel *pix3, intptr_t i_stride_pix3, int weight ) \
-{ \
-    if( weight == 32 ) \
-        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
-    else \
-        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
-}
-PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
-PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
-PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
-PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
-PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
-PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
-PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
-PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
-PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
-PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
-PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
-PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
-
-static void x264_weight_cache( x264_t *h, x264_weight_t *w )
-{
-    w->weightfn = h->mc.weight;
-}
-#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
-#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
-static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
-                       const x264_weight_t *weight, int i_width, int i_height )
-{
-    int offset = weight->i_offset << (BIT_DEPTH-8);
-    int scale = weight->i_scale;
-    int denom = weight->i_denom;
-    if( denom >= 1 )
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
-            for( int x = 0; x < i_width; x++ )
-                opscale( x );
-    }
-    else
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
-            for( int x = 0; x < i_width; x++ )
-                opscale_noden( x );
-    }
-}
-
-#define MC_WEIGHT_C( name, width ) \
-    static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
-{ \
-    mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
-}
-
-MC_WEIGHT_C( mc_weight_w20, 20 )
-MC_WEIGHT_C( mc_weight_w16, 16 )
-MC_WEIGHT_C( mc_weight_w12, 12 )
-MC_WEIGHT_C( mc_weight_w8,   8 )
-MC_WEIGHT_C( mc_weight_w4,   4 )
-MC_WEIGHT_C( mc_weight_w2,   2 )
-
-static weight_fn_t x264_mc_weight_wtab[6] =
-{
-    mc_weight_w2,
-    mc_weight_w4,
-    mc_weight_w8,
-    mc_weight_w12,
-    mc_weight_w16,
-    mc_weight_w20,
-};
-const x264_weight_t x264_weight_none[3] = { {{0}} };
-static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
-{
-    for( int y = 0; y < i_height; y++ )
-    {
-        memcpy( dst, src, i_width * sizeof(pixel) );
-
-        src += i_src_stride;
-        dst += i_dst_stride;
-    }
-}
-
-#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
-static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-                         intptr_t stride, int width, int height, int16_t *buf )
-{
-    const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
-    for( int y = 0; y < height; y++ )
-    {
-        for( int x = -2; x < width+3; x++ )
-        {
-            int v = TAPFILTER(src,stride);
-            dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
-            /* transform v for storage in a 16-bit integer */
-            buf[x+2] = v + pad;
-        }
-        for( int x = 0; x < width; x++ )
-            dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
-        for( int x = 0; x < width; x++ )
-            dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
-        dsth += stride;
-        dstv += stride;
-        dstc += stride;
-        src += stride;
-    }
-}
-
-const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2};
-
-static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
-                     pixel *src[4], intptr_t i_src_stride,
-                     int mvx, int mvy,
-                     int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        pixel_avg( dst, i_dst_stride, src1, i_src_stride,
-                   src2, i_src_stride, i_width, i_height );
-        if( weight->weightfn )
-            mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
-    }
-    else if( weight->weightfn )
-        mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
-    else
-        mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
-}
-
-static pixel *get_ref( pixel *dst,   intptr_t *i_dst_stride,
-                       pixel *src[4], intptr_t i_src_stride,
-                       int mvx, int mvy,
-                       int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
-                   src2, i_src_stride, i_width, i_height );
-        if( weight->weightfn )
-            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
-        return dst;
-    }
-    else if( weight->weightfn )
-    {
-        mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
-        return dst;
-    }
-    else
-    {
-        *i_dst_stride = i_src_stride;
-        return src1;
-    }
-}
-
-/* full chroma mc (ie until 1/8 pixel)*/
-static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
-                       pixel *src, intptr_t i_src_stride,
-                       int mvx, int mvy,
-                       int i_width, int i_height )
-{
-    pixel *srcp;
-
-    int d8x = mvx&0x07;
-    int d8y = mvy&0x07;
-    int cA = (8-d8x)*(8-d8y);
-    int cB = d8x    *(8-d8y);
-    int cC = (8-d8x)*d8y;
-    int cD = d8x    *d8y;
-
-    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
-    srcp = &src[i_src_stride];
-
-    for( int y = 0; y < i_height; y++ )
-    {
-        for( int x = 0; x < i_width; x++ )
-        {
-            dstu[x] = ( cA*src[2*x]  + cB*src[2*x+2] +
-                        cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
-            dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
-                        cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
-        }
-        dstu += i_dst_stride;
-        dstv += i_dst_stride;
-        src   = srcp;
-        srcp += i_src_stride;
-    }
-}
-
-#define MC_COPY(W) \
-static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
-{ \
-    mc_copy( src, i_src, dst, i_dst, W, i_height ); \
-}
-MC_COPY( 16 )
-MC_COPY( 8 )
-MC_COPY( 4 )
-
-void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
-                        pixel *src, intptr_t i_src, int w, int h )
-{
-    while( h-- )
-    {
-        memcpy( dst, src, w * sizeof(pixel) );
-        dst += i_dst;
-        src += i_src;
-    }
-}
-
-void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst,
-                             pixel *src, intptr_t i_src, int w, int h )
-{
-    for( int y=0; y<h; y++, dst+=i_dst, src+=i_src )
-        for( int x=0; x<2*w; x+=2 )
-        {
-            dst[x]   = src[x+1];
-            dst[x+1] = src[x];
-        }
-}
-
-void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
-                                   pixel *srcu, intptr_t i_srcu,
-                                   pixel *srcv, intptr_t i_srcv, int w, int h )
-{
-    for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
-        for( int x=0; x<w; x++ )
-        {
-            dst[2*x]   = srcu[x];
-            dst[2*x+1] = srcv[x];
-        }
-}
-
-static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
-                                            pixel *dstv, intptr_t i_dstv,
-                                            pixel *src,  intptr_t i_src, int w, int h )
-{
-    for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
-        for( int x=0; x<w; x++ )
-        {
-            dstu[x] = src[2*x];
-            dstv[x] = src[2*x+1];
-        }
-}
-
-static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
-                                                pixel *dstb, intptr_t i_dstb,
-                                                pixel *dstc, intptr_t i_dstc,
-                                                pixel *src,  intptr_t i_src, int pw, int w, int h )
-{
-    for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
-    {
-        for( int x=0; x<w; x++ )
-        {
-            dsta[x] = src[x*pw];
-            dstb[x] = src[x*pw+1];
-            dstc[x] = src[x*pw+2];
-        }
-    }
-}
-
-void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
-                                          pixel *dstc, intptr_t i_dstc,
-                                          uint32_t *src, intptr_t i_src, int w, int h )
-{
-    for( int l = 0; l < h; l++ )
-    {
-        pixel *dsty0 = dsty;
-        pixel *dstc0 = dstc;
-        uint32_t *src0 = src;
-
-        for( int n = 0; n < w; n += 3 )
-        {
-            *(dstc0++) = *src0 & 0x03FF;
-            *(dsty0++) = ( *src0 >> 10 ) & 0x03FF;
-            *(dstc0++) = ( *src0 >> 20 ) & 0x03FF;
-            src0++;
-            *(dsty0++) = *src0 & 0x03FF;
-            *(dstc0++) = ( *src0 >> 10 ) & 0x03FF;
-            *(dsty0++) = ( *src0 >> 20 ) & 0x03FF;
-            src0++;
-        }
-
-        dsty += i_dsty;
-        dstc += i_dstc;
-        src  += i_src;
-    }
-}
-
-static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
-{
-    for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
-        for( int x=0; x<8; x++ )
-        {
-            dst[2*x]   = srcu[x];
-            dst[2*x+1] = srcv[x];
-        }
-}
-
-static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
-{
-    x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
-}
-
-static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
-{
-    x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
-}
-
-static void prefetch_fenc_null( pixel *pix_y,  intptr_t stride_y,
-                                pixel *pix_uv, intptr_t stride_uv, int mb_x )
-{}
-
-static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
-{}
-
-static void memzero_aligned( void * dst, size_t n )
-{
-    memset( dst, 0, n );
-}
-
-static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
-{
-    int v = pix[0]+pix[1]+pix[2]+pix[3];
-    for( int x = 0; x < stride-4; x++ )
-    {
-        sum[x] = v + sum[x-stride];
-        v += pix[x+4] - pix[x];
-    }
-}
-
-static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
-{
-    int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
-    for( int x = 0; x < stride-8; x++ )
-    {
-        sum[x] = v + sum[x-stride];
-        v += pix[x+8] - pix[x];
-    }
-}
-
-static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
-{
-    for( int x = 0; x < stride-8; x++ )
-        sum4[x] = sum8[x+4*stride] - sum8[x];
-    for( int x = 0; x < stride-8; x++ )
-        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
-}
-
-static void integral_init8v( uint16_t *sum8, intptr_t stride )
-{
-    for( int x = 0; x < stride-8; x++ )
-        sum8[x] = sum8[x+8*stride] - sum8[x];
-}
-
-void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
-{
-    pixel *src = frame->plane[0];
-    int i_stride = frame->i_stride[0];
-    int i_height = frame->i_lines[0];
-    int i_width  = frame->i_width[0];
-
-    // duplicate last row and column so that their interpolation doesn't have to be special-cased
-    for( int y = 0; y < i_height; y++ )
-        src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
-    memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * sizeof(pixel) );
-    h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
-                                  i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
-    x264_frame_expand_border_lowres( frame );
-
-    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );
-
-    for( int y = 0; y < h->param.i_bframe + 2; y++ )
-        for( int x = 0; x < h->param.i_bframe + 2; x++ )
-            frame->i_row_satds[y][x][0] = -1;
-
-    for( int y = 0; y <= !!h->param.i_bframe; y++ )
-        for( int x = 0; x <= h->param.i_bframe; x++ )
-            frame->lowres_mvs[y][x][0][0] = 0x7FFF;
-}
-
-static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
-                                    intptr_t src_stride, intptr_t dst_stride, int width, int height )
-{
-    for( int y = 0; y < height; y++ )
-    {
-        pixel *src1 = src0+src_stride;
-        pixel *src2 = src1+src_stride;
-        for( int x = 0; x<width; x++ )
-        {
-            // slower than naive bilinear, but matches asm
-#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
-            dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
-            dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
-            dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
-            dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
-#undef FILTER
-        }
-        src0 += src_stride*2;
-        dst0 += dst_stride;
-        dsth += dst_stride;
-        dstv += dst_stride;
-        dstc += dst_stride;
-    }
-}
-
-/* Estimate the total amount of influence on future quality that could be had if we
- * were to improve the reference samples used to inter predict any given macroblock. */
-static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
-{
-    float fps = *fps_factor;
-    for( int i = 0; i < len; i++ )
-    {
-        int intra_cost = intra_costs[i];
-        int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
-        float propagate_intra  = intra_cost * inv_qscales[i];
-        float propagate_amount = propagate_in[i] + propagate_intra*fps;
-        float propagate_num    = intra_cost - inter_cost;
-        float propagate_denom  = intra_cost;
-        dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
-    }
-}
-
-static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
-                                   int16_t *propagate_amount, uint16_t *lowres_costs,
-                                   int bipred_weight, int mb_y, int len, int list )
-{
-    unsigned stride = h->mb.i_mb_stride;
-    unsigned width = h->mb.i_mb_width;
-    unsigned height = h->mb.i_mb_height;
-
-    for( unsigned i = 0; i < len; i++ )
-    {
-        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
-
-        if( !(lists_used & (1 << list)) )
-            continue;
-
-        int listamount = propagate_amount[i];
-        /* Apply bipred weighting. */
-        if( lists_used == 3 )
-            listamount = (listamount * bipred_weight + 32) >> 6;
-
-        /* Early termination for simple case of mv0. */
-        if( !M32( mvs[i] ) )
-        {
-            MC_CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
-            continue;
-        }
-
-        int x = mvs[i][0];
-        int y = mvs[i][1];
-        unsigned mbx = (x>>5)+i;
-        unsigned mby = (y>>5)+mb_y;
-        unsigned idx0 = mbx + mby * stride;
-        unsigned idx2 = idx0 + stride;
-        x &= 31;
-        y &= 31;
-        int idx0weight = (32-y)*(32-x);
-        int idx1weight = (32-y)*x;
-        int idx2weight = y*(32-x);
-        int idx3weight = y*x;
-        idx0weight = (idx0weight * listamount + 512) >> 10;
-        idx1weight = (idx1weight * listamount + 512) >> 10;
-        idx2weight = (idx2weight * listamount + 512) >> 10;
-        idx3weight = (idx3weight * listamount + 512) >> 10;
-
-        if( mbx < width-1 && mby < height-1 )
-        {
-            MC_CLIP_ADD( ref_costs[idx0+0], idx0weight );
-            MC_CLIP_ADD( ref_costs[idx0+1], idx1weight );
-            MC_CLIP_ADD( ref_costs[idx2+0], idx2weight );
-            MC_CLIP_ADD( ref_costs[idx2+1], idx3weight );
-        }
-        else
-        {
-            /* Note: this takes advantage of unsigned representation to
-             * catch negative mbx/mby. */
-            if( mby < height )
-            {
-                if( mbx < width )
-                    MC_CLIP_ADD( ref_costs[idx0+0], idx0weight );
-                if( mbx+1 < width )
-                    MC_CLIP_ADD( ref_costs[idx0+1], idx1weight );
-            }
-            if( mby+1 < height )
-            {
-                if( mbx < width )
-                    MC_CLIP_ADD( ref_costs[idx2+0], idx2weight );
-                if( mbx+1 < width )
-                    MC_CLIP_ADD( ref_costs[idx2+1], idx3weight );
-            }
-        }
-    }
-}
-
-/* Conversion between float and Q8.8 fixed point (big-endian) for storage */
-static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
-{
-    for( int i = 0; i < count; i++ )
-        dst[i] = endian_fix16( (int16_t)(src[i] * 256.0f) );
-}
-
-static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
-{
-    for( int i = 0; i < count; i++ )
-        dst[i] = (int16_t)endian_fix16( src[i] ) * (1.0f/256.0f);
-}
-
-void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
-{
-    pf->mc_luma   = mc_luma;
-    pf->get_ref   = get_ref;
-
-    pf->mc_chroma = mc_chroma;
-
-    pf->avg[PIXEL_16x16]= pixel_avg_16x16;
-    pf->avg[PIXEL_16x8] = pixel_avg_16x8;
-    pf->avg[PIXEL_8x16] = pixel_avg_8x16;
-    pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
-    pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
-    pf->avg[PIXEL_4x16] = pixel_avg_4x16;
-    pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
-    pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
-    pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
-    pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
-    pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
-    pf->avg[PIXEL_2x2]  = pixel_avg_2x2;
-
-    pf->weight    = x264_mc_weight_wtab;
-    pf->offsetadd = x264_mc_weight_wtab;
-    pf->offsetsub = x264_mc_weight_wtab;
-    pf->weight_cache = x264_weight_cache;
-
-    pf->copy_16x16_unaligned = mc_copy_w16;
-    pf->copy[PIXEL_16x16] = mc_copy_w16;
-    pf->copy[PIXEL_8x8]   = mc_copy_w8;
-    pf->copy[PIXEL_4x4]   = mc_copy_w4;
-
-    pf->store_interleave_chroma       = store_interleave_chroma;
-    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
-    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;
-
-    pf->plane_copy = x264_plane_copy_c;
-    pf->plane_copy_swap = x264_plane_copy_swap_c;
-    pf->plane_copy_interleave = x264_plane_copy_interleave_c;
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
-    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
-    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
-
-    pf->hpel_filter = hpel_filter;
-
-    pf->prefetch_fenc_420 = prefetch_fenc_null;
-    pf->prefetch_fenc_422 = prefetch_fenc_null;
-    pf->prefetch_ref  = prefetch_ref_null;
-    pf->memcpy_aligned = memcpy;
-    pf->memzero_aligned = memzero_aligned;
-    pf->frame_init_lowres_core = frame_init_lowres_core;
-
-    pf->integral_init4h = integral_init4h;
-    pf->integral_init8h = integral_init8h;
-    pf->integral_init4v = integral_init4v;
-    pf->integral_init8v = integral_init8v;
-
-    pf->mbtree_propagate_cost = mbtree_propagate_cost;
-    pf->mbtree_propagate_list = mbtree_propagate_list;
-    pf->mbtree_fix8_pack      = mbtree_fix8_pack;
-    pf->mbtree_fix8_unpack    = mbtree_fix8_unpack;
-
-#if HAVE_MMX
-    x264_mc_init_mmx( cpu, pf );
-#endif
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-        x264_mc_altivec_init( pf );
-#endif
-#if HAVE_ARMV6
-    x264_mc_init_arm( cpu, pf );
-#endif
-#if ARCH_AARCH64
-    x264_mc_init_aarch64( cpu, pf );
-#endif
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-        x264_mc_init_mips( cpu, pf );
-#endif
-
-    if( cpu_independent )
-    {
-        pf->mbtree_propagate_cost = mbtree_propagate_cost;
-        pf->mbtree_propagate_list = mbtree_propagate_list;
-    }
-}
-
-void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
-{
-    const int b_interlaced = PARAM_INTERLACED;
-    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
-    int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;
-
-    if( mb_y & b_interlaced )
-        return;
-
-    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
-    {
-        int stride = frame->i_stride[p];
-        const int width = frame->i_width[p];
-        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
-
-        if( !b_interlaced || h->mb.b_adaptive_mbaff )
-            h->mc.hpel_filter(
-                frame->filtered[p][1] + offs,
-                frame->filtered[p][2] + offs,
-                frame->filtered[p][3] + offs,
-                frame->plane[p] + offs,
-                stride, width + 16, height - start,
-                h->scratch_buffer );
-
-        if( b_interlaced )
-        {
-            /* MC must happen between pixels in the same field. */
-            stride = frame->i_stride[p] << 1;
-            start = (mb_y*16 >> 1) - 8;
-            int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
-            offs = start*stride - 8;
-            for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
-            {
-                h->mc.hpel_filter(
-                    frame->filtered_fld[p][1] + offs,
-                    frame->filtered_fld[p][2] + offs,
-                    frame->filtered_fld[p][3] + offs,
-                    frame->plane_fld[p] + offs,
-                    stride, width + 16, height_fld - start,
-                    h->scratch_buffer );
-            }
-        }
-    }
-
-    /* generate integral image:
-     * frame->integral contains 2 planes. in the upper plane, each element is
-     * the sum of an 8x8 pixel region with top-left corner on that point.
-     * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
-
-    if( frame->integral )
-    {
-        int stride = frame->i_stride[0];
-        if( start < 0 )
-        {
-            memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
-            start = -PADV;
-        }
-        if( b_end )
-            height += PADV-9;
-        for( int y = start; y < height; y++ )
-        {
-            pixel    *pix  = frame->plane[0] + y * stride - PADH;
-            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
-            uint16_t *sum4;
-            if( h->frames.b_have_sub8x8_esa )
-            {
-                h->mc.integral_init4h( sum8, pix, stride );
-                sum8 -= 8*stride;
-                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
-                if( y >= 8-PADV )
-                    h->mc.integral_init4v( sum8, sum4, stride );
-            }
-            else
-            {
-                h->mc.integral_init8h( sum8, pix, stride );
-                if( y >= 8-PADV )
-                    h->mc.integral_init8v( sum8-8*stride, stride );
-            }
-        }
-    }
-}
diff --git a/android/src/main/libenc/jni/libx264/common/mc.h b/android/src/main/libenc/jni/libx264/common/mc.h
deleted file mode 100755
index 5a83ec2..0000000
--- a/android/src/main/libenc/jni/libx264/common/mc.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/*****************************************************************************
- * mc.h: motion compensation
- *****************************************************************************
- * Copyright (C) 2004-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MC_H
-#define X264_MC_H
-
-#define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
-#define MC_CLIP_ADD2(s,x)\
-do\
-{\
-    MC_CLIP_ADD((s)[0], (x)[0]);\
-    MC_CLIP_ADD((s)[1], (x)[1]);\
-} while(0)
-
-#define PROPAGATE_LIST(cpu)\
-void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
-                                                uint16_t *lowres_costs, int16_t *output,\
-                                                int bipred_weight, int mb_y, int len );\
-\
-static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
-                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
-                                              int bipred_weight, int mb_y, int len, int list )\
-{\
-    int16_t *current = h->scratch_buffer2;\
-\
-    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
-                                               current, bipred_weight, mb_y, len );\
-\
-    unsigned stride = h->mb.i_mb_stride;\
-    unsigned width = h->mb.i_mb_width;\
-    unsigned height = h->mb.i_mb_height;\
-\
-    for( unsigned i = 0; i < len; current += 32 )\
-    {\
-        int end = X264_MIN( i+8, len );\
-        for( ; i < end; i++, current += 2 )\
-        {\
-            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
-                continue;\
-\
-            unsigned mbx = current[0];\
-            unsigned mby = current[1];\
-            unsigned idx0 = mbx + mby * stride;\
-            unsigned idx2 = idx0 + stride;\
-\
-            /* Shortcut for the simple/common case of zero MV */\
-            if( !M32( mvs[i] ) )\
-            {\
-                MC_CLIP_ADD( ref_costs[idx0], current[16] );\
-                continue;\
-            }\
-\
-            if( mbx < width-1 && mby < height-1 )\
-            {\
-                MC_CLIP_ADD2( ref_costs+idx0, current+16 );\
-                MC_CLIP_ADD2( ref_costs+idx2, current+32 );\
-            }\
-            else\
-            {\
-                /* Note: this takes advantage of unsigned representation to\
-                 * catch negative mbx/mby. */\
-                if( mby < height )\
-                {\
-                    if( mbx < width )\
-                        MC_CLIP_ADD( ref_costs[idx0+0], current[16] );\
-                    if( mbx+1 < width )\
-                        MC_CLIP_ADD( ref_costs[idx0+1], current[17] );\
-                }\
-                if( mby+1 < height )\
-                {\
-                    if( mbx < width )\
-                        MC_CLIP_ADD( ref_costs[idx2+0], current[32] );\
-                    if( mbx+1 < width )\
-                        MC_CLIP_ADD( ref_costs[idx2+1], current[33] );\
-                }\
-            }\
-        }\
-    }\
-}
-
-void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-
-#define PLANE_COPY(align, cpu)\
-static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
-{\
-    int c_w = (align) / sizeof(pixel) - 1;\
-    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
-        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
-    else if( !(w&c_w) )\
-        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
-    else\
-    {\
-        if( --h > 0 )\
-        {\
-            if( i_src > 0 )\
-            {\
-                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
-                dst += i_dst * h;\
-                src += i_src * h;\
-            }\
-            else\
-                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
-        }\
-        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
-        memcpy( dst, src, w*sizeof(pixel) );\
-    }\
-}
-
-void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-
-#define PLANE_COPY_SWAP(align, cpu)\
-static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
-{\
-    int c_w = (align>>1) / sizeof(pixel) - 1;\
-    if( !(w&c_w) )\
-        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
-    else if( w > c_w )\
-    {\
-        if( --h > 0 )\
-        {\
-            if( i_src > 0 )\
-            {\
-                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
-                dst += i_dst * h;\
-                src += i_src * h;\
-            }\
-            else\
-                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
-        }\
-        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
-        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
-        {\
-            dst[x]   = src[x+1];\
-            dst[x+1] = src[x];\
-        }\
-    }\
-    else\
-        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
-}
-
-void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
-                                   pixel *srcu, intptr_t i_srcu,
-                                   pixel *srcv, intptr_t i_srcv, int w, int h );
-
-#define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
-                                              pixel *srcu, intptr_t i_srcu,\
-                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
-{\
-    int c_w = 16 / sizeof(pixel) - 1;\
-    if( !(w&c_w) )\
-        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
-    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
-    {\
-        if( --h > 0 )\
-        {\
-            if( i_srcu > 0 )\
-            {\
-                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
-                dst  += i_dst  * h;\
-                srcu += i_srcu * h;\
-                srcv += i_srcv * h;\
-            }\
-            else\
-                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
-        }\
-        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
-    }\
-    else\
-        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
-}
-
-struct x264_weight_t;
-typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
-typedef struct x264_weight_t
-{
-    /* aligning the first member is a gcc hack to force the struct to be
-     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
-    ALIGNED_16( int16_t cachea[8] );
-    int16_t cacheb[8];
-    int32_t i_denom;
-    int32_t i_scale;
-    int32_t i_offset;
-    weight_fn_t *weightfn;
-} ALIGNED_16( x264_weight_t );
-
-extern const x264_weight_t x264_weight_none[3];
-extern const uint8_t x264_hpel_ref0[16];
-extern const uint8_t x264_hpel_ref1[16];
-
-#define SET_WEIGHT( w, b, s, d, o )\
-{\
-    (w).i_scale = (s);\
-    (w).i_denom = (d);\
-    (w).i_offset = (o);\
-    if( b )\
-        h->mc.weight_cache( h, &w );\
-    else\
-        w.weightfn = NULL;\
-}
-
-/* Do the MC
- * XXX: Only width = 4, 8 or 16 are valid
- * width == 4 -> height == 4 or 8
- * width == 8 -> height == 4 or 8 or 16
- * width == 16-> height == 8 or 16
- * */
-
-typedef struct
-{
-    void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src,
-                     int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
-
-    /* may round up the dimensions if they're not a power of 2 */
-    pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src,
-                       int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
-
-    /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
-     * so it must be run from left to right. */
-    void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,
-                       int mvx, int mvy, int i_width, int i_height );
-
-    void (*avg[12])( pixel *dst,  intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
-                     pixel *src2, intptr_t src2_stride, int i_weight );
-
-    /* only 16x16, 8x8, and 4x4 defined */
-    void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
-    void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
-
-    void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-    void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height );
-    void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height );
-
-    void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
-    void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
-    void (*plane_copy_interleave)( pixel *dst,  intptr_t i_dst, pixel *srcu, intptr_t i_srcu,
-                                   pixel *srcv, intptr_t i_srcv, int w, int h );
-    /* may write up to 15 pixels off the end of each plane */
-    void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
-                                     pixel *src,  intptr_t i_src, int w, int h );
-    void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
-                                         pixel *dstc, intptr_t i_dstc, pixel *src,  intptr_t i_src, int pw, int w, int h );
-    void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
-                                          pixel *dstc, intptr_t i_dstc,
-                                          uint32_t *src, intptr_t i_src, int w, int h );
-    void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-                         intptr_t i_stride, int i_width, int i_height, int16_t *buf );
-
-    /* prefetch the next few macroblocks of fenc or fdec */
-    void (*prefetch_fenc)    ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
-    void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
-    void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
-    /* prefetch the next few macroblocks of a hpel reference frame */
-    void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity );
-
-    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
-    void (*memzero_aligned)( void *dst, size_t n );
-
-    /* successive elimination prefilter */
-    void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride );
-    void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride );
-    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
-    void (*integral_init8v)( uint16_t *sum8, intptr_t stride );
-
-    void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
-                                    intptr_t src_stride, intptr_t dst_stride, int width, int height );
-    weight_fn_t *weight;
-    weight_fn_t *offsetadd;
-    weight_fn_t *offsetsub;
-    void (*weight_cache)( x264_t *, x264_weight_t * );
-
-    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
-                                   int16_t *propagate_amount, uint16_t *lowres_costs,
-                                   int bipred_weight, int mb_y, int len, int list );
-    void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
-    void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
-} x264_mc_functions_t;
-
-void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/dct-c.c b/android/src/main/libenc/jni/libx264/common/mips/dct-c.c
deleted file mode 100755
index c9f5687..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/dct-c.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*****************************************************************************
- * dct-c.c: msa transform and zigzag
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Rishikesh More <rishikesh.more@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macros.h"
-
-#if !HIGH_BIT_DEPTH
-#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )          \
-{                                                                           \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-                                                                            \
-    tmp0_m = in0 + in2;                                                     \
-    tmp1_m = in0 - in2;                                                     \
-    tmp2_m = in1 >> 1;                                                      \
-    tmp2_m = tmp2_m - in3;                                                  \
-    tmp3_m = in3 >> 1;                                                      \
-    tmp3_m = in1 + tmp3_m;                                                  \
-                                                                            \
-    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 );  \
-}
-
-static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
-                              int32_t i_src_stride )
-{
-    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
-    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
-    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
-    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;
-
-    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
-    UNPCK_R_SH_SW( src0, src0_r );
-    UNPCK_R_SH_SW( src1, src1_r );
-    UNPCK_R_SH_SW( src2, src2_r );
-    UNPCK_R_SH_SW( src3, src3_r );
-    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
-                 tmp0, tmp3, tmp2, tmp1 );
-    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
-                 hor_res0, hor_res3, hor_res2, hor_res1 );
-    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
-                        hor_res0, hor_res1, hor_res2, hor_res3 );
-    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
-                 tmp0, tmp3, tmp2, tmp1 );
-    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
-                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
-    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
-    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
-                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
-                 ver_res0, ver_res1, ver_res2, ver_res3 );
-    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
-    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
-}
-
-static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref, int32_t i_dst_stride,
-                                int16_t *p_dst )
-{
-    uint32_t i_src0, i_src1, i_src2, i_src3;
-    uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
-    v16i8 src = { 0 };
-    v16i8 ref = { 0 };
-    v16u8 inp0, inp1;
-    v8i16 diff0, diff1, diff2, diff3;
-    v8i16 temp0, temp1, temp2, temp3;
-
-    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
-    LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
-
-    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
-    INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
-
-    ILVRL_B2_UB( src, ref, inp0, inp1 );
-
-    HSUB_UB2_SH( inp0, inp1, diff0, diff2 );
-
-    diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
-    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
-
-    BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
-
-    diff0 = temp0 + temp1;
-    diff1 = ( temp3 << 1 ) + temp2;
-    diff2 = temp0 - temp1;
-    diff3 = temp3 - ( temp2 << 1 );
-
-    TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
-                        temp0, temp1, temp2, temp3 );
-    BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
-
-    temp0 = diff0 + diff1;
-    temp1 = ( diff3 << 1 ) + diff2;
-    temp2 = diff0 - diff1;
-    temp3 = diff3 - ( diff2 << 1 );
-
-    ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
-    ST_UB2( inp0, inp1, p_dst, 8 );
-}
-
-static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
-                                           int16_t pi_level[16] )
-{
-    v8i16 src0, src1;
-    v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
-    v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };
-
-    LD_SH2( pi_dct, 8, src0, src1 );
-    VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
-    ST_SH2( mask0, mask1, pi_level, 8 );
-}
-
-static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
-                                    int32_t i_dst_stride )
-{
-    v8i16 src0, src1, src2, src3;
-    v8i16 hres0, hres1, hres2, hres3;
-    v8i16 vres0, vres1, vres2, vres3;
-    v8i16 zeros = { 0 };
-
-    LD4x4_SH( p_src, src0, src1, src2, src3 );
-    AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
-    TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
-                        hres0, hres1, hres2, hres3 );
-    AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
-    SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
-    ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
-    ST_SH2( zeros, zeros, p_src, 8 );
-}
-
-static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
-                                       int32_t i_dst_stride )
-{
-    int16_t i_dc;
-    uint32_t i_src0, i_src1, i_src2, i_src3;
-    v16u8 pred = { 0 };
-    v16i8 out;
-    v8i16 input_dc, pred_r, pred_l;
-
-    i_dc = ( p_src[0] + 32 ) >> 6;
-    input_dc = __msa_fill_h( i_dc );
-    p_src[ 0 ] = 0;
-
-    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
-    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
-    UNPCK_UB_SH( pred, pred_r, pred_l );
-
-    pred_r += input_dc;
-    pred_l += input_dc;
-
-    CLIP_SH2_0_255( pred_r, pred_l );
-    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
-    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
-}
-
-static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
-                                  int32_t i_dst_stride )
-{
-    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
-    v8i16 vec0, vec1, vec2, vec3;
-    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
-    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
-    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
-    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
-    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
-    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
-    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-    v16i8 zeros = { 0 };
-
-    p_src[ 0 ] += 32;
-
-    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
-
-    vec0 = src0 + src4;
-    vec1 = src0 - src4;
-    vec2 = src2 >> 1;
-    vec2 = vec2 - src6;
-    vec3 = src6 >> 1;
-    vec3 = src2 + vec3;
-
-    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );
-
-    vec0 = src7 >> 1;
-    vec0 = src5 - vec0 - src3 - src7;
-    vec1 = src3 >> 1;
-    vec1 = src1 - vec1 + src7 - src3;
-    vec2 = src5 >> 1;
-    vec2 = vec2 - src1 + src7 + src5;
-    vec3 = src1 >> 1;
-    vec3 = vec3 + src3 + src5 + src1;
-    tmp4 = vec3 >> 2;
-    tmp4 += vec0;
-    tmp5 = vec2 >> 2;
-    tmp5 += vec1;
-    tmp6 = vec1 >> 2;
-    tmp6 -= vec2;
-    tmp7 = vec0 >> 2;
-    tmp7 = vec3 - tmp7;
-
-    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
-                 res0, res1, res2, res3, res4, res5, res6, res7 );
-    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
-                        res0, res1, res2, res3, res4, res5, res6, res7 );
-    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
-    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
-    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
-    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
-    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
-    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
-    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
-    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
-    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
-                 vec0_r, vec0_l, vec1_l, vec1_r );
-
-    vec2_r = tmp2_r >> 1;
-    vec2_l = tmp2_l >> 1;
-    vec2_r -= tmp6_r;
-    vec2_l -= tmp6_l;
-    vec3_r = tmp6_r >> 1;
-    vec3_l = tmp6_l >> 1;
-    vec3_r += tmp2_r;
-    vec3_l += tmp2_l;
-
-    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
-                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
-    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
-                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );
-
-    vec0_r = tmp7_r >> 1;
-    vec0_l = tmp7_l >> 1;
-    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
-    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
-    vec1_r = tmp3_r >> 1;
-    vec1_l = tmp3_l >> 1;
-    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
-    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
-    vec2_r = tmp5_r >> 1;
-    vec2_l = tmp5_l >> 1;
-    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
-    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
-    vec3_r = tmp1_r >> 1;
-    vec3_l = tmp1_l >> 1;
-    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
-    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
-    tmp1_r = vec3_r >> 2;
-    tmp1_l = vec3_l >> 2;
-    tmp1_r += vec0_r;
-    tmp1_l += vec0_l;
-    tmp3_r = vec2_r >> 2;
-    tmp3_l = vec2_l >> 2;
-    tmp3_r += vec1_r;
-    tmp3_l += vec1_l;
-    tmp5_r = vec1_r >> 2;
-    tmp5_l = vec1_l >> 2;
-    tmp5_r -= vec2_r;
-    tmp5_l -= vec2_l;
-    tmp7_r = vec0_r >> 2;
-    tmp7_l = vec0_l >> 2;
-    tmp7_r = vec3_r - tmp7_r;
-    tmp7_l = vec3_l - tmp7_l;
-
-    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
-                 res0_r, res0_l, res7_l, res7_r );
-    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
-                 res1_r, res1_l, res6_l, res6_r );
-    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
-                 res2_r, res2_l, res5_l, res5_r );
-    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
-                 res3_r, res3_l, res4_l, res4_r );
-    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
-    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
-    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
-    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
-    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
-                 res0, res1, res2, res3 );
-    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
-                 res4, res5, res6, res7 );
-    LD_SB8( p_dst, i_dst_stride,
-            dst0, dst1, dst2, dst3,
-            dst4, dst5, dst6, dst7 );
-    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
-                tmp0, tmp1, tmp2, tmp3 );
-    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
-                tmp4, tmp5, tmp6, tmp7 );
-    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
-          res0, res1, res2, res3 );
-    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
-          res4, res5, res6, res7 );
-    CLIP_SH4_0_255( res0, res1, res2, res3 );
-    CLIP_SH4_0_255( res4, res5, res6, res7 );
-    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
-                 dst0, dst1, dst2, dst3 );
-    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
-    p_dst += ( 4 * i_dst_stride );
-    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
-}
-
-static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
-                               int16_t *p_dst, int32_t i_dst_stride )
-{
-    v8i16 src0, src1, src2, src3;
-    v4i32 src0_r, src1_r, src2_r, src3_r;
-    v4i32 hres0, hres1, hres2, hres3;
-    v8i16 vres0, vres1, vres2, vres3;
-    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-    v2i64 res0, res1;
-
-    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
-    UNPCK_R_SH_SW( src0, src0_r );
-    UNPCK_R_SH_SW( src1, src1_r );
-    UNPCK_R_SH_SW( src2, src2_r );
-    UNPCK_R_SH_SW( src3, src3_r );
-    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
-    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
-    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
-                        hres0, hres1, hres2, hres3 );
-    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
-    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
-    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
-                 vres0, vres1, vres2, vres3 );
-    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
-    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
-}
-
-static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
-                                    uint8_t *pred_ptr, int32_t i_pred_stride )
-{
-    int16_t i_sum;
-    uint32_t i_src0, i_src1, i_src2, i_src3;
-    uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
-    v16i8 src = { 0 };
-    v16i8 pred = { 0 };
-    v16u8 src_l0, src_l1;
-    v8i16 diff0, diff1;
-
-    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
-    LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
-    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
-    INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
-    ILVRL_B2_UB( src, pred, src_l0, src_l1 );
-    HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
-    i_sum = HADD_UH_U32( diff0 + diff1 );
-
-    return i_sum;
-}
-
-void x264_dct4x4dc_msa( int16_t d[16] )
-{
-    avc_dct4x4dc_msa( d, d, 4 );
-}
-
-void x264_idct4x4dc_msa( int16_t d[16] )
-{
-    avc_idct4x4dc_msa( d, 4, d, 4 );
-}
-
-void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
-{
-    avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
-}
-
-void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
-{
-    avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
-    avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE );
-    avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0],
-                            &pi_dct[2][0], FDEC_STRIDE );
-    avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4],
-                            &pi_dct[3][0], FDEC_STRIDE );
-}
-
-void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
-{
-    x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] );
-    x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] );
-    x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] );
-    x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] );
-}
-
-void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
-{
-    avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
-}
-
-void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
-{
-    avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
-    avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE );
-    avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0],
-                          &pi_dct[2][0], FDEC_STRIDE );
-    avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8],
-                          &pi_dct[3][0], FDEC_STRIDE );
-}
-
-void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
-{
-    avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE );
-    avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE );
-    avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0],
-                               &pi_dct[2], FDEC_STRIDE );
-    avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4],
-                               &pi_dct[3], FDEC_STRIDE );
-}
-
-void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
-{
-    for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE )
-    {
-        avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE );
-        avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE );
-        avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE );
-        avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE );
-    }
-}
-
-void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
-                          uint8_t *p_ref )
-{
-    avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst );
-}
-
-void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
-                          uint8_t *p_ref )
-{
-    avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE,
-                        &p_ref[0], FDEC_STRIDE, p_dst[0] );
-    avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE, &p_ref[4],
-                        FDEC_STRIDE, p_dst[1] );
-    avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0],
-                        FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 0],
-                        FDEC_STRIDE, p_dst[2] );
-    avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4],
-                        FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 4],
-                        FDEC_STRIDE, p_dst[3] );
-}
-
-void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
-                            uint8_t *p_src,
-                            uint8_t *p_ref )
-{
-    x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] );
-    x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] );
-    x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0],
-                         &p_ref[8*FDEC_STRIDE+0] );
-    x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8],
-                         &p_ref[8*FDEC_STRIDE+8] );
-}
-
-void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
-                             uint8_t *p_pix1, uint8_t *p_pix2 )
-{
-    int32_t d0, d1, d2, d3;
-
-    pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
-                                     &p_pix2[0], FDEC_STRIDE );
-    pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
-                                     &p_pix2[4], FDEC_STRIDE );
-    pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
-                                     &p_pix2[4 * FDEC_STRIDE + 0],
-                                     FDEC_STRIDE );
-    pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
-                                     &p_pix2[4 * FDEC_STRIDE + 4],
-                                     FDEC_STRIDE );
-
-    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
-    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
-}
-
-void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
-                              uint8_t *p_pix1, uint8_t *p_pix2 )
-{
-    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
-    int32_t b0, b1, b2, b3, b4, b5, b6, b7;
-
-    a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
-                              &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
-    a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
-                              &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
-    a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
-                              &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
-    a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
-                              &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
-    a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
-                              &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
-    a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
-                              &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
-    a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
-                              &p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
-    a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
-                              &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );
-
-    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
-                 b0, b1, b2, b3, b7, b6, b5, b4 );
-    BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
-                 a0, a1, a2, a3, a7, a6, a5, a4 );
-    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
-                 pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
-                 pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
-}
-
-void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
-{
-    avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level );
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/dct.h b/android/src/main/libenc/jni/libx264/common/mips/dct.h
deleted file mode 100755
index a57fb4e..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/dct.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*****************************************************************************
- * dct.h: msa transform and zigzag
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Rishikesh More <rishikesh.more@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MIPS_DCT_H
-#define X264_MIPS_DCT_H
-
-void x264_dct4x4dc_msa( int16_t d[16] );
-void x264_idct4x4dc_msa( int16_t d[16] );
-void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
-void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
-void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
-void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
-void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
-void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
-void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
-void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
-void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
-                          uint8_t *p_ref );
-void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src,
-                            uint8_t *p_ref );
-void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1,
-                             uint8_t *p_pix2 );
-void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1,
-                              uint8_t *p_pix2 );
-void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/deblock-c.c b/android/src/main/libenc/jni/libx264/common/mips/deblock-c.c
deleted file mode 100755
index 4ced1dc..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/deblock-c.c
+++ /dev/null
@@ -1,2010 +0,0 @@
-/*****************************************************************************
- * deblock-c.c: msa deblocking
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Neha Rana <neha.rana@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macros.h"
-
-#if !HIGH_BIT_DEPTH
-#define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in,           \
-                                  q3_or_p3_org_in, p1_or_q1_org_in,           \
-                                  p2_or_q2_org_in, q1_or_p1_org_in,           \
-                                  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out )  \
-{                                                                             \
-    v8i16 threshold;                                                          \
-    v8i16 const3 = __msa_ldi_h( 3 );                                          \
-                                                                              \
-    threshold = p0_or_q0_org_in + q3_or_p3_org_in;                            \
-    threshold += p1_or_q1_org_in;                                             \
-                                                                              \
-    p0_or_q0_out = threshold << 1;                                            \
-    p0_or_q0_out += p2_or_q2_org_in;                                          \
-    p0_or_q0_out += q1_or_p1_org_in;                                          \
-    p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 );                          \
-                                                                              \
-    p1_or_q1_out = p2_or_q2_org_in + threshold;                               \
-    p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 );                          \
-                                                                              \
-    p2_or_q2_out = p2_or_q2_org_in * const3;                                  \
-    p2_or_q2_out += p3_or_q3_org_in;                                          \
-    p2_or_q2_out += p3_or_q3_org_in;                                          \
-    p2_or_q2_out += threshold;                                                \
-    p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 );                          \
-}
-
-/* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */
-#define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in,  \
-                          p1_or_q1_org_in, p0_or_q0_out )    \
-{                                                            \
-    p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in;        \
-    p0_or_q0_out += p1_or_q1_org_in;                         \
-    p0_or_q0_out += p1_or_q1_org_in;                         \
-    p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 );         \
-}
-
-#define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in,          \
-                          p1_or_q1_org_in, p2_or_q2_org_in,          \
-                          negate_tc_in, tc_in, p1_or_q1_out )        \
-{                                                                    \
-    v8i16 clip3, temp;                                               \
-                                                                     \
-    clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in,     \
-                                      ( v8u16 ) q0_or_p0_org_in );   \
-    temp = p1_or_q1_org_in << 1;                                     \
-    clip3 -= temp;                                                   \
-    clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 );                 \
-    clip3 = CLIP_SH( clip3, negate_tc_in, tc_in );                   \
-    p1_or_q1_out = p1_or_q1_org_in + clip3;                          \
-}
-
-#define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in,           \
-                      p1_or_q1_org_in, q1_or_p1_org_in,           \
-                      negate_threshold_in, threshold_in,          \
-                      p0_or_q0_out, q0_or_p0_out )                \
-{                                                                 \
-    v8i16 q0_sub_p0, p1_sub_q1, delta;                            \
-                                                                  \
-    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;                \
-    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;                \
-    q0_sub_p0 <<= 2;                                              \
-    p1_sub_q1 += 4;                                               \
-    delta = q0_sub_p0 + p1_sub_q1;                                \
-    delta >>= 3;                                                  \
-                                                                  \
-    delta = CLIP_SH( delta, negate_threshold_in, threshold_in );  \
-                                                                  \
-    p0_or_q0_out = p0_or_q0_org_in + delta;                       \
-    q0_or_p0_out = q0_or_p0_org_in - delta;                       \
-                                                                  \
-    CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out );                 \
-}
-
-static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data,
-                                                    uint8_t u_alpha_in,
-                                                    uint8_t u_beta_in,
-                                                    uint32_t u_img_width )
-{
-    v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
-    v16u8 alpha, beta;
-    v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
-    v16u8 p2, p1, p0, q0, q1, q2;
-    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
-    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-    v8i16 p2_r = { 0 };
-    v8i16 p1_r = { 0 };
-    v8i16 p0_r = { 0 };
-    v8i16 q0_r = { 0 };
-    v8i16 q1_r = { 0 };
-    v8i16 q2_r = { 0 };
-    v8i16 p2_l = { 0 };
-    v8i16 p1_l = { 0 };
-    v8i16 p0_l = { 0 };
-    v8i16 q0_l = { 0 };
-    v8i16 q1_l = { 0 };
-    v8i16 q2_l = { 0 };
-    v16u8 tmp_flag;
-    v16i8 zero = { 0 };
-
-    alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-    beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-    LD_UB4( p_data - ( u_img_width << 1 ), u_img_width,
-            p1_org, p0_org, q0_org, q1_org );
-
-    {
-        v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
-
-        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-        is_less_than_alpha = ( p0_asub_q0 < alpha );
-        is_less_than_beta = ( p1_asub_p0 < beta );
-        is_less_than = is_less_than_beta & is_less_than_alpha;
-        is_less_than_beta = ( q1_asub_q0 < beta );
-        is_less_than = is_less_than_beta & is_less_than;
-    }
-
-    if( !__msa_test_bz_v( is_less_than ) )
-    {
-        q2_org = LD_UB( p_data + ( 2 * u_img_width ) );
-        p3_org = LD_UB( p_data - ( u_img_width << 2 ) );
-        p2_org = LD_UB( p_data - ( 3 * u_img_width ) );
-
-        UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
-        UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
-        UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
-
-        tmp_flag = alpha >> 2;
-        tmp_flag = tmp_flag + 2;
-        tmp_flag = ( p0_asub_q0 < tmp_flag );
-
-        p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
-        is_less_than_beta = ( p2_asub_p0 < beta );
-        is_less_than_beta = is_less_than_beta & tmp_flag;
-        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
-        is_less_than_beta = is_less_than_beta & is_less_than;
-        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
-        {
-            v8u16 is_less_than_beta_l, is_less_than_beta_r;
-
-            q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
-
-            is_less_than_beta_r =
-                ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
-            {
-                v8i16 p3_org_r;
-
-                ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
-                                          q0_org_r, p1_org_r,
-                                          p2_r, q1_org_r, p0_r, p1_r, p2_r );
-            }
-
-            q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
-
-            is_less_than_beta_l =
-                ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
-
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
-            {
-                v8i16 p3_org_l;
-
-                ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
-                                          q0_org_l, p1_org_l,
-                                          p2_l, q1_org_l, p0_l, p1_l, p2_l );
-            }
-        }
-        /* combine and store */
-        if( !__msa_test_bz_v( is_less_than_beta ) )
-        {
-            PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
-
-            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
-            p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
-            p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
-
-            ST_UB( p1_org, p_data - ( 2 * u_img_width ) );
-            ST_UB( p2_org, p_data - ( 3 * u_img_width ) );
-        }
-        {
-            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
-
-            negate_is_less_than_beta_r =
-                ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
-                                        zero, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
-            {
-                AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
-            }
-
-            negate_is_less_than_beta_l =
-                ( v8u16 ) __msa_sldi_b( zero,
-                                        ( v16i8 ) negate_is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
-            {
-                AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
-            }
-        }
-        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
-        {
-            p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
-            p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
-        }
-
-        ST_UB( p0_org, p_data - u_img_width );
-
-        q3_org = LD_UB( p_data + ( 3 * u_img_width ) );
-        q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
-        is_less_than_beta = ( q2_asub_q0 < beta );
-        is_less_than_beta = is_less_than_beta & tmp_flag;
-        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
-        is_less_than_beta = is_less_than_beta & is_less_than;
-        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
-
-        {
-            v8u16 is_less_than_beta_l, is_less_than_beta_r;
-            is_less_than_beta_r =
-                ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
-            {
-                v8i16 q3_org_r;
-
-                ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
-                                          p0_org_r, q1_org_r,
-                                          q2_r, p1_org_r, q0_r, q1_r, q2_r );
-            }
-            is_less_than_beta_l =
-                ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
-            {
-                v8i16 q3_org_l;
-
-                ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
-                                          p0_org_l, q1_org_l,
-                                          q2_l, p1_org_l, q0_l, q1_l, q2_l );
-            }
-        }
-
-        if( !__msa_test_bz_v( is_less_than_beta ) )
-        {
-            PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
-            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
-            q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
-            q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
-
-            ST_UB( q1_org, p_data + u_img_width );
-            ST_UB( q2_org, p_data + 2 * u_img_width );
-        }
-        {
-            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
-            negate_is_less_than_beta_r =
-                ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
-                                        zero, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
-            {
-                AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
-            }
-
-            negate_is_less_than_beta_l =
-                ( v8u16 ) __msa_sldi_b( zero,
-                                        ( v16i8 ) negate_is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
-            {
-                AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
-            }
-        }
-        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
-        {
-            q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
-            q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
-        }
-
-        ST_UB( q0_org, p_data );
-    }
-}
-
-static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data,
-                                                    uint8_t u_alpha_in,
-                                                    uint8_t u_beta_in,
-                                                    uint32_t u_img_width )
-{
-    uint8_t *p_src;
-    v16u8 alpha, beta, p0_asub_q0;
-    v16u8 is_less_than_alpha, is_less_than;
-    v16u8 is_less_than_beta, negate_is_less_than_beta;
-    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
-    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-    v8i16 p2_r = { 0 };
-    v8i16 p1_r = { 0 };
-    v8i16 p0_r = { 0 };
-    v8i16 q0_r = { 0 };
-    v8i16 q1_r = { 0 };
-    v8i16 q2_r = { 0 };
-    v8i16 p2_l = { 0 };
-    v8i16 p1_l = { 0 };
-    v8i16 p0_l = { 0 };
-    v8i16 q0_l = { 0 };
-    v8i16 q1_l = { 0 };
-    v8i16 q2_l = { 0 };
-    v16i8 zero = { 0 };
-    v16u8 tmp_flag;
-
-    p_src = p_data - 4;
-
-    {
-        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
-
-        LD_UB8( p_src, u_img_width,
-                row0, row1, row2, row3, row4, row5, row6, row7 );
-        LD_UB8( p_src + ( 8 * u_img_width ), u_img_width,
-                row8, row9, row10, row11, row12, row13, row14, row15 );
-
-        TRANSPOSE16x8_UB_UB( row0, row1, row2, row3,
-                             row4, row5, row6, row7,
-                             row8, row9, row10, row11,
-                             row12, row13, row14, row15,
-                             p3_org, p2_org, p1_org, p0_org,
-                             q0_org, q1_org, q2_org, q3_org );
-    }
-
-    UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
-    UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
-    UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
-    UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
-
-    {
-        v16u8 p1_asub_p0, q1_asub_q0;
-
-        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-        beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-        is_less_than_alpha = ( p0_asub_q0 < alpha );
-        is_less_than_beta = ( p1_asub_p0 < beta );
-        is_less_than = is_less_than_beta & is_less_than_alpha;
-        is_less_than_beta = ( q1_asub_q0 < beta );
-        is_less_than = is_less_than_beta & is_less_than;
-    }
-
-    if( !__msa_test_bz_v( is_less_than ) )
-    {
-        tmp_flag = alpha >> 2;
-        tmp_flag = tmp_flag + 2;
-        tmp_flag = ( p0_asub_q0 < tmp_flag );
-
-        {
-            v16u8 p2_asub_p0;
-
-            p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
-            is_less_than_beta = ( p2_asub_p0 < beta );
-        }
-        is_less_than_beta = tmp_flag & is_less_than_beta;
-        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
-        is_less_than_beta = is_less_than_beta & is_less_than;
-        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
-
-        {
-            v16u8 is_less_than_beta_r;
-
-            is_less_than_beta_r =
-                ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
-            if( !__msa_test_bz_v( is_less_than_beta_r ) )
-            {
-                v8i16 p3_org_r;
-
-                ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
-                                          q0_org_r, p1_org_r,
-                                          p2_r, q1_org_r, p0_r, p1_r, p2_r );
-            }
-        }
-
-        {
-            v16u8 is_less_than_beta_l;
-
-            is_less_than_beta_l =
-                ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( is_less_than_beta_l ) )
-            {
-                v8i16 p3_org_l;
-
-                ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
-                                          q0_org_l, p1_org_l,
-                                          p2_l, q1_org_l, p0_l, p1_l, p2_l );
-            }
-        }
-        if( !__msa_test_bz_v( is_less_than_beta ) )
-        {
-            v16u8 p0, p2, p1;
-
-            PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
-            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
-            p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
-            p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
-        }
-        {
-            v16u8 negate_is_less_than_beta_r;
-
-            negate_is_less_than_beta_r =
-                ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
-                                        zero, 8 );
-
-            if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
-            {
-                AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
-            }
-        }
-        {
-            v16u8 negate_is_less_than_beta_l;
-
-            negate_is_less_than_beta_l =
-                ( v16u8 ) __msa_sldi_b( zero,
-                                        ( v16i8 ) negate_is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
-            {
-                AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
-            }
-        }
-
-        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
-        {
-            v16u8 p0;
-
-            p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
-            p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
-        }
-
-        {
-            v16u8 q2_asub_q0;
-
-            q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
-            is_less_than_beta = ( q2_asub_q0 < beta );
-        }
-
-        is_less_than_beta = is_less_than_beta & tmp_flag;
-        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
-
-        is_less_than_beta = is_less_than_beta & is_less_than;
-        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
-
-        {
-            v16u8 is_less_than_beta_r;
-
-            is_less_than_beta_r =
-                ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
-            if( !__msa_test_bz_v( is_less_than_beta_r ) )
-            {
-                v8i16 q3_org_r;
-
-                ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
-                                          p0_org_r, q1_org_r,
-                                          q2_r, p1_org_r, q0_r, q1_r, q2_r );
-            }
-        }
-        {
-            v16u8 is_less_than_beta_l;
-
-            is_less_than_beta_l =
-                ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( is_less_than_beta_l ) )
-            {
-                v8i16 q3_org_l;
-
-                ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
-                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
-                                          p0_org_l, q1_org_l,
-                                          q2_l, p1_org_l, q0_l, q1_l, q2_l );
-            }
-        }
-        if( !__msa_test_bz_v( is_less_than_beta ) )
-        {
-            v16u8 q0, q1, q2;
-
-            PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
-            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
-            q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
-            q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
-        }
-
-        {
-            v16u8 negate_is_less_than_beta_r;
-
-            negate_is_less_than_beta_r =
-                ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
-                                        zero, 8 );
-            if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
-            {
-                AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
-            }
-        }
-        {
-            v16u8 negate_is_less_than_beta_l;
-
-            negate_is_less_than_beta_l =
-                ( v16u8 ) __msa_sldi_b( zero,
-                                        ( v16i8 ) negate_is_less_than_beta, 8 );
-            if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
-            {
-                AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
-            }
-        }
-        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
-        {
-            v16u8 q0;
-
-            q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
-            q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
-        }
-    }
-    {
-        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-        ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 );
-        ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 );
-        ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );
-
-        ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 );
-        ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 );
-
-        p_src = p_data - 3;
-        ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width );
-        ST2x4_UB( tmp2, 0, p_src + 4, u_img_width );
-        p_src += 4 * u_img_width;
-        ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width );
-        ST2x4_UB( tmp2, 4, p_src + 4, u_img_width );
-        p_src += 4 * u_img_width;
-
-        ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width );
-        ST2x4_UB( tmp5, 0, p_src + 4, u_img_width );
-        p_src += 4 * u_img_width;
-        ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width );
-        ST2x4_UB( tmp5, 4, p_src + 4, u_img_width );
-    }
-}
-
-static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma,
-                                                         uint8_t u_alpha_in,
-                                                         uint8_t u_beta_in,
-                                                         uint32_t u_img_width )
-{
-    v16u8 alpha, beta, is_less_than;
-    v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
-    v8i16 p0_r = { 0 };
-    v8i16 q0_r = { 0 };
-    v8i16 p0_l = { 0 };
-    v8i16 q0_l = { 0 };
-
-    alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-    beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-    LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
-            p1_org, p0_org, q0_org, q1_org );
-
-    {
-        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
-        v16u8 is_less_than_alpha, is_less_than_beta;
-
-        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-        is_less_than_alpha = ( p0_asub_q0 < alpha );
-        is_less_than_beta = ( p1_asub_p0 < beta );
-        is_less_than = is_less_than_beta & is_less_than_alpha;
-        is_less_than_beta = ( q1_asub_q0 < beta );
-        is_less_than = is_less_than_beta & is_less_than;
-    }
-
-    if( !__msa_test_bz_v( is_less_than ) )
-    {
-        v16i8 zero = { 0 };
-        v16u8 is_less_than_r, is_less_than_l;
-
-        is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
-                                                 zero, 8 );
-        if( !__msa_test_bz_v( is_less_than_r ) )
-        {
-            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-
-            ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
-                        zero, q1_org, p1_org_r, p0_org_r, q0_org_r,
-                        q1_org_r );
-            AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
-            AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
-        }
-
-        is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
-                                                 ( v16i8 ) is_less_than, 8 );
-        if( !__msa_test_bz_v( is_less_than_l ) )
-        {
-            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-
-            ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
-                        zero, q1_org, p1_org_l, p0_org_l, q0_org_l,
-                        q1_org_l );
-            AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
-            AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
-        }
-
-        PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
-
-        p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
-        q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
-
-        ST_UB( p0_org, ( p_chroma - u_img_width ) );
-        ST_UB( q0_org, p_chroma );
-    }
-}
-
-static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma,
-                                                         uint8_t u_alpha_in,
-                                                         uint8_t u_beta_in,
-                                                         uint32_t u_img_width )
-{
-    v16u8 is_less_than;
-    v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
-    v8i16 p0_r = { 0 };
-    v8i16 q0_r = { 0 };
-    v8i16 p0_l = { 0 };
-    v8i16 q0_l = { 0 };
-    v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
-    v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
-    v16i8 tmp0, tmp1, tmp2, tmp3;
-    v4i32 vec0, vec1;
-    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-
-    LD_UB8( ( p_chroma - 4 ), u_img_width,
-            row0, row1, row2, row3, row4, row5, row6, row7 );
-
-    TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
-                        p1_u_org, p1_v_org, p0_u_org, p0_v_org,
-                        q0_u_org, q0_v_org, q1_u_org, q1_v_org );
-
-    ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
-                q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );
-
-    {
-        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
-        v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta;
-
-        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-        beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-        is_less_than_alpha = ( p0_asub_q0 < alpha );
-        is_less_than_beta = ( p1_asub_p0 < beta );
-        is_less_than = is_less_than_beta & is_less_than_alpha;
-        is_less_than_beta = ( q1_asub_q0 < beta );
-        is_less_than = is_less_than_beta & is_less_than;
-    }
-
-    if( !__msa_test_bz_v( is_less_than ) )
-    {
-        v16u8 is_less_than_r, is_less_than_l;
-        v16i8 zero = { 0 };
-
-        is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
-                                                 zero, 8 );
-        if( !__msa_test_bz_v( is_less_than_r ) )
-        {
-            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-
-            ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
-                        zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r );
-            AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
-            AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
-        }
-
-        is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
-                                                 ( v16i8 ) is_less_than, 8 );
-        if( !__msa_test_bz_v( is_less_than_l ) )
-        {
-            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-
-            ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
-                        zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l );
-            AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
-            AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
-        }
-
-        PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
-
-        p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
-        q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
-
-        SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
-        ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
-        ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
-        ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );
-
-        ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
-    }
-}
-
-static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data,
-                                                    uint8_t u_bs0,
-                                                    uint8_t u_bs1,
-                                                    uint8_t u_bs2,
-                                                    uint8_t u_bs3,
-                                                    uint8_t u_tc0,
-                                                    uint8_t u_tc1,
-                                                    uint8_t u_tc2,
-                                                    uint8_t u_tc3,
-                                                    uint8_t u_alpha_in,
-                                                    uint8_t u_beta_in,
-                                                    uint32_t u_img_width )
-{
-    uint8_t *p_src;
-    v16u8 beta, tmp_vec, bs = { 0 };
-    v16u8 tc = { 0 };
-    v16u8 is_less_than, is_less_than_beta;
-    v16u8 p1, p0, q0, q1;
-    v8i16 p0_r, q0_r, p1_r = { 0 };
-    v8i16 q1_r = { 0 };
-    v8i16 p0_l, q0_l, p1_l = { 0 };
-    v8i16 q1_l = { 0 };
-    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
-    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
-    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
-    v8i16 tc_r, tc_l;
-    v16i8 zero = { 0 };
-    v16u8 is_bs_greater_than0;
-
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );
-
-    if( !__msa_test_bz_v( bs ) )
-    {
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
-        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
-        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
-        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
-        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );
-
-        is_bs_greater_than0 = ( zero < bs );
-
-        {
-            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
-
-            p_src = p_data;
-            p_src -= 4;
-
-            LD_UB8( p_src, u_img_width,
-                    row0, row1, row2, row3, row4, row5, row6, row7 );
-            p_src += ( 8 * u_img_width );
-            LD_UB8( p_src, u_img_width,
-                    row8, row9, row10, row11, row12, row13, row14, row15 );
-
-            TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
-                                 row8, row9, row10, row11,
-                                 row12, row13, row14, row15,
-                                 p3_org, p2_org, p1_org, p0_org,
-                                 q0_org, q1_org, q2_org, q3_org );
-        }
-        {
-            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
-            v16u8 is_less_than_alpha;
-
-            p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-            p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-            q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-            alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-            beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-            is_less_than_alpha = ( p0_asub_q0 < alpha );
-            is_less_than_beta = ( p1_asub_p0 < beta );
-            is_less_than = is_less_than_beta & is_less_than_alpha;
-            is_less_than_beta = ( q1_asub_q0 < beta );
-            is_less_than = is_less_than_beta & is_less_than;
-            is_less_than = is_less_than & is_bs_greater_than0;
-        }
-        if( !__msa_test_bz_v( is_less_than ) )
-        {
-            v16i8 negate_tc, sign_negate_tc;
-            v8i16 negate_tc_r, i16_negatetc_l;
-
-            negate_tc = zero - ( v16i8 ) tc;
-            sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
-
-            ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
-                         i16_negatetc_l );
-
-            UNPCK_UB_SH( tc, tc_r, tc_l );
-            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
-            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
-            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
-
-            {
-                v16u8 p2_asub_p0;
-                v16u8 is_less_than_beta_r, is_less_than_beta_l;
-
-                p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
-                is_less_than_beta = ( p2_asub_p0 < beta );
-                is_less_than_beta = is_less_than_beta & is_less_than;
-
-                is_less_than_beta_r =
-                    ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
-                                            zero, 8 );
-                if( !__msa_test_bz_v( is_less_than_beta_r ) )
-                {
-                    p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );
-
-                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
-                                      negate_tc_r, tc_r, p1_r );
-                }
-
-                is_less_than_beta_l =
-                    ( v16u8 ) __msa_sldi_b( zero,
-                                            ( v16i8 ) is_less_than_beta, 8 );
-                if( !__msa_test_bz_v( is_less_than_beta_l ) )
-                {
-                    p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );
-
-                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
-                                      i16_negatetc_l, tc_l, p1_l );
-                }
-            }
-
-            if( !__msa_test_bz_v( is_less_than_beta ) )
-            {
-                p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
-                p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
-
-                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
-                tc = tc + is_less_than_beta;
-            }
-
-            {
-                v16u8 u8_q2asub_q0;
-                v16u8 is_less_than_beta_l, is_less_than_beta_r;
-
-                u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
-                is_less_than_beta = ( u8_q2asub_q0 < beta );
-                is_less_than_beta = is_less_than_beta & is_less_than;
-
-                q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
-
-                is_less_than_beta_r =
-                    ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
-                                            zero, 8 );
-                if( !__msa_test_bz_v( is_less_than_beta_r ) )
-                {
-                    q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
-                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
-                                      negate_tc_r, tc_r, q1_r );
-                }
-
-                q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
-
-                is_less_than_beta_l =
-                    ( v16u8 ) __msa_sldi_b( zero,
-                                            ( v16i8 ) is_less_than_beta, 8 );
-                if( !__msa_test_bz_v( is_less_than_beta_l ) )
-                {
-                    q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
-                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
-                                      i16_negatetc_l, tc_l, q1_l );
-                }
-            }
-
-            if( !__msa_test_bz_v( is_less_than_beta ) )
-            {
-                q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
-                q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
-
-                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
-                tc = tc + is_less_than_beta;
-            }
-
-            {
-                v8i16 threshold_r, negate_thresh_r;
-                v8i16 threshold_l, negate_thresh_l;
-                v16i8 negate_thresh, sign_negate_thresh;
-
-                negate_thresh = zero - ( v16i8 ) tc;
-                sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );
-
-                ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
-                            threshold_r, negate_thresh_r );
-
-                AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
-                              negate_thresh_r, threshold_r, p0_r, q0_r );
-
-                threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc );
-                negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
-                                                          negate_thresh );
-
-                AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
-                              negate_thresh_l, threshold_l, p0_l, q0_l );
-            }
-
-            PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
-
-            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
-            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
-        }
-        {
-            v16i8 tp0, tp1, tp2, tp3;
-            v8i16 tmp2, tmp5;
-            v4i32 tmp3, tmp4, tmp6, tmp7;
-            uint32_t u_out0, u_out2;
-            uint16_t u_out1, u_out3;
-
-            p_src = p_data - 3;
-
-            ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 );
-            ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 );
-            ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );
-
-            ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 );
-            ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 );
-
-            u_out0 = __msa_copy_u_w( tmp3, 0 );
-            u_out1 = __msa_copy_u_h( tmp2, 0 );
-            u_out2 = __msa_copy_u_w( tmp3, 1 );
-            u_out3 = __msa_copy_u_h( tmp2, 1 );
-
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp3, 2 );
-            u_out1 = __msa_copy_u_h( tmp2, 2 );
-            u_out2 = __msa_copy_u_w( tmp3, 3 );
-            u_out3 = __msa_copy_u_h( tmp2, 3 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp4, 0 );
-            u_out1 = __msa_copy_u_h( tmp2, 4 );
-            u_out2 = __msa_copy_u_w( tmp4, 1 );
-            u_out3 = __msa_copy_u_h( tmp2, 5 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp4, 2 );
-            u_out1 = __msa_copy_u_h( tmp2, 6 );
-            u_out2 = __msa_copy_u_w( tmp4, 3 );
-            u_out3 = __msa_copy_u_h( tmp2, 7 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp6, 0 );
-            u_out1 = __msa_copy_u_h( tmp5, 0 );
-            u_out2 = __msa_copy_u_w( tmp6, 1 );
-            u_out3 = __msa_copy_u_h( tmp5, 1 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp6, 2 );
-            u_out1 = __msa_copy_u_h( tmp5, 2 );
-            u_out2 = __msa_copy_u_w( tmp6, 3 );
-            u_out3 = __msa_copy_u_h( tmp5, 3 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp7, 0 );
-            u_out1 = __msa_copy_u_h( tmp5, 4 );
-            u_out2 = __msa_copy_u_w( tmp7, 1 );
-            u_out3 = __msa_copy_u_h( tmp5, 5 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-
-            u_out0 = __msa_copy_u_w( tmp7, 2 );
-            u_out1 = __msa_copy_u_h( tmp5, 6 );
-            u_out2 = __msa_copy_u_w( tmp7, 3 );
-            u_out3 = __msa_copy_u_h( tmp5, 7 );
-
-            p_src += u_img_width;
-            SW( u_out0, p_src );
-            SH( u_out1, ( p_src + 4 ) );
-            p_src += u_img_width;
-            SW( u_out2, p_src );
-            SH( u_out3, ( p_src + 4 ) );
-        }
-    }
-}
-
-static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data,
-                                                    uint8_t u_bs0,
-                                                    uint8_t u_bs1,
-                                                    uint8_t u_bs2,
-                                                    uint8_t u_bs3,
-                                                    uint8_t u_tc0,
-                                                    uint8_t u_tc1,
-                                                    uint8_t u_tc2,
-                                                    uint8_t u_tc3,
-                                                    uint8_t u_alpha_in,
-                                                    uint8_t u_beta_in,
-                                                    uint32_t u_image_width )
-{
-    v16u8 p2_asub_p0, u8_q2asub_q0;
-    v16u8 alpha, beta, is_less_than, is_less_than_beta;
-    v16u8 p1, p0, q0, q1;
-    v8i16 p1_r = { 0 };
-    v8i16 p0_r, q0_r, q1_r = { 0 };
-    v8i16 p1_l = { 0 };
-    v8i16 p0_l, q0_l, q1_l = { 0 };
-    v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
-    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
-    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
-    v16i8 zero = { 0 };
-    v16u8 tmp_vec;
-    v16u8 bs = { 0 };
-    v16i8 tc = { 0 };
-
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
-    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
-    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );
-
-    if( !__msa_test_bz_v( bs ) )
-    {
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
-        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
-        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
-        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
-        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
-        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );
-
-        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-        beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-        LD_UB5( p_data - ( 3 * u_image_width ), u_image_width,
-                p2_org, p1_org, p0_org, q0_org, q1_org );
-
-        {
-            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
-            v16u8 is_less_than_alpha, is_bs_greater_than0;
-
-            is_bs_greater_than0 = ( ( v16u8 ) zero < bs );
-            p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-            p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-            q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-            is_less_than_alpha = ( p0_asub_q0 < alpha );
-            is_less_than_beta = ( p1_asub_p0 < beta );
-            is_less_than = is_less_than_beta & is_less_than_alpha;
-            is_less_than_beta = ( q1_asub_q0 < beta );
-            is_less_than = is_less_than_beta & is_less_than;
-            is_less_than = is_less_than & is_bs_greater_than0;
-        }
-
-        if( !__msa_test_bz_v( is_less_than ) )
-        {
-            v16i8 sign_negate_tc, negate_tc;
-            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
-
-            q2_org = LD_UB( p_data + ( 2 * u_image_width ) );
-            negate_tc = zero - tc;
-            sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
-
-            ILVRL_B2_SH( sign_negate_tc, negate_tc,
-                         negate_tc_r, i16_negatetc_l );
-
-            UNPCK_UB_SH( tc, tc_r, tc_l );
-            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
-            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
-            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
-
-            p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
-            is_less_than_beta = ( p2_asub_p0 < beta );
-            is_less_than_beta = is_less_than_beta & is_less_than;
-            {
-                v8u16 is_less_than_beta_r, is_less_than_beta_l;
-
-                is_less_than_beta_r =
-                    ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
-                                            zero, 8 );
-                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
-                {
-                    p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );
-
-                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
-                                      negate_tc_r, tc_r, p1_r );
-                }
-
-                is_less_than_beta_l =
-                    ( v8u16 ) __msa_sldi_b( zero,
-                                            ( v16i8 ) is_less_than_beta, 8 );
-                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
-                {
-                    p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );
-
-                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
-                                      i16_negatetc_l, tc_l, p1_l );
-                }
-            }
-            if( !__msa_test_bz_v( is_less_than_beta ) )
-            {
-                p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
-                p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
-                ST_UB( p1_org, p_data - ( 2 * u_image_width ) );
-
-                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
-                tc = tc + ( v16i8 ) is_less_than_beta;
-            }
-
-            u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
-            is_less_than_beta = ( u8_q2asub_q0 < beta );
-            is_less_than_beta = is_less_than_beta & is_less_than;
-
-            {
-                v8u16 is_less_than_beta_r, is_less_than_beta_l;
-                is_less_than_beta_r =
-                    ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
-                                            zero, 8 );
-
-                q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
-                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
-                {
-                    q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
-
-                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
-                                      negate_tc_r, tc_r, q1_r );
-                }
-                is_less_than_beta_l =
-                    ( v8u16 ) __msa_sldi_b( zero,
-                                            ( v16i8 ) is_less_than_beta, 8 );
-
-                q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
-                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
-                {
-                    q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
-
-                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
-                                      i16_negatetc_l, tc_l, q1_l );
-                }
-            }
-            if( !__msa_test_bz_v( is_less_than_beta ) )
-            {
-                q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
-                q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
-                ST_UB( q1_org, p_data + u_image_width );
-
-                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
-                tc = tc + ( v16i8 ) is_less_than_beta;
-            }
-            {
-                v16i8 negate_thresh, sign_negate_thresh;
-                v8i16 threshold_r, threshold_l;
-                v8i16 negate_thresh_l, negate_thresh_r;
-
-                negate_thresh = zero - tc;
-                sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );
-
-                ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
-                            threshold_r, negate_thresh_r );
-                AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
-                              negate_thresh_r, threshold_r, p0_r, q0_r );
-
-                threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc );
-                negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
-                                                          negate_thresh );
-                AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
-                              negate_thresh_l, threshold_l, p0_l, q0_l );
-            }
-
-            PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
-
-            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
-            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
-
-            ST_UB( p0_org, ( p_data - u_image_width ) );
-            ST_UB( q0_org, p_data );
-        }
-    }
-}
-
-static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma,
-                                                         uint8_t u_bs0,
-                                                         uint8_t u_bs1,
-                                                         uint8_t u_bs2,
-                                                         uint8_t u_bs3,
-                                                         uint8_t u_tc0,
-                                                         uint8_t u_tc1,
-                                                         uint8_t u_tc2,
-                                                         uint8_t u_tc3,
-                                                         uint8_t u_alpha_in,
-                                                         uint8_t u_beta_in,
-                                                         uint32_t u_img_width )
-{
-    v16u8 alpha, beta;
-    v4i32 tmp_vec, bs = { 0 };
-    v4i32 tc = { 0 };
-    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
-    v16u8 is_less_than;
-    v8i16 is_less_than_r, is_less_than_l;
-    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
-    v16u8 p0, q0;
-    v8i16 p0_r = { 0 };
-    v8i16 q0_r = { 0 };
-    v8i16 p0_l = { 0 };
-    v8i16 q0_l = { 0 };
-    v16u8 p1_org, p0_org, q0_org, q1_org;
-    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-    v16i8 negate_tc, sign_negate_tc;
-    v8i16 negate_tc_r, i16_negatetc_l;
-    v8i16 tc_r, tc_l;
-    v16i8 zero = { 0 };
-    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-
-    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 );
-    bs = __msa_insve_w( bs, 0, tmp_vec );
-    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 );
-    bs = __msa_insve_w( bs, 1, tmp_vec );
-    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 );
-    bs = __msa_insve_w( bs, 2, tmp_vec );
-    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 );
-    bs = __msa_insve_w( bs, 3, tmp_vec );
-
-    if( !__msa_test_bz_v( ( v16u8 ) bs ) )
-    {
-        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 );
-        tc = __msa_insve_w( tc, 0, tmp_vec );
-        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 );
-        tc = __msa_insve_w( tc, 1, tmp_vec );
-        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 );
-        tc = __msa_insve_w( tc, 2, tmp_vec );
-        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 );
-        tc = __msa_insve_w( tc, 3, tmp_vec );
-
-        is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );
-
-        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-        beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-        LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
-                p1_org, p0_org, q0_org, q1_org );
-
-        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-        is_less_than_alpha = ( p0_asub_q0 < alpha );
-        is_less_than_beta = ( p1_asub_p0 < beta );
-        is_less_than = is_less_than_beta & is_less_than_alpha;
-        is_less_than_beta = ( q1_asub_q0 < beta );
-        is_less_than = is_less_than_beta & is_less_than;
-
-        is_less_than = is_less_than & is_bs_greater_than0;
-
-        if( !__msa_test_bz_v( is_less_than ) )
-        {
-            negate_tc = zero - ( v16i8 ) tc;
-            sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
-
-            ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
-                         i16_negatetc_l );
-
-            UNPCK_UB_SH( tc, tc_r, tc_l );
-            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
-            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
-            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
-            UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
-
-            is_less_than_r =
-                ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
-            {
-                AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
-                              negate_tc_r, tc_r, p0_r, q0_r );
-            }
-
-            is_less_than_l =
-                ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 );
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
-            {
-                AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
-                              i16_negatetc_l, tc_l, p0_l, q0_l );
-            }
-
-            PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
-
-            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
-            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
-
-            ST_UB( p0_org, p_chroma - u_img_width );
-            ST_UB( q0_org, p_chroma );
-        }
-    }
-}
-
-static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma,
-                                                         uint8_t u_bs0,
-                                                         uint8_t u_bs1,
-                                                         uint8_t u_bs2,
-                                                         uint8_t u_bs3,
-                                                         uint8_t u_tc0,
-                                                         uint8_t u_tc1,
-                                                         uint8_t u_tc2,
-                                                         uint8_t u_tc3,
-                                                         uint8_t u_alpha_in,
-                                                         uint8_t u_beta_in,
-                                                         uint32_t u_img_width )
-{
-    v16u8 alpha, beta;
-    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
-    v16u8 is_less_than, is_less_than1;
-    v8i16 is_less_than_r, is_less_than_l;
-    v16u8 is_less_than_beta, is_less_than_alpha;
-    v8i16 p0_r = { 0 };
-    v8i16 q0_r = { 0 };
-    v8i16 p0_l = { 0 };
-    v8i16 q0_l = { 0 };
-    v16u8 p1_org, p0_org, q0_org, q1_org;
-    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
-    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
-    v16u8 is_bs_less_than4, is_bs_greater_than0;
-    v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
-    v16u8 const4;
-    v16i8 zero = { 0 };
-    v8i16 tmp_vec, bs = { 0 };
-    v8i16 tc = { 0 };
-    v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
-    v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
-    v16i8 tmp0, tmp1, tmp2, tmp3;
-    v4i32 vec0, vec1;
-    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-    v16i8 negate_tc, sign_negate_tc;
-
-    const4 = ( v16u8 ) __msa_ldi_b( 4 );
-
-    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 );
-    bs = __msa_insve_h( bs, 0, tmp_vec );
-    bs = __msa_insve_h( bs, 4, tmp_vec );
-
-    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 );
-    bs = __msa_insve_h( bs, 1, tmp_vec );
-    bs = __msa_insve_h( bs, 5, tmp_vec );
-
-    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 );
-    bs = __msa_insve_h( bs, 2, tmp_vec );
-    bs = __msa_insve_h( bs, 6, tmp_vec );
-
-    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 );
-    bs = __msa_insve_h( bs, 3, tmp_vec );
-    bs = __msa_insve_h( bs, 7, tmp_vec );
-
-    if( !__msa_test_bz_v( ( v16u8 ) bs ) )
-    {
-        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 );
-        tc = __msa_insve_h( tc, 0, tmp_vec );
-        tc = __msa_insve_h( tc, 4, tmp_vec );
-
-        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 );
-        tc = __msa_insve_h( tc, 1, tmp_vec );
-        tc = __msa_insve_h( tc, 5, tmp_vec );
-
-        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 );
-        tc = __msa_insve_h( tc, 2, tmp_vec );
-        tc = __msa_insve_h( tc, 6, tmp_vec );
-
-        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 );
-        tc = __msa_insve_h( tc, 3, tmp_vec );
-        tc = __msa_insve_h( tc, 7, tmp_vec );
-
-        is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );
-
-        LD_UB8( ( p_chroma - 4 ), u_img_width,
-                row0, row1, row2, row3, row4, row5, row6, row7 );
-
-        TRANSPOSE8x8_UB_UB( row0, row1, row2, row3,
-                            row4, row5, row6, row7,
-                            p1_u_org, p1_v_org, p0_u_org, p0_v_org,
-                            q0_u_org, q0_v_org, q1_u_org, q1_v_org );
-
-        ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
-                    q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );
-
-        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
-        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
-        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
-
-        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
-        beta = ( v16u8 ) __msa_fill_b( u_beta_in );
-
-        is_less_than_alpha = ( p0_asub_q0 < alpha );
-        is_less_than_beta = ( p1_asub_p0 < beta );
-        is_less_than = is_less_than_beta & is_less_than_alpha;
-        is_less_than_beta = ( q1_asub_q0 < beta );
-        is_less_than = is_less_than_beta & is_less_than;
-        is_less_than = is_bs_greater_than0 & is_less_than;
-
-        if( !__msa_test_bz_v( is_less_than ) )
-        {
-            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
-            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
-            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
-            UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
-
-            is_bs_less_than4 = ( ( v16u8 ) bs < const4 );
-
-            is_less_than1 = is_less_than & is_bs_less_than4;
-            if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) )
-            {
-                negate_tc = zero - ( v16i8 ) tc;
-                sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
-
-                ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
-                             i16_negatetc_l );
-
-                UNPCK_UB_SH( tc, tc_r, tc_l );
-
-                is_less_than_r =
-                    ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 );
-                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
-                {
-                    AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
-                                  negate_tc_r, tc_r, p0_r, q0_r );
-                }
-
-                is_less_than_l =
-                    ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 );
-                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
-                {
-                    AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
-                                  i16_negatetc_l, tc_l, p0_l, q0_l );
-                }
-
-                PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
-
-                p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 );
-                q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 );
-            }
-
-            SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
-            ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
-            ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
-            ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );
-            ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
-        }
-    }
-}
-
-static void avc_deblock_strength_msa( uint8_t *nnz,
-                                      int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
-                                      int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
-                                      uint8_t pu_bs[2][8][4],
-                                      int32_t i_mvy_limit )
-{
-    uint32_t u_tmp;
-    v16u8 nnz0, nnz1, nnz2, nnz3, nnz4;
-    v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 };
-    v16i8 ref0, ref1, ref2, ref3, ref4;
-    v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5;
-    v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b;
-    v8u16 four, mvy_limit_vec, sub0, sub1;
-
-    nnz0 = LD_UB( nnz + 4 );
-    nnz2 = LD_UB( nnz + 20 );
-    nnz4 = LD_UB( nnz + 36 );
-
-    ref0 = LD_SB( pi_ref[0] + 4 );
-    ref2 = LD_SB( pi_ref[0] + 20 );
-    ref4 = LD_SB( pi_ref[0] + 36 );
-
-    mv0 = LD_SH( ( pi_mv[0] + 4 )[0] );
-    mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
-    mv2 = LD_SH( ( pi_mv[0] + 20 )[0] );
-    mv3 = LD_SH( ( pi_mv[0] + 28 )[0] );
-    mv4 = LD_SH( ( pi_mv[0] + 36 )[0] );
-
-    mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
-    four = ( v8u16 ) __msa_fill_h( 4 );
-    mask = ( v16u8 ) __msa_ldi_b( 0 );
-    one = ( v16u8 ) __msa_ldi_b( 1 );
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-
-    mv5 = __msa_pckod_h( mv0, mv0 );
-    mv6 = __msa_pckod_h( mv1, mv1 );
-    mv_a = __msa_pckev_h( mv0, mv0 );
-    mv_b = __msa_pckev_h( mv1, mv1 );
-    nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 );
-    ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 );
-    nnz_mask = nnz0 | nnz1;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[1][0] );
-
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-
-    mv5 = __msa_pckod_h( mv1, mv1 );
-    mv6 = __msa_pckod_h( mv2, mv2 );
-    mv_a = __msa_pckev_h( mv1, mv1 );
-    mv_b = __msa_pckev_h( mv2, mv2 );
-
-    nnz_mask = nnz2 | nnz1;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[1][1] );
-
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-
-    mv5 = __msa_pckod_h( mv2, mv2 );
-    mv6 = __msa_pckod_h( mv3, mv3 );
-    mv_a = __msa_pckev_h( mv2, mv2 );
-    mv_b = __msa_pckev_h( mv3, mv3 );
-
-    nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 );
-    ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 );
-
-    nnz_mask = nnz3 | nnz2;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[1][2] );
-
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-
-    mv5 = __msa_pckod_h( mv3, mv3 );
-    mv6 = __msa_pckod_h( mv4, mv4 );
-    mv_a = __msa_pckev_h( mv3, mv3 );
-    mv_b = __msa_pckev_h( mv4, mv4 );
-
-    nnz_mask = nnz4 | nnz3;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[1][3] );
-
-    nnz0 = LD_UB( nnz + 8 );
-    nnz2 = LD_UB( nnz + 24 );
-
-    ref0 = LD_SB( pi_ref[0] + 8 );
-    ref2 = LD_SB( pi_ref[0] + 24 );
-
-    mv0 = LD_SH( ( pi_mv[0] + 8 )[0] );
-    mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
-    mv2 = LD_SH( ( pi_mv[0] + 16 )[0] );
-    mv3 = LD_SH( ( pi_mv[0] + 20 )[0] );
-    mv4 = LD_SH( ( pi_mv[0] + 24 )[0] );
-    mv7 = LD_SH( ( pi_mv[0] + 28 )[0] );
-    mv8 = LD_SH( ( pi_mv[0] + 32 )[0] );
-    mv9 = LD_SH( ( pi_mv[0] + 36 )[0] );
-
-    nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 );
-    nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 );
-
-    ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 );
-
-    ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 );
-
-    nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
-    nnz1 = ( v16u8 ) temp_vec4;
-    nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 );
-    nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 );
-    nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 );
-
-    ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 );
-    ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 );
-
-    ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 );
-
-    ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 );
-
-    ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
-
-    ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 );
-    ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 );
-    ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 );
-
-    TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 );
-    TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 );
-
-    mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
-    four = ( v8u16 ) __msa_fill_h( 4 );
-    mask = ( v16u8 ) __msa_ldi_b( 0 );
-    one = ( v16u8 ) __msa_ldi_b( 1 );
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-
-    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 );
-    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
-    mv_a = mv0;
-    mv_b = mv1;
-
-    nnz_mask = nnz0 | nnz1;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[0][0] );
-
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-
-    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
-    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
-    mv_a = mv1;
-    mv_b = mv2;
-
-    nnz_mask = nnz1 | nnz2;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[0][1] );
-
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-
-    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
-    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
-    mv_a = mv2;
-    mv_b = mv3;
-
-    nnz_mask = nnz2 | nnz3;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[0][2] );
-
-    two = ( v16u8 ) __msa_ldi_b( 2 );
-    dst = ( v16u8 ) __msa_ldi_b( 0 );
-
-    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
-    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 );
-    mv_a = mv3;
-    mv_b = mv4;
-
-    nnz_mask = nnz3 | nnz4;
-    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
-    two = __msa_bmnz_v( two, mask, nnz_mask );
-
-    ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
-    ref_mask = ref_mask ^ 255;
-
-    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
-    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
-    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
-    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
-
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
-    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
-
-    dst = __msa_bmnz_v( dst, one, ref_mask );
-    dst = __msa_bmnz_v( two, dst, nnz_mask );
-
-    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
-    SW( u_tmp, pu_bs[0][3] );
-}
-
-void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
-                                    int32_t i_alpha, int32_t i_beta )
-{
-    avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
-                                            ( uint8_t ) i_beta, i_stride );
-}
-
-void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
-                                    int32_t i_alpha, int32_t i_beta )
-{
-    avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
-                                            ( uint8_t ) i_beta, i_stride );
-}
-
-void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
-                                      int32_t i_alpha, int32_t i_beta )
-{
-    avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
-                                                 ( uint8_t ) i_beta, i_stride );
-}
-
-void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
-                                      int32_t i_alpha, int32_t i_beta )
-{
-    avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
-                                                 ( uint8_t ) i_beta, i_stride );
-}
-
-void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride,
-                              int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
-{
-    uint8_t u_bs0 = 1;
-    uint8_t u_bs1 = 1;
-    uint8_t u_bs2 = 1;
-    uint8_t u_bs3 = 1;
-
-    if( p_tc0[0] < 0 ) u_bs0 = 0;
-    if( p_tc0[1] < 0 ) u_bs1 = 0;
-    if( p_tc0[2] < 0 ) u_bs2 = 0;
-    if( p_tc0[3] < 0 ) u_bs3 = 0;
-
-    avc_loopfilter_luma_inter_edge_ver_msa( p_pix,
-                                            u_bs0, u_bs1, u_bs2, u_bs3,
-                                            p_tc0[0], p_tc0[1], p_tc0[2],
-                                            p_tc0[3], i_alpha, i_beta,
-                                            i_stride );
-}
-
-void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride,
-                              int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
-{
-    uint8_t u_bs0 = 1;
-    uint8_t u_bs1 = 1;
-    uint8_t u_bs2 = 1;
-    uint8_t u_bs3 = 1;
-
-    if( p_tc0[0] < 0 ) u_bs0 = 0;
-    if( p_tc0[1] < 0 ) u_bs1 = 0;
-    if( p_tc0[2] < 0 ) u_bs2 = 0;
-    if( p_tc0[3] < 0 ) u_bs3 = 0;
-
-    avc_loopfilter_luma_inter_edge_hor_msa( p_pix,
-                                            u_bs0, u_bs1, u_bs2, u_bs3,
-                                            p_tc0[0], p_tc0[1], p_tc0[2],
-                                            p_tc0[3], i_alpha, i_beta,
-                                            i_stride );
-}
-
-void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
-                                int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
-{
-    uint8_t u_bs0 = 1;
-    uint8_t u_bs1 = 1;
-    uint8_t u_bs2 = 1;
-    uint8_t u_bs3 = 1;
-
-    if( p_tc0[0] < 0 ) u_bs0 = 0;
-    if( p_tc0[1] < 0 ) u_bs1 = 0;
-    if( p_tc0[2] < 0 ) u_bs2 = 0;
-    if( p_tc0[3] < 0 ) u_bs3 = 0;
-
-    avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix,
-                                                 u_bs0, u_bs1, u_bs2, u_bs3,
-                                                 p_tc0[0], p_tc0[1], p_tc0[2],
-                                                 p_tc0[3], i_alpha, i_beta,
-                                                 i_stride );
-}
-
-void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
-                                int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
-{
-    uint8_t u_bs0 = 1;
-    uint8_t u_bs1 = 1;
-    uint8_t u_bs2 = 1;
-    uint8_t u_bs3 = 1;
-
-    if( p_tc0[0] < 0 ) u_bs0 = 0;
-    if( p_tc0[1] < 0 ) u_bs1 = 0;
-    if( p_tc0[2] < 0 ) u_bs2 = 0;
-    if( p_tc0[3] < 0 ) u_bs3 = 0;
-
-    avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix,
-                                                 u_bs0, u_bs1, u_bs2, u_bs3,
-                                                 p_tc0[0], p_tc0[1], p_tc0[2],
-                                                 p_tc0[3], i_alpha, i_beta,
-                                                 i_stride );
-}
-
-void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE],
-                                int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
-                                int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
-                                uint8_t pu_bs[2][8][4], int32_t i_mvy_limit,
-                                int32_t i_bframe )
-{
-    if( i_bframe )
-    {
-        for( int32_t i_dir = 0; i_dir < 2; i_dir++ )
-        {
-            int32_t s1 = i_dir ? 1 : 8;
-            int32_t s2 = i_dir ? 8 : 1;
-
-            for( int32_t i_edge = 0; i_edge < 4; i_edge++ )
-            {
-                for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4;
-                     i++, loc += s1 )
-                {
-                    int32_t locn = loc - s2;
-                    if( u_nnz[loc] || u_nnz[locn] )
-                    {
-                        pu_bs[i_dir][i_edge][i] = 2;
-                    }
-                    else if( pi_ref[0][loc] != pi_ref[0][locn] ||
-                             abs(  pi_mv[0][loc][0] -
-                                   pi_mv[0][locn][0]  ) >= 4 ||
-                             abs(  pi_mv[0][loc][1] -
-                                   pi_mv[0][locn][1]  ) >= i_mvy_limit ||
-                             ( i_bframe &&
-                                 ( pi_ref[1][loc] != pi_ref[1][locn] ||
-                                   abs(  pi_mv[1][loc][0] -
-                                         pi_mv[1][locn][0]  ) >= 4 ||
-                                   abs(  pi_mv[1][loc][1] -
-                                         pi_mv[1][locn][1]  ) >= i_mvy_limit ) )
-                           )
-                    {
-                        pu_bs[i_dir][i_edge][i] = 1;
-                    }
-                    else
-                    {
-                        pu_bs[i_dir][i_edge][i] = 0;
-                    }
-                }
-            }
-        }
-    }
-    else
-    {
-        avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit );
-    }
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/macros.h b/android/src/main/libenc/jni/libx264/common/mips/macros.h
deleted file mode 100755
index 8a7b7b2..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/macros.h
+++ /dev/null
@@ -1,1952 +0,0 @@
-/*****************************************************************************
- * macros.h: msa macros
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Rishikesh More <rishikesh.more@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MIPS_MACROS_H
-#define X264_MIPS_MACROS_H
-
-#include <stdint.h>
-#include <msa.h>
-
-#define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
-#define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ )
-#define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ )
-
-#define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
-#define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ )
-
-#define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
-#define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ )
-
-#define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
-#define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ )
-#define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ )
-
-#define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
-#define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ )
-#define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ )
-
-#if ( __mips_isa_rev >= 6 )
-    #define LH( p_src )                              \
-    ( {                                              \
-        uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
-        uint16_t u_val_h_m;                          \
-                                                     \
-        asm volatile (                               \
-            "lh  %[u_val_h_m],  %[p_src_m]  \n\t"    \
-                                                     \
-            : [u_val_h_m] "=r" ( u_val_h_m )         \
-            : [p_src_m] "m" ( *p_src_m )             \
-        );                                           \
-                                                     \
-        u_val_h_m;                                   \
-    } )
-
-    #define LW( p_src )                              \
-    ( {                                              \
-        uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
-        uint32_t u_val_w_m;                          \
-                                                     \
-        asm volatile (                               \
-            "lw  %[u_val_w_m],  %[p_src_m]  \n\t"    \
-                                                     \
-            : [u_val_w_m] "=r" ( u_val_w_m )         \
-            : [p_src_m] "m" ( *p_src_m )             \
-        );                                           \
-                                                     \
-        u_val_w_m;                                   \
-    } )
-
-    #if ( __mips == 64 )
-        #define LD( p_src )                              \
-        ( {                                              \
-            uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
-            uint64_t u_val_d_m = 0;                      \
-                                                         \
-            asm volatile (                               \
-                "ld  %[u_val_d_m],  %[p_src_m]  \n\t"    \
-                                                         \
-                : [u_val_d_m] "=r" ( u_val_d_m )         \
-                : [p_src_m] "m" ( *p_src_m )             \
-            );                                           \
-                                                         \
-            u_val_d_m;                                   \
-        } )
-    #else  // !( __mips == 64 )
-        #define LD( p_src )                                                  \
-        ( {                                                                  \
-            uint8_t *p_src_m = ( uint8_t * ) ( p_src );                      \
-            uint32_t u_val0_m, u_val1_m;                                     \
-            uint64_t u_val_d_m = 0;                                          \
-                                                                             \
-            u_val0_m = LW( p_src_m );                                        \
-            u_val1_m = LW( p_src_m + 4 );                                    \
-                                                                             \
-            u_val_d_m = ( uint64_t ) ( u_val1_m );                           \
-            u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) &                 \
-                                       0xFFFFFFFF00000000 );                 \
-            u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m );  \
-                                                                             \
-            u_val_d_m;                                                       \
-        } )
-    #endif  // ( __mips == 64 )
-
-    #define SH( u_val, p_dst )                       \
-    {                                                \
-        uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
-        uint16_t u_val_h_m = ( u_val );              \
-                                                     \
-        asm volatile (                               \
-            "sh  %[u_val_h_m],  %[p_dst_m]  \n\t"    \
-                                                     \
-            : [p_dst_m] "=m" ( *p_dst_m )            \
-            : [u_val_h_m] "r" ( u_val_h_m )          \
-        );                                           \
-    }
-
-    #define SW( u_val, p_dst )                       \
-    {                                                \
-        uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
-        uint32_t u_val_w_m = ( u_val );              \
-                                                     \
-        asm volatile (                               \
-            "sw  %[u_val_w_m],  %[p_dst_m]  \n\t"    \
-                                                     \
-            : [p_dst_m] "=m" ( *p_dst_m )            \
-            : [u_val_w_m] "r" ( u_val_w_m )          \
-        );                                           \
-    }
-
-    #define SD( u_val, p_dst )                       \
-    {                                                \
-        uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
-        uint64_t u_val_d_m = ( u_val );              \
-                                                     \
-        asm volatile (                               \
-            "sd  %[u_val_d_m],  %[p_dst_m]  \n\t"    \
-                                                     \
-            : [p_dst_m] "=m" ( *p_dst_m )            \
-            : [u_val_d_m] "r" ( u_val_d_m )          \
-        );                                           \
-    }
-
-#else  // !( __mips_isa_rev >= 6 )
-    #define LH( p_src )                              \
-    ( {                                              \
-        uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
-        uint16_t u_val_h_m;                          \
-                                                     \
-        asm volatile (                               \
-            "ulh  %[u_val_h_m],  %[p_src_m]  \n\t"   \
-                                                     \
-            : [u_val_h_m] "=r" ( u_val_h_m )         \
-            : [p_src_m] "m" ( *p_src_m )             \
-        );                                           \
-                                                     \
-        u_val_h_m;                                   \
-    } )
-
-    #define LW( p_src )                              \
-    ( {                                              \
-        uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
-        uint32_t u_val_w_m;                          \
-                                                     \
-        asm volatile (                               \
-            "ulw  %[u_val_w_m],  %[p_src_m]  \n\t"   \
-                                                     \
-            : [u_val_w_m] "=r" ( u_val_w_m )         \
-            : [p_src_m] "m" ( *p_src_m )             \
-        );                                           \
-                                                     \
-        u_val_w_m;                                   \
-    } )
-
-    #if ( __mips == 64 )
-        #define LD( p_src )                              \
-        ( {                                              \
-            uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
-            uint64_t u_val_d_m = 0;                      \
-                                                         \
-            asm volatile (                               \
-                "uld  %[u_val_d_m],  %[p_src_m]  \n\t"   \
-                                                         \
-                : [u_val_d_m] "=r" ( u_val_d_m )         \
-                : [p_src_m] "m" ( *p_src_m )             \
-            );                                           \
-                                                         \
-            u_val_d_m;                                   \
-        } )
-    #else  // !( __mips == 64 )
-        #define LD( p_src )                                                  \
-        ( {                                                                  \
-            uint8_t *psrc_m1 = ( uint8_t * ) ( p_src );                      \
-            uint32_t u_val0_m, u_val1_m;                                     \
-            uint64_t u_val_d_m = 0;                                          \
-                                                                             \
-            u_val0_m = LW( psrc_m1 );                                        \
-            u_val1_m = LW( psrc_m1 + 4 );                                    \
-                                                                             \
-            u_val_d_m = ( uint64_t ) ( u_val1_m );                           \
-            u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) &                 \
-                                       0xFFFFFFFF00000000 );                 \
-            u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m );  \
-                                                                             \
-            u_val_d_m;                                                       \
-        } )
-    #endif  // ( __mips == 64 )
-
-    #define SH( u_val, p_dst )                       \
-    {                                                \
-        uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
-        uint16_t u_val_h_m = ( u_val );              \
-                                                     \
-        asm volatile (                               \
-            "ush  %[u_val_h_m],  %[p_dst_m]  \n\t"   \
-                                                     \
-            : [p_dst_m] "=m" ( *p_dst_m )            \
-            : [u_val_h_m] "r" ( u_val_h_m )          \
-        );                                           \
-    }
-
-    #define SW( u_val, p_dst )                       \
-    {                                                \
-        uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
-        uint32_t u_val_w_m = ( u_val );              \
-                                                     \
-        asm volatile (                               \
-            "usw  %[u_val_w_m],  %[p_dst_m]  \n\t"   \
-                                                     \
-            : [p_dst_m] "=m" ( *p_dst_m )            \
-            : [u_val_w_m] "r" ( u_val_w_m )          \
-        );                                           \
-    }
-
-    #define SD( u_val, p_dst )                                                 \
-    {                                                                          \
-        uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst );                           \
-        uint32_t u_val0_m, u_val1_m;                                           \
-                                                                               \
-        u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF );            \
-        u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF );  \
-                                                                               \
-        SW( u_val0_m, p_dst_m1 );                                              \
-        SW( u_val1_m, p_dst_m1 + 4 );                                          \
-    }
-
-#endif // ( __mips_isa_rev >= 6 )
-
-/* Description : Load 4 words with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
-                 Outputs - out0, out1, out2, out3
-   Details     : Load word in 'out0' from (psrc)
-                 Load word in 'out1' from (psrc + stride)
-                 Load word in 'out2' from (psrc + 2 * stride)
-                 Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4( p_src, stride, out0, out1, out2, out3 )  \
-{                                                     \
-    out0 = LW( ( p_src ) );                           \
-    out1 = LW( ( p_src ) + stride );                  \
-    out2 = LW( ( p_src ) + 2 * stride );              \
-    out3 = LW( ( p_src ) + 3 * stride );              \
-}
-
-/* Description : Store 4 words with stride
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-   Details     : Store word from 'in0' to (pdst)
-                 Store word from 'in1' to (pdst + stride)
-                 Store word from 'in2' to (pdst + 2 * stride)
-                 Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4( in0, in1, in2, in3, p_dst, stride )  \
-{                                                 \
-    SW( in0, ( p_dst ) )                          \
-    SW( in1, ( p_dst ) + stride );                \
-    SW( in2, ( p_dst ) + 2 * stride );            \
-    SW( in3, ( p_dst ) + 3 * stride );            \
-}
-
-/* Description : Store 4 double words with stride
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-   Details     : Store double word from 'in0' to (pdst)
-                 Store double word from 'in1' to (pdst + stride)
-                 Store double word from 'in2' to (pdst + 2 * stride)
-                 Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4( in0, in1, in2, in3, p_dst, stride )  \
-{                                                 \
-    SD( in0, ( p_dst ) )                          \
-    SD( in1, ( p_dst ) + stride );                \
-    SD( in2, ( p_dst ) + 2 * stride );            \
-    SD( in3, ( p_dst ) + 3 * stride );            \
-}
-
-/* Description : Load vectors with 16 byte elements with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Load 16 byte elements in 'out0' from (psrc)
-                 Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2( RTYPE, p_src, stride, out0, out1 )  \
-{                                                  \
-    out0 = LD_B( RTYPE, ( p_src ) );               \
-    out1 = LD_B( RTYPE, ( p_src ) + stride );      \
-}
-#define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ )
-#define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ )
-
-#define LD_B3( RTYPE, p_src, stride, out0, out1, out2 )  \
-{                                                        \
-    LD_B2( RTYPE, ( p_src ), stride, out0, out1 );       \
-    out2 = LD_B( RTYPE, ( p_src ) + 2 * stride );        \
-}
-#define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ )
-#define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ )
-
-#define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 )     \
-{                                                                 \
-    LD_B2( RTYPE, ( p_src ), stride, out0, out1 );                \
-    LD_B2( RTYPE, ( p_src ) + 2 * stride , stride, out2, out3 );  \
-}
-#define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ )
-#define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ )
-
-#define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 )  \
-{                                                                    \
-    LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );       \
-    out4 = LD_B( RTYPE, ( p_src ) + 4 * stride );                    \
-}
-#define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ )
-#define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ )
-
-#define LD_B8( RTYPE, p_src, stride,                                         \
-               out0, out1, out2, out3, out4, out5, out6, out7 )              \
-{                                                                            \
-    LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );               \
-    LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 );  \
-}
-#define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ )
-#define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ )
-
-/* Description : Load vectors with 8 halfword elements with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
-                 Outputs - out0, out1
-   Details     : Load 8 halfword elements in 'out0' from (psrc)
-                 Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2( RTYPE, p_src, stride, out0, out1 )  \
-{                                                  \
-    out0 = LD_H( RTYPE, ( p_src ) );               \
-    out1 = LD_H( RTYPE, ( p_src ) + ( stride ) );  \
-}
-#define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ )
-
-#define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 )    \
-{                                                                \
-    LD_H2( RTYPE, ( p_src ), stride, out0, out1 );               \
-    LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 );  \
-}
-#define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ )
-
-#define LD_H8( RTYPE, p_src, stride,                                         \
-               out0, out1, out2, out3, out4, out5, out6, out7 )              \
-{                                                                            \
-    LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );               \
-    LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 );  \
-}
-#define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ )
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
-                 data into 4 vectors (Each vector with 4 signed halfwords)
-   Arguments   : Inputs  - psrc
-                 Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH( p_src, out0, out1, out2, out3 )                     \
-{                                                                     \
-    out0 = LD_SH( p_src );                                            \
-    out2 = LD_SH( p_src + 8 );                                        \
-    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );  \
-    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 );  \
-}
-
-/* Description : Load 2 vectors of signed word elements with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
-                 Outputs - out0, out1
-                 Return Type - signed word
-*/
-#define LD_SW2( p_src, stride, out0, out1 )    \
-{                                              \
-    out0 = LD_SW( ( p_src ) );                 \
-    out1 = LD_SW( ( p_src ) + stride );        \
-}
-
-/* Description : Store vectors of 16 byte elements with stride
-   Arguments   : Inputs  - in0, in1, stride
-                         - pdst    (destination pointer to store to)
-   Details     : Store 16 byte elements from 'in0' to (pdst)
-                 Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2( RTYPE, in0, in1, p_dst, stride )  \
-{                                                \
-    ST_B( RTYPE, in0, ( p_dst ) );               \
-    ST_B( RTYPE, in1, ( p_dst ) + stride );      \
-}
-#define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ )
-
-#define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride )      \
-{                                                              \
-    ST_B2( RTYPE, in0, in1, ( p_dst ), stride );               \
-    ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride );  \
-}
-#define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ )
-#define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ )
-
-#define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,            \
-               p_dst, stride )                                           \
-{                                                                        \
-    ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride );                   \
-    ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride );  \
-}
-#define ST_UB8( ... ) ST_B8( v16u8, __VA_ARGS__ )
-
-/* Description : Store vectors of 8 halfword elements with stride
-   Arguments   : Inputs  - in0, in1, stride
-                         - pdst    (destination pointer to store to)
-   Details     : Store 8 halfword elements from 'in0' to (pdst)
-                 Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2( RTYPE, in0, in1, p_dst, stride )  \
-{                                                \
-    ST_H( RTYPE, in0, ( p_dst ) );               \
-    ST_H( RTYPE, in1, ( p_dst ) + stride );      \
-}
-#define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ )
-
-#define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride )      \
-{                                                              \
-    ST_H2( RTYPE, in0, in1, ( p_dst ), stride );               \
-    ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride );  \
-}
-#define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ )
-
-#define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride )  \
-{                                                                              \
-    ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride );                     \
-    ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride );        \
-}
-#define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ )
-
-/* Description : Store 2x4 byte block to destination memory from input vector
-   Arguments   : Inputs  - in, stidx, pdst, stride
-   Details     : Index 'stidx' halfword element from 'in' vector is copied to
-                 GP register and stored to (pdst)
-                 Index 'stidx+1' halfword element from 'in' vector is copied to
-                 GP register and stored to (pdst + stride)
-                 Index 'stidx+2' halfword element from 'in' vector is copied to
-                 GP register and stored to (pdst + 2 * stride)
-                 Index 'stidx+3' halfword element from 'in' vector is copied to
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB( in, stidx, p_dst, stride )                   \
-{                                                              \
-    uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;           \
-    uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst );             \
-                                                               \
-    u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) );      \
-    u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) );  \
-    u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) );  \
-    u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) );  \
-                                                               \
-    SH( u_out0_m, pblk_2x4_m );                                \
-    SH( u_out1_m, pblk_2x4_m + stride );                       \
-    SH( u_out2_m, pblk_2x4_m + 2 * stride );                   \
-    SH( u_out3_m, pblk_2x4_m + 3 * stride );                   \
-}
-
-/* Description : Store 4x4 byte block to destination memory from input vector
-   Arguments   : Inputs  - in0, in1, pdst, stride
-   Details     : 'Idx0' word element from input vector 'in0' is copied to
-                 GP register and stored to (pdst)
-                 'Idx1' word element from input vector 'in0' is copied to
-                 GP register and stored to (pdst + stride)
-                 'Idx2' word element from input vector 'in0' is copied to
-                 GP register and stored to (pdst + 2 * stride)
-                 'Idx3' word element from input vector 'in0' is copied to
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride )     \
-{                                                                       \
-    uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                    \
-    uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst );                      \
-                                                                        \
-    u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 );                   \
-    u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 );                   \
-    u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 );                   \
-    u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 );                   \
-                                                                        \
-    SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride );  \
-}
-
-#define ST4x8_UB( in0, in1, p_dst, stride )                           \
-{                                                                     \
-    uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst );                      \
-                                                                      \
-    ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride );               \
-    ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride );  \
-}
-
-/* Description : Store 8x1 byte block to destination memory from input vector
-   Arguments   : Inputs  - in, pdst
-   Details     : Index 0 double word element from 'in' vector is copied to
-                 GP register and stored to (pdst)
-*/
-#define ST8x1_UB( in, p_dst )                      \
-{                                                  \
-    uint64_t u_out0_m;                             \
-    u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 );  \
-    SD( u_out0_m, p_dst );                         \
-}
-
-/* Description : Store 8x4 byte block to destination memory from input
-                 vectors
-   Arguments   : Inputs  - in0, in1, pdst, stride
-   Details     : Index 0 double word element from 'in0' vector is copied to
-                 GP register and stored to (pdst)
-                 Index 1 double word element from 'in0' vector is copied to
-                 GP register and stored to (pdst + stride)
-                 Index 0 double word element from 'in1' vector is copied to
-                 GP register and stored to (pdst + 2 * stride)
-                 Index 1 double word element from 'in1' vector is copied to
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB( in0, in1, p_dst, stride )                             \
-{                                                                       \
-    uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                    \
-    uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst );                      \
-                                                                        \
-    u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 );                      \
-    u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 );                      \
-    u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 );                      \
-    u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 );                      \
-                                                                        \
-    SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride );  \
-}
-
-/* Description : average with rounding (in0 + in1 + 1) / 2.
-   Arguments   : Inputs  - in0, in1, in2, in3,
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned byte element from 'in0' vector is added with
-                 each unsigned byte element from 'in1' vector.
-                 Average with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 )             \
-{                                                                     \
-    out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 );  \
-    out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 );  \
-}
-#define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ )
-
-#define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                  out0, out1, out2, out3 )                        \
-{                                                                 \
-    AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 )             \
-    AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 )             \
-}
-#define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ )
-
-/* Description : Immediate number of elements to slide with zero
-   Arguments   : Inputs  - in0, in1, slide_val
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
-                 value specified in 'slide_val'
-*/
-#define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val )     \
-{                                                               \
-    v16i8 zero_m = { 0 };                                       \
-    out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m,            \
-                                   ( v16i8 ) in0, slide_val );  \
-    out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m,            \
-                                   ( v16i8 ) in1, slide_val );  \
-}
-#define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ )
-
-/* Description : Immediate number of elements to slide
-   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by
-                 value specified in 'slide_val'
-*/
-#define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val )  \
-{                                                                            \
-    out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0,         \
-                                   slide_val );                              \
-    out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1,         \
-                                   slide_val );                              \
-}
-#define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ )
-
-/* Description : Shuffle byte vector elements as per mask vector
-   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Selective byte elements from 'in0' & 'in1' are copied to
-                 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 )  \
-{                                                                       \
-    out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0,                     \
-                                   ( v16i8 ) in1, ( v16i8 ) in0 );      \
-    out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1,                     \
-                                   ( v16i8 ) in3, ( v16i8 ) in2 );      \
-}
-#define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ )
-#define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ )
-
-/* Description : Shuffle halfword vector elements as per mask vector
-   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Selective byte elements from 'in0' & 'in1' are copied to
-                 'out0' as per control vector 'mask0'
-*/
-#define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 )  \
-{                                                                       \
-    out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0,                     \
-                                   ( v8i16 ) in1, ( v8i16 ) in0 );      \
-    out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1,                     \
-                                   ( v8i16 ) in3, ( v8i16 ) in2 );      \
-}
-#define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ )
-
-/* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Unsigned byte elements from 'mult0' are multiplied with
-                 unsigned byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. unsigned halfword.
-                 Multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
-{                                                                         \
-    out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 );  \
-    out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 );  \
-}
-#define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ )
-
-#define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3,            \
-                  cnst0, cnst1, cnst2, cnst3,                   \
-                  out0, out1, out2, out3 )                      \
-{                                                               \
-    DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );  \
-    DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );  \
-}
-#define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ )
-
-/* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed byte elements from 'mult0' are multiplied with
-                 signed byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed halfword.
-                 Multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
-{                                                                          \
-    out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0,                      \
-                                      ( v16i8 ) mult0, ( v16i8 ) cnst0 );  \
-    out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1,                      \
-                                      ( v16i8 ) mult1, ( v16i8 ) cnst1 );  \
-}
-#define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ )
-
-#define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3,                    \
-                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 )  \
-{                                                                        \
-    DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );          \
-    DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );          \
-}
-#define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ )
-
-/* Description : Dot product of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'mult0' are multiplied with
-                 signed halfword elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed word.
-                 Multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
-{                                                                          \
-    out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0,                      \
-                                      ( v8i16 ) mult0, ( v8i16 ) cnst0 );  \
-    out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1,                      \
-                                      ( v8i16 ) mult1, ( v8i16 ) cnst1 );  \
-}
-#define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ )
-
-/* Description : Clips all halfword elements of input vector between min & max
-                 out = (in < min) ? min : ((in > max) ? max : in)
-   Arguments   : Inputs  - in, min, max
-                 Output - out_m
-                 Return Type - signed halfword
-*/
-#define CLIP_SH( in, min, max )                               \
-( {                                                           \
-    v8i16 out_m;                                              \
-                                                              \
-    out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in );     \
-    out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m );  \
-    out_m;                                                    \
-} )
-
-/* Description : Clips all signed halfword elements of input vector
-                 between 0 & 255
-   Arguments   : Input  - in
-                 Output - out_m
-                 Return Type - signed halfword
-*/
-#define CLIP_SH_0_255( in )                                     \
-( {                                                             \
-    v8i16 max_m = __msa_ldi_h( 255 );                           \
-    v8i16 out_m;                                                \
-                                                                \
-    out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 );                  \
-    out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m );  \
-    out_m;                                                      \
-} )
-#define CLIP_SH2_0_255( in0, in1 )  \
-{                                   \
-    in0 = CLIP_SH_0_255( in0 );     \
-    in1 = CLIP_SH_0_255( in1 );     \
-}
-#define CLIP_SH4_0_255( in0, in1, in2, in3 )  \
-{                                             \
-    CLIP_SH2_0_255( in0, in1 );               \
-    CLIP_SH2_0_255( in2, in3 );               \
-}
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
-   Arguments   : Input  - in       (signed word vector)
-                 Output - sum_m    (i32 sum)
-                 Return Type - signed word (GP)
-   Details     : 4 signed word elements of 'in' vector are added together and
-                 the resulting integer sum is returned
-*/
-#define HADD_SW_S32( in )                                   \
-( {                                                         \
-    v2i64 res0_m, res1_m;                                   \
-    int32_t i_sum_m;                                        \
-                                                            \
-    res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in );  \
-    res1_m = __msa_splati_d( res0_m, 1 );                   \
-    res0_m = res0_m + res1_m;                               \
-    i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 );        \
-    i_sum_m;                                                \
-} )
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
-   Arguments   : Input  - in       (signed word vector)
-                 Output - sum_m    (i32 sum)
-                 Return Type - signed word (GP)
-   Details     : 4 signed word elements of 'in' vector are added together and
-                 the resulting integer sum is returned
-*/
-#define HADD_UH_U32( in )                                      \
-( {                                                            \
-    v4u32 res_m;                                               \
-    v2u64 res0_m, res1_m;                                      \
-    uint32_t u_sum_m;                                          \
-                                                               \
-    res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in );      \
-    res0_m = __msa_hadd_u_d( res_m, res_m );                   \
-    res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 );  \
-    res0_m = res0_m + res1_m;                                  \
-    u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 );           \
-    u_sum_m;                                                   \
-} )
-
-/* Description : Horizontal addition of signed byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each signed odd byte element from 'in0' is added to
-                 even signed byte element from 'in0' (pairwise) and the
-                 halfword result is written in 'out0'
-*/
-#define HADD_SB2( RTYPE, in0, in1, out0, out1 )                       \
-{                                                                     \
-    out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 );  \
-    out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 );  \
-}
-#define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                      \
-    HADD_SB2( RTYPE, in0, in1, out0, out1 );                           \
-    HADD_SB2( RTYPE, in2, in3, out2, out3 );                           \
-}
-#define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ )
-
-/* Description : Horizontal addition of unsigned byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned odd byte element from 'in0' is added to
-                 even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is written to 'out0'
-*/
-#define HADD_UB2( RTYPE, in0, in1, out0, out1 )                       \
-{                                                                     \
-    out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 );  \
-    out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 );  \
-}
-#define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ )
-
-#define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                      \
-    HADD_UB2( RTYPE, in0, in1, out0, out1 );                           \
-    HADD_UB2( RTYPE, in2, in3, out2, out3 );                           \
-}
-#define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ )
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned odd byte element from 'in0' is subtracted from
-                 even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is written to 'out0'
-*/
-#define HSUB_UB2( RTYPE, in0, in1, out0, out1 )                       \
-{                                                                     \
-    out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 );  \
-    out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 );  \
-}
-#define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ )
-
-#define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                      \
-    HSUB_UB2( RTYPE, in0, in1, out0, out1 );                           \
-    HSUB_UB2( RTYPE, in2, in3, out2, out3 );                           \
-}
-#define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ )
-
-/* Description : SAD (Sum of Absolute Difference)
-   Arguments   : Inputs  - in0, in1, ref0, ref1
-                 Outputs - sad_m                 (halfword vector)
-                 Return Type - unsigned halfword
-   Details     : Absolute difference of all the byte elements from 'in0' with
-                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
-                 pairs are added together to generate 8 halfword results.
-*/
-#define SAD_UB2_UH( in0, in1, ref0, ref1 )                            \
-( {                                                                   \
-    v16u8 diff0_m, diff1_m;                                           \
-    v8u16 sad_m = { 0 };                                              \
-                                                                      \
-    diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 );        \
-    diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 );        \
-                                                                      \
-    sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m );  \
-    sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m );  \
-                                                                      \
-    sad_m;                                                            \
-} )
-
-/* Description : Set element n input vector to GPR value
-   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
-                 Output - out                 (output vector)
-                 Return Type - as per RTYPE
-   Details     : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2( RTYPE, in0, in1, out )                     \
-{                                                             \
-    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 );  \
-    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 );  \
-}
-#define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ )
-
-#define INSERT_W4( RTYPE, in0, in1, in2, in3, out )           \
-{                                                             \
-    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 );  \
-    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 );  \
-    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 );  \
-    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 );  \
-}
-#define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ )
-#define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ )
-
-#define INSERT_D2( RTYPE, in0, in1, out )                     \
-{                                                             \
-    out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 );  \
-    out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 );  \
-}
-#define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ )
-
-/* Description : Interleave even halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 );  \
-    out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 );  \
-}
-#define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ )
-
-/* Description : Interleave even double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even double word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 );  \
-    out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 );  \
-}
-#define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ )
-
-/* Description : Interleave left half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'.
-*/
-#define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
-}
-#define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ )
-#define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ )
-
-#define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3 )                        \
-{                                                                \
-    ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ )
-#define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ )
-#define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ )
-#define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ )
-
-/* Description : Interleave left half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of halfword elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
-}
-#define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ )
-#define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ )
-
-#define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3 )                        \
-{                                                                \
-    ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ )
-
-/* Description : Interleave left half of word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'.
-*/
-#define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 );  \
-}
-#define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ )
-
-/* Description : Interleave right half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
-                 and written to out0.
-*/
-#define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
-}
-#define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ )
-#define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ )
-#define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ )
-
-#define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3 )                        \
-{                                                                \
-    ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ )
-#define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ )
-#define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ )
-#define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ )
-
-/* Description : Interleave right half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of halfword elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
-}
-#define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ )
-#define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ )
-
-#define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3 )                        \
-{                                                                \
-    ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ )
-#define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ )
-
-#define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 );  \
-}
-#define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ )
-
-/* Description : Interleave right half of double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of double word elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 )                    \
-{                                                                           \
-    out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) );  \
-    out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) );  \
-}
-#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ )
-#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ )
-#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ )
-
-#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3 )                        \
-{                                                                \
-    ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ )
-
-/* Description : Interleave both left and right half of input vectors
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of byte elements from 'in0' and 'in1' are
-                 interleaved and written to 'out0'
-*/
-#define ILVRL_B2( RTYPE, in0, in1, out0, out1 )                     \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
-}
-#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ )
-#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ )
-#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ )
-#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ )
-#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ )
-
-#define ILVRL_H2( RTYPE, in0, in1, out0, out1 )                     \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
-}
-#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ )
-#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ )
-
-#define ILVRL_W2( RTYPE, in0, in1, out0, out1 )                     \
-{                                                                   \
-    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
-    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
-}
-#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ )
-#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ )
-
-/* Description : Maximum values between signed elements of vector and
-                 5-bit signed immediate value are copied to the output vector
-   Arguments   : Inputs  - in0, in1, in2, in3, max_val
-                 Outputs - in place operation
-                 Return Type - unsigned halfword
-   Details     : Maximum of signed halfword element values from 'in0' and
-                 'max_val' are written in place
-*/
-#define MAXI_SH2( RTYPE, in0, in1, max_val )                       \
-{                                                                  \
-    in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) );  \
-    in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) );  \
-}
-#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ )
-#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ )
-
-#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val )  \
-{                                                       \
-    MAXI_SH2( RTYPE, in0, in1, max_val );               \
-    MAXI_SH2( RTYPE, in2, in3, max_val );               \
-}
-#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ )
-
-/* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val + 1 bits)
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range.
-                 The results are written in place
-*/
-#define SAT_UH2( RTYPE, in0, in1, sat_val )                   \
-{                                                             \
-    in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val );  \
-    in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val );  \
-}
-#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ )
-
-#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val )  \
-{                                                      \
-    SAT_UH2( RTYPE, in0, in1, sat_val );               \
-    SAT_UH2( RTYPE, in2, in3, sat_val )                \
-}
-#define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ )
-
-/* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val+1 bits)
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range
-                 The results are written in place
-*/
-#define SAT_SH2( RTYPE, in0, in1, sat_val )                   \
-{                                                             \
-    in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val );  \
-    in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val );  \
-}
-#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ )
-
-#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val )  \
-{                                                      \
-    SAT_SH2( RTYPE, in0, in1, sat_val );               \
-    SAT_SH2( RTYPE, in2, in3, sat_val );               \
-}
-#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ )
-
-/* Description : Saturate the word element values to the max
-                 unsigned value of (sat_val+1 bits)
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned word element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range
-                 The results are written in place
-*/
-#define SAT_SW2( RTYPE, in0, in1, sat_val )                   \
-{                                                             \
-    in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val );  \
-    in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val );  \
-}
-#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ )
-
-/* Description : Pack even byte elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even byte elements of 'in0' are copied to the left half of
-                 'out0' & even byte elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
-    out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
-}
-#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ )
-#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ )
-#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ )
-#define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ )
-
-#define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \
-{                                                                         \
-    PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );                    \
-    out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 );       \
-}
-#define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ )
-
-#define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                  out0, out1, out2, out3 )                        \
-{                                                                 \
-    PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ )
-#define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ )
-
-/* Description : Pack even halfword elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' are copied to the left half of
-                 'out0' & even halfword elements of 'in1' are copied to the
-                 right half of 'out0'.
-*/
-#define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
-    out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
-}
-#define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ )
-
-#define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                  out0, out1, out2, out3 )                        \
-{                                                                 \
-    PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ )
-
-/* Description : Pack even double word elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even double elements of 'in0' are copied to the left half of
-                 'out0' & even double elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 );  \
-    out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 );  \
-}
-#define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ )
-
-#define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                  out0, out1, out2, out3 )                        \
-{                                                                 \
-    PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ )
-
-/* Description : Pack odd byte elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Odd byte elements of 'in0' are copied to the left half of
-                 'out0' & odd byte elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
-    out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
-}
-#define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ )
-
-#define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                  out0, out1, out2, out3 )                        \
-{                                                                 \
-    PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ )
-
-/* Description : Pack odd double word elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Odd double word elements of 'in0' are copied to the left half
-                 of 'out0' & odd double word elements of 'in1' are copied to
-                 the right half of 'out0'.
-*/
-#define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
-{                                                                    \
-    out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 );  \
-    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 );  \
-}
-#define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ )
-#define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ )
-
-/* Description : Each byte element is logically xor'ed with immediate 128
-   Arguments   : Inputs  - in0, in1
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned byte element from input vector 'in0' is
-                 logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128( RTYPE, in0, in1 )                   \
-{                                                        \
-    in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 );  \
-    in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 );  \
-}
-#define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ )
-#define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ )
-
-#define XORI_B3_128( RTYPE, in0, in1, in2 )              \
-{                                                        \
-    XORI_B2_128( RTYPE, in0, in1 );                      \
-    in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 );  \
-}
-#define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ )
-
-#define XORI_B4_128( RTYPE, in0, in1, in2, in3 )  \
-{                                                 \
-    XORI_B2_128( RTYPE, in0, in1 );               \
-    XORI_B2_128( RTYPE, in2, in3 );               \
-}
-#define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ )
-#define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ )
-
-#define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 )  \
-{                                                      \
-    XORI_B3_128( RTYPE, in0, in1, in2 );               \
-    XORI_B2_128( RTYPE, in3, in4 );                    \
-}
-#define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ )
-
-/* Description : Addition of signed halfword elements and signed saturation
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'in0' are added to signed
-                 halfword elements of 'in1'. The result is then signed saturated
-                 between halfword data type range
-*/
-#define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 )             \
-{                                                                     \
-    out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
-    out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
-}
-#define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ )
-
-#define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                  out0, out1, out2, out3 )                        \
-{                                                                 \
-    ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
-    ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
-}
-#define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ )
-
-/* Description : Shift left all elements of vector (generic for all data types)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per input vector RTYPE
-   Details     : Each element of vector 'in0' is left shifted by 'shift' and
-                 the result is written in-place.
-*/
-#define SLLI_4V( in0, in1, in2, in3, shift )  \
-{                                             \
-    in0 = in0 << shift;                       \
-    in1 = in1 << shift;                       \
-    in2 = in2 << shift;                       \
-    in3 = in3 << shift;                       \
-}
-
-/* Description : Arithmetic shift right all elements of vector
-                 (generic for all data types)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per input vector RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V( in0, in1, in2, in3, shift )  \
-{                                            \
-    in0 = in0 >> shift;                      \
-    in1 = in1 >> shift;                      \
-    in2 = in2 >> shift;                      \
-    in3 = in3 >> shift;                      \
-}
-
-/* Description : Shift right arithmetic rounded halfwords
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetic by
-                 number of bits respective element holds in vector 'shift'.
-                 The last discarded bit is added to shifted value for rounding
-                 and the result is written in-place.
-                 'shift' is a vector.
-*/
-#define SRAR_H2( RTYPE, in0, in1, shift )                            \
-{                                                                    \
-    in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift );  \
-    in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift );  \
-}
-#define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ )
-
-#define SRAR_H4( RTYPE, in0, in1, in2, in3, shift )  \
-{                                                    \
-    SRAR_H2( RTYPE, in0, in1, shift )                \
-    SRAR_H2( RTYPE, in2, in3, shift )                \
-}
-#define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ )
-
-/* Description : Shift right logical all halfword elements of vector
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right logical by
-                 number of bits respective element holds in vector 'shift' and
-                 the result is stored in-place.'shift' is a vector.
-*/
-#define SRL_H4( RTYPE, in0, in1, in2, in3, shift )                  \
-{                                                                   \
-    in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift );  \
-    in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift );  \
-    in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift );  \
-    in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift );  \
-}
-#define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ )
-
-/* Description : Shift right arithmetic rounded (immediate)
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetic by
-                 value in 'shift'. The last discarded bit is added to shifted
-                 value for rounding and the result is written in-place.
-                 'shift' is an immediate value.
-*/
-#define SRARI_H2( RTYPE, in0, in1, shift )                  \
-{                                                           \
-    in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift );  \
-    in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift );  \
-}
-#define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ )
-#define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ )
-
-#define SRARI_H4( RTYPE, in0, in1, in2, in3, shift )    \
-{                                                       \
-    SRARI_H2( RTYPE, in0, in1, shift );                 \
-    SRARI_H2( RTYPE, in2, in3, shift );                 \
-}
-#define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ )
-#define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ )
-
-#define SRARI_W2( RTYPE, in0, in1, shift )                  \
-{                                                           \
-    in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift );  \
-    in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift );  \
-}
-#define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ )
-
-#define SRARI_W4( RTYPE, in0, in1, in2, in3, shift )  \
-{                                                     \
-    SRARI_W2( RTYPE, in0, in1, shift );               \
-    SRARI_W2( RTYPE, in2, in3, shift );               \
-}
-#define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ )
-
-/* Description : Multiplication of pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element from 'in0' is multiplied with elements from 'in1'
-                 and the result is written to 'out0'
-*/
-#define MUL2( in0, in1, in2, in3, out0, out1 )  \
-{                                               \
-    out0 = in0 * in1;                           \
-    out1 = in2 * in3;                           \
-}
-#define MUL4( in0, in1, in2, in3, in4, in5, in6, in7,  \
-              out0, out1, out2, out3 )                 \
-{                                                      \
-    MUL2( in0, in1, in2, in3, out0, out1 );            \
-    MUL2( in4, in5, in6, in7, out2, out3 );            \
-}
-
-/* Description : Addition of 2 pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element in 'in0' is added to 'in1' and result is written
-                 to 'out0'.
-*/
-#define ADD2( in0, in1, in2, in3, out0, out1 )  \
-{                                               \
-    out0 = in0 + in1;                           \
-    out1 = in2 + in3;                           \
-}
-#define ADD4( in0, in1, in2, in3, in4, in5, in6, in7,  \
-              out0, out1, out2, out3 )                 \
-{                                                      \
-    ADD2( in0, in1, in2, in3, out0, out1 );            \
-    ADD2( in4, in5, in6, in7, out2, out3 );            \
-}
-
-#define SUB4( in0, in1, in2, in3, in4, in5, in6, in7,  \
-              out0, out1, out2, out3 )                 \
-{                                                      \
-    out0 = in0 - in1;                                  \
-    out1 = in2 - in3;                                  \
-    out2 = in4 - in5;                                  \
-    out3 = in6 - in7;                                  \
-}
-
-/* Description : Sign extend halfword elements from right half of the vector
-   Arguments   : Input  - in    (halfword vector)
-                 Output - out   (sign extended word vector)
-                 Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved with same vector 'in0' to generate
-                 4 word elements keeping sign intact
-*/
-#define UNPCK_R_SH_SW( in, out )                           \
-{                                                          \
-    v8i16 sign_m;                                          \
-                                                           \
-    sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 );            \
-    out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in );  \
-}
-
-/* Description : Zero extend unsigned byte elements to halfword elements
-   Arguments   : Input  - in           (unsigned byte vector)
-                 Outputs - out0, out1  (unsigned  halfword vectors)
-                 Return Type - signed halfword
-   Details     : Zero extended right half of vector is returned in 'out0'
-                 Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH( in, out0, out1 )       \
-{                                           \
-    v16i8 zero_m = { 0 };                   \
-                                            \
-    ILVRL_B2_SH( zero_m, in, out0, out1 );  \
-}
-
-/* Description : Sign extend halfword elements from input vector and return
-                 the result in pair of vectors
-   Arguments   : Input  - in            (halfword vector)
-                 Outputs - out0, out1   (sign extended word vectors)
-                 Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved right with same vector 'in0' to
-                 generate 4 signed word elements in 'out0'
-                 Then interleaved left with same vector 'in0' to
-                 generate 4 signed word elements in 'out1'
-*/
-#define UNPCK_SH_SW( in, out0, out1 )           \
-{                                               \
-    v8i16 tmp_m;                                \
-                                                \
-    tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 );  \
-    ILVRL_H2_SW( tmp_m, in, out0, out1 );       \
-}
-
-/* Description : Butterfly of 4 input vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                  \
-    out0 = in0 + in3;                                              \
-    out1 = in1 + in2;                                              \
-                                                                   \
-    out2 = in1 - in2;                                              \
-    out3 = in0 - in3;                                              \
-}
-
-/* Description : Butterfly of 8 input vectors
-   Arguments   : Inputs  - in0 ...  in7
-                 Outputs - out0 .. out7
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7,           \
-                     out0, out1, out2, out3, out4, out5, out6, out7 )  \
-{                                                                      \
-    out0 = in0 + in7;                                                  \
-    out1 = in1 + in6;                                                  \
-    out2 = in2 + in5;                                                  \
-    out3 = in3 + in4;                                                  \
-                                                                       \
-    out4 = in3 - in4;                                                  \
-    out5 = in2 - in5;                                                  \
-    out6 = in1 - in6;                                                  \
-    out7 = in0 - in7;                                                  \
-}
-
-/* Description : Transpose input 8x8 byte block
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
-                         out0, out1, out2, out3, out4, out5, out6, out7 )  \
-{                                                                          \
-    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
-    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
-                                                                           \
-    ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5,                    \
-                tmp0_m, tmp1_m, tmp2_m, tmp3_m );                          \
-    ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m );                         \
-    ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m );                         \
-    ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 );                         \
-    ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 );                         \
-    SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 );                         \
-    SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 );                         \
-}
-#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ )
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
-                           in8, in9, in10, in11, in12, in13, in14, in15
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7,           \
-                             in8, in9, in10, in11, in12, in13, in14, in15,     \
-                             out0, out1, out2, out3, out4, out5, out6, out7 )  \
-{                                                                              \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
-    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
-                                                                               \
-    ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 );                             \
-    ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 );                           \
-    ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 );                           \
-    ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 );                           \
-                                                                               \
-    tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 );        \
-    tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 );        \
-    tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 );        \
-    tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 );        \
-    out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 );          \
-    tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 );        \
-    out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 );          \
-    tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 );        \
-                                                                               \
-    ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m );                 \
-    out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-    out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-                                                                               \
-    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );    \
-    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 );        \
-    out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-    out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-                                                                               \
-    ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m );             \
-    out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-    out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-                                                                               \
-    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );    \
-    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );    \
-    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );    \
-    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );    \
-    out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-    out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
-}
-
-/* Description : Transpose 4x4 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                         \
-    v8i16 s0_m, s1_m;                                                     \
-                                                                          \
-    ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m );                         \
-    ILVRL_W2_SH( s1_m, s0_m, out0, out2 );                                \
-    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );      \
-    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 );      \
-}
-
-/* Description : Transpose 4x8 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7,           \
-                            out0, out1, out2, out3, out4, out5, out6, out7 )  \
-{                                                                             \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
-    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                     \
-    v8i16 zero_m = { 0 };                                                     \
-                                                                              \
-    ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6,                       \
-                tmp0_n, tmp1_n, tmp2_n, tmp3_n );                             \
-    ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m );                            \
-    ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m );                            \
-                                                                              \
-    out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );      \
-    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );      \
-    out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );      \
-    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );      \
-                                                                              \
-    out4 = zero_m;                                                            \
-    out5 = zero_m;                                                            \
-    out6 = zero_m;                                                            \
-    out7 = zero_m;                                                            \
-}
-
-/* Description : Transpose 8x4 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                         \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
-                                                                          \
-    ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m );                     \
-    ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m );                     \
-    ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 );             \
-    ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 );             \
-}
-
-/* Description : Transpose 8x8 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
-                        out0, out1, out2, out3, out4, out5, out6, out7 )   \
-{                                                                          \
-    v8i16 s0_m, s1_m;                                                      \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
-    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
-                                                                           \
-    ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                          \
-    ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m );                             \
-    ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                          \
-    ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m );                             \
-    ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                          \
-    ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m );                             \
-    ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                          \
-    ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m );                             \
-    PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,       \
-              tmp3_m, tmp7_m, out0, out2, out4, out6 );                    \
-    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );  \
-    out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );  \
-    out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );  \
-    out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );  \
-}
-#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ )
-
-/* Description : Transpose 4x4 block with word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 )  \
-{                                                                         \
-    v4i32 s0_m, s1_m, s2_m, s3_m;                                         \
-                                                                          \
-    ILVRL_W2_SW( in1, in0, s0_m, s1_m );                                  \
-    ILVRL_W2_SW( in3, in2, s2_m, s3_m );                                  \
-                                                                          \
-    out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
-    out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
-    out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
-    out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
-}
-
-/* Description : Add block 4x4
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-   Details     : Least significant 4 bytes from each input vector are added to
-                 the destination bytes, clipped between 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )        \
-{                                                                   \
-    uint32_t src0_m, src1_m, src2_m, src3_m;                        \
-    uint32_t out0_m, out1_m, out2_m, out3_m;                        \
-    v8i16 inp0_m, inp1_m, res0_m, res1_m;                           \
-    v16i8 dst0_m = { 0 };                                           \
-    v16i8 dst1_m = { 0 };                                           \
-    v16i8 zero_m = { 0 };                                           \
-                                                                    \
-    ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m )                \
-    LW4( p_dst, stride,  src0_m, src1_m, src2_m, src3_m );          \
-    INSERT_W2_SB( src0_m, src1_m, dst0_m );                         \
-    INSERT_W2_SB( src2_m, src3_m, dst1_m );                         \
-    ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m );   \
-    ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m );         \
-    CLIP_SH2_0_255( res0_m, res1_m );                               \
-    PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m );  \
-                                                                    \
-    out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 );                 \
-    out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 );                 \
-    out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 );                 \
-    out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 );                 \
-    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );           \
-}
-
-/* Description : Dot product and addition of 3 signed halfword input vectors
-   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
-                 Output - out0_m
-                 Return Type - signed halfword
-   Details     : Dot product of 'in0' with 'coeff0'
-                 Dot product of 'in1' with 'coeff1'
-                 Dot product of 'in2' with 'coeff2'
-                 Addition of all the 3 vector results
-                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
-*/
-#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 )             \
-( {                                                                       \
-    v8i16 tmp1_m;                                                         \
-    v8i16 out0_m;                                                         \
-                                                                          \
-    out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 );           \
-    out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 );  \
-    tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 );           \
-    out0_m = __msa_adds_s_h( out0_m, tmp1_m );                            \
-                                                                          \
-    out0_m;                                                               \
-} )
-
-/* Description : Pack even elements of input vectors & xor with 128
-   Arguments   : Inputs  - in0, in1
-                 Output - out_m
-                 Return Type - unsigned byte
-   Details     : Signed byte even elements from 'in0' and 'in1' are packed
-                 together in one vector and the resulting vector is xor'ed with
-                 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB( in0, in1 )                                  \
-( {                                                                   \
-    v16u8 out_m;                                                      \
-    out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
-    out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 );           \
-    out_m;                                                            \
-} )
-
-/* Description : Pack even byte elements, extract 0 & 2 index words from pair
-                 of results and store 4 words in destination memory as per
-                 stride
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-*/
-#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )  \
-{                                                            \
-    uint32_t out0_m, out1_m, out2_m, out3_m;                 \
-    v16i8 tmp0_m, tmp1_m;                                    \
-                                                             \
-    PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m );       \
-                                                             \
-    out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 );          \
-    out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 );          \
-    out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 );          \
-    out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 );          \
-                                                             \
-    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );    \
-}
-
-/* Description : Pack even byte elements and store byte vector in destination
-                 memory
-   Arguments   : Inputs  - in0, in1, pdst
-*/
-#define PCKEV_ST_SB( in0, in1, p_dst )                      \
-{                                                           \
-    v16i8 tmp_m;                                            \
-    tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
-    ST_SB( tmp_m, ( p_dst ) );                              \
-}
-
-#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 )    \
-( {                                                                        \
-    v4i32 tmp0_m, tmp1_m;                                                  \
-    v8i16 out0_m, out1_m, out2_m, out3_m;                                  \
-    v8i16 minus5h_m = __msa_ldi_h( -5 );                                   \
-    v8i16 plus20h_m = __msa_ldi_h( 20 );                                   \
-                                                                           \
-    ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m );                               \
-                                                                           \
-    tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m );         \
-    tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m );         \
-                                                                           \
-    ILVRL_H2_SH( in1, in4, out0_m, out1_m );                               \
-    DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m );  \
-    ILVRL_H2_SH( in2, in3, out2_m, out3_m );                               \
-    DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m );  \
-                                                                           \
-    SRARI_W2_SW( tmp0_m, tmp1_m, 10 );                                     \
-    SAT_SW2_SW( tmp0_m, tmp1_m, 7 );                                       \
-    out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );          \
-                                                                           \
-    out0_m;                                                                \
-} )
-
-#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 )      \
-( {                                                        \
-    v8i16 out0_m, out1_m;                                  \
-    v16i8 tmp0_m, tmp1_m;                                  \
-    v16i8 minus5b = __msa_ldi_b( -5 );                     \
-    v16i8 plus20b = __msa_ldi_b( 20 );                     \
-                                                           \
-    tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in );      \
-    out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m );             \
-                                                           \
-    tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in );      \
-    out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m );   \
-                                                           \
-    tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in );  \
-    out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m );   \
-                                                           \
-    out1_m;                                                \
-} )
-
-#endif  /* X264_MIPS_MACROS_H */
diff --git a/android/src/main/libenc/jni/libx264/common/mips/mc-c.c b/android/src/main/libenc/jni/libx264/common/mips/mc-c.c
deleted file mode 100755
index cdf25ec..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/mc-c.c
+++ /dev/null
@@ -1,3809 +0,0 @@
-/*****************************************************************************
- * mc-c.c: msa motion compensation
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Neha Rana <neha.rana@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macros.h"
-#include "mc.h"
-
-#if !HIGH_BIT_DEPTH
-static const uint8_t pu_luma_mask_arr[16 * 8] =
-{
-    /* 8 width cases */
-    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
-    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
-    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-    /* 4 width cases */
-    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
-    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
-    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
-    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
-    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
-};
-
-static const uint8_t pu_chroma_mask_arr[16 * 5] =
-{
-    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
-    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
-    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
-    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
-};
-
-void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                           uint8_t *p_src, intptr_t i_src_stride,
-                           int32_t i_height );
-void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                          uint8_t *p_src, intptr_t i_src_stride,
-                          int32_t i_height );
-void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
-                          intptr_t i_src_stride, int32_t i_height );
-void x264_memzero_aligned_msa( void *p_dst, size_t n );
-
-void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                               uint8_t *p_pix2, intptr_t i_pix2_stride,
-                               uint8_t *p_pix3, intptr_t i_pix3_stride,
-                               int32_t i_weight );
-void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                              uint8_t *p_pix2, intptr_t i_pix2_stride,
-                              uint8_t *p_pix3, intptr_t i_pix3_stride,
-                              int32_t i_weight );
-void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                              uint8_t *p_pix2, intptr_t i_pix2_stride,
-                              uint8_t *p_pix3, intptr_t i_pix3_stride,
-                              int32_t i_weight );
-void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                             uint8_t *p_pix2, intptr_t i_pix2_stride,
-                             uint8_t *p_pix3, intptr_t i_pix3_stride,
-                             int32_t i_weight );
-void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                             uint8_t *p_pix2, intptr_t i_pix2_stride,
-                             uint8_t *p_pix3, intptr_t i_pix3_stride,
-                             int32_t i_weight );
-void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                              uint8_t *p_pix2, intptr_t pix2_stride,
-                              uint8_t *p_pix3, intptr_t pix3_stride,
-                              int32_t i_weight );
-void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                             uint8_t *p_pix2, intptr_t i_pix2_stride,
-                             uint8_t *p_pix3, intptr_t i_pix3_stride,
-                             int32_t i_weight );
-void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                             uint8_t *p_pix2, intptr_t i_pix2_stride,
-                             uint8_t *p_pix3, intptr_t i_pix3_stride,
-                             int32_t i_weight );
-void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
-                             uint8_t *p_pix2, intptr_t i_pix2_stride,
-                             uint8_t *p_pix3, intptr_t i_pix3_stride,
-                             int32_t i_weight );
-
-void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                             uint8_t *p_src, intptr_t i_src_stride,
-                             const x264_weight_t *pWeight, int32_t i_height );
-void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                            uint8_t *p_src, intptr_t i_src_stride,
-                            const x264_weight_t *pWeight, int32_t i_height );
-void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                            uint8_t *p_src, intptr_t i_src_stride,
-                            const x264_weight_t *pWeight, int32_t i_height );
-void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                             uint8_t *p_src, intptr_t i_src_stride,
-                             const x264_weight_t *pWeight, int32_t i_height );
-
-weight_fn_t x264_mc_weight_wtab_msa[6] =
-{
-    x264_mc_weight_w4_msa,
-    x264_mc_weight_w4_msa,
-    x264_mc_weight_w8_msa,
-    x264_mc_weight_w16_msa,
-    x264_mc_weight_w16_msa,
-    x264_mc_weight_w20_msa,
-};
-
-void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                       uint8_t *p_src[4], intptr_t i_src_stride,
-                       int32_t m_vx, int32_t m_vy,
-                       int32_t i_width, int32_t i_height,
-                       const x264_weight_t *pWeight );
-uint8_t *x264_get_ref_msa( uint8_t *p_dst,   intptr_t *p_dst_stride,
-                           uint8_t *p_src[4], intptr_t i_src_stride,
-                           int32_t m_vx, int32_t m_vy,
-                           int32_t i_width, int32_t i_height,
-                           const x264_weight_t *pWeight );
-void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
-                         intptr_t i_dst_stride,
-                         uint8_t *p_src, intptr_t i_src_stride,
-                         int32_t m_vx, int32_t m_vy,
-                         int32_t i_width, int32_t i_height );
-void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
-                           uint8_t *p_dstc, uint8_t *p_src,
-                           intptr_t i_stride, int32_t i_width,
-                           int32_t i_height, int16_t *p_buf );
-
-void x264_plane_copy_interleave_msa( uint8_t *p_dst,  intptr_t i_dst_stride,
-                                     uint8_t *p_src0, intptr_t i_src_stride0,
-                                     uint8_t *p_src1, intptr_t i_src_stride1,
-                                     int32_t i_width, int32_t i_height );
-void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
-                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
-                                       uint8_t *p_src,  intptr_t i_src_stride,
-                                       int32_t i_width, int32_t i_height );
-void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
-                                           intptr_t i_dst_stride0,
-                                           uint8_t *p_dst1,
-                                           intptr_t i_dst_stride1,
-                                           uint8_t *p_dst2,
-                                           intptr_t i_dst_stride2,
-                                           uint8_t *p_src,
-                                           intptr_t i_src_stride,
-                                           int32_t i_src_width, int32_t i_width,
-                                           int32_t i_height );
-void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                                       uint8_t *p_src0, uint8_t *p_src1,
-                                       int32_t i_height );
-void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
-                                             intptr_t i_src_stride,
-                                             int32_t i_height );
-void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
-                                             intptr_t i_src_stride,
-                                             int32_t i_height );
-void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
-                                      uint8_t *p_dst1, uint8_t *p_dst2,
-                                      uint8_t *p_dst3, intptr_t i_src_stride,
-                                      intptr_t i_dst_stride, int32_t i_width,
-                                      int32_t i_height );
-
-static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_dst, int32_t i_dst_stride,
-                                 int32_t i_height )
-{
-    uint32_t u_loop_cnt, u_h4w;
-    v16u8 dst0;
-    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
-    v16i8 mask0, mask1, mask2;
-    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
-    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
-    v16i8 minus5b = __msa_ldi_b( -5 );
-    v16i8 plus20b = __msa_ldi_b( 20 );
-
-    u_h4w = i_height % 4;
-    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );
-
-    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
-    {
-        LD_SB2( p_src, 8, src0, src1 );
-        p_src += i_src_stride;
-        LD_SB2( p_src, 8, src2, src3 );
-        p_src += i_src_stride;
-
-        XORI_B4_128_SB( src0, src1, src2, src3 );
-        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
-        VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 );
-        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
-        VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 );
-        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
-        VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 );
-        HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 );
-        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
-                      minus5b, res0, res1, res2, res3 );
-        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
-                      plus20b, res0, res1, res2, res3 );
-
-        LD_SB2( p_src, 8, src4, src5 );
-        p_src += i_src_stride;
-        LD_SB2( p_src, 8, src6, src7 );
-        p_src += i_src_stride;
-
-        XORI_B4_128_SB( src4, src5, src6, src7 );
-        VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 );
-        VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 );
-        VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 );
-        VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 );
-        VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 );
-        VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 );
-        HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 );
-        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
-                      minus5b, res4, res5, res6, res7 );
-        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
-                      plus20b, res4, res5, res6, res7 );
-        SRARI_H4_SH( res0, res1, res2, res3, 5 );
-        SRARI_H4_SH( res4, res5, res6, res7, 5 );
-        SAT_SH4_SH( res0, res1, res2, res3, 7 );
-        SAT_SH4_SH( res4, res5, res6, res7, 7 );
-        PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
-                     vec0, vec1, vec2, vec3 );
-        XORI_B4_128_SB( vec0, vec1, vec2, vec3 );
-
-        ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-
-    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
-    {
-        LD_SB2( p_src, 8, src0, src1 );
-        p_src += i_src_stride;
-
-        XORI_B2_128_SB( src0, src1 );
-        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
-        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
-        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
-        res0 = __msa_hadd_s_h( vec0, vec0 );
-        DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 );
-        res1 = __msa_hadd_s_h( vec3, vec3 );
-        DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 );
-        SRARI_H2_SH( res0, res1, 5 );
-        SAT_SH2_SH( res0, res1, 7 );
-        dst0 = PCKEV_XORI128_UB( res0, res1 );
-        ST_UB( dst0, p_dst );
-        p_dst += i_dst_stride;
-    }
-}
-
-static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_dst, int32_t i_dst_stride,
-                                 int32_t i_height )
-{
-    uint32_t u_loop_cnt, u_h4w;
-    const int16_t i_filt_const0 = 0xfb01;
-    const int16_t i_filt_const1 = 0x1414;
-    const int16_t i_filt_const2 = 0x1fb;
-    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
-    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
-    v16i8 src65_l, src87_l;
-    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-    v16u8 res0, res1, res2, res3;
-    v16i8 filt0, filt1, filt2;
-
-    u_h4w = i_height % 4;
-    filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 );
-    filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 );
-    filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 );
-
-    LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
-    p_src += ( 5 * i_src_stride );
-
-    XORI_B5_128_SB( src0, src1, src2, src3, src4 );
-    ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
-                src10_r, src21_r, src32_r, src43_r );
-    ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
-                src10_l, src21_l, src32_l, src43_l );
-
-    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
-    {
-        LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 );
-        p_src += ( 4 * i_src_stride );
-
-        XORI_B4_128_SB( src5, src6, src7, src8 );
-        ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
-                    src54_r, src65_r, src76_r, src87_r );
-        ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
-                    src54_l, src65_l, src76_l, src87_l );
-        out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
-                               filt0, filt1, filt2 );
-        out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r,
-                               filt0, filt1, filt2 );
-        out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r,
-                               filt0, filt1, filt2 );
-        out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r,
-                               filt0, filt1, filt2 );
-        out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
-                               filt0, filt1, filt2 );
-        out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l,
-                               filt0, filt1, filt2 );
-        out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l,
-                               filt0, filt1, filt2 );
-        out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l,
-                               filt0, filt1, filt2 );
-        SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 );
-        SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 );
-        SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 );
-        SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 );
-        PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
-                     out3_r, res0, res1, res2, res3 );
-        XORI_B4_128_UB( res0, res1, res2, res3 );
-
-        ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-
-        src10_r = src54_r;
-        src32_r = src76_r;
-        src21_r = src65_r;
-        src43_r = src87_r;
-        src10_l = src54_l;
-        src32_l = src76_l;
-        src21_l = src65_l;
-        src43_l = src87_l;
-        src4 = src8;
-    }
-
-    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
-    {
-        src5 = LD_SB( p_src );
-        p_src += ( i_src_stride );
-        src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 );
-        ILVRL_B2_SB( src5, src4, src54_r, src54_l );
-        out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
-                               filt0, filt1, filt2 );
-        out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
-                               filt0, filt1, filt2 );
-        SRARI_H2_SH( out0_r, out0_l, 5 );
-        SAT_SH2_SH( out0_r, out0_l, 7 );
-        out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r );
-        res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 );
-        ST_UB( res0, p_dst );
-        p_dst += i_dst_stride;
-
-        src10_r = src21_r;
-        src21_r = src32_r;
-        src32_r = src43_r;
-        src43_r = src54_r;
-
-        src10_l = src21_l;
-        src21_l = src32_l;
-        src32_l = src43_l;
-        src43_l = src54_l;
-
-        src4 = src5;
-    }
-}
-
-static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_dst, int32_t i_dst_stride,
-                                 int32_t i_height )
-{
-    uint32_t u_loop_cnt, u_h4w;
-    uint64_t u_out0;
-    v16i8 tmp0;
-    v16i8 src0, src1, src2, src3, src4;
-    v16i8 mask0, mask1, mask2;
-    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
-    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
-    v8i16 dst0, dst1, dst2, dst3;
-    v16u8 out0, out1;
-
-    u_h4w = i_height % 4;
-    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );
-
-    LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
-    XORI_B5_128_SB( src0, src1, src2, src3, src4 );
-    p_src += ( 5 * i_src_stride );
-
-    hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
-    hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
-    hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
-    hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
-    hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 );
-
-    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
-    {
-        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        XORI_B4_128_SB( src0, src1, src2, src3 );
-        p_src += ( 4 * i_src_stride );
-
-        hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
-        hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
-        hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
-        hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
-        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2,
-                                                hz_out3, hz_out4, hz_out5 );
-        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3,
-                                                hz_out4, hz_out5, hz_out6 );
-        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4,
-                                                hz_out5, hz_out6, hz_out7 );
-        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5,
-                                                hz_out6, hz_out7, hz_out8 );
-        out0 = PCKEV_XORI128_UB( dst0, dst1 );
-        out1 = PCKEV_XORI128_UB( dst2, dst3 );
-        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
-
-        p_dst += ( 4 * i_dst_stride );
-        hz_out3 = hz_out7;
-        hz_out1 = hz_out5;
-        hz_out5 = hz_out4;
-        hz_out4 = hz_out8;
-        hz_out2 = hz_out6;
-        hz_out0 = hz_out5;
-    }
-
-    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
-    {
-        src0 = LD_SB( p_src );
-        p_src += i_src_stride;
-
-        src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 );
-        hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
-
-        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1,
-                                                hz_out2, hz_out3,
-                                                hz_out4, hz_out5 );
-
-        tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) );
-        tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 );
-        u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
-        SD( u_out0, p_dst );
-        p_dst += i_dst_stride;
-
-        hz_out0 = hz_out1;
-        hz_out1 = hz_out2;
-        hz_out2 = hz_out3;
-        hz_out3 = hz_out4;
-        hz_out4 = hz_out5;
-    }
-}
-
-static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride,
-                                  uint8_t *p_dst, int32_t i_dst_stride,
-                                  int32_t i_height )
-{
-    uint32_t u_multiple8_cnt;
-
-    for( u_multiple8_cnt = 2; u_multiple8_cnt--; )
-    {
-        avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride,
-                             i_height );
-        p_src += 8;
-        p_dst += 8;
-    }
-}
-
-static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
-                                               int32_t i_src_stride,
-                                               uint8_t *p_dst_u,
-                                               uint8_t *p_dst_v,
-                                               int32_t i_dst_stride,
-                                               uint32_t u_coef_hor0,
-                                               uint32_t u_coef_hor1,
-                                               uint32_t u_coef_ver0,
-                                               uint32_t u_coef_ver1 )
-{
-    uint16_t u_out0, u_out1, u_out2, u_out3;
-    v16u8 src0, src1, src2, src3, src4;
-    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
-    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
-    v16i8 mask;
-    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
-    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
-    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
-    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
-    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
-    v8i16 res0, res1;
-
-    mask = LD_SB( &pu_chroma_mask_arr[16] );
-
-    LD_UB3( p_src, i_src_stride, src0, src1, src2 );
-    VSHF_B2_UB( src0, src1, src1, src2,
-                ( mask + 1 ), ( mask + 1 ), src3, src4 );
-    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
-    DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
-                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                 res_hz3 );
-    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-          res_vt3 );
-    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
-    SRARI_H2_UH( res_vt0, res_vt2, 6 );
-    SAT_UH2_UH( res_vt0, res_vt2, 7 );
-    PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
-
-    u_out0 = __msa_copy_u_h( res0, 0 );
-    u_out1 = __msa_copy_u_h( res0, 2 );
-    u_out2 = __msa_copy_u_h( res1, 0 );
-    u_out3 = __msa_copy_u_h( res1, 2 );
-
-    SH( u_out0, p_dst_u );
-    p_dst_u += i_dst_stride;
-    SH( u_out1, p_dst_u );
-
-    SH( u_out2, p_dst_v );
-    p_dst_v += i_dst_stride;
-    SH( u_out3, p_dst_v );
-}
-
-static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
-                                               int32_t i_src_stride,
-                                               uint8_t *p_dst_u,
-                                               uint8_t *p_dst_v,
-                                               int32_t i_dst_stride,
-                                               uint32_t u_coef_hor0,
-                                               uint32_t u_coef_hor1,
-                                               uint32_t u_coef_ver0,
-                                               uint32_t u_coef_ver1 )
-{
-    uint16_t u_out0, u_out1, u_out2, u_out3;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
-    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
-    v16i8 mask;
-    v8i16 res0, res1;
-    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
-    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
-    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
-    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
-    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
-
-    mask = LD_SB( &pu_chroma_mask_arr[16] );
-
-    LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
-
-    VSHF_B2_UB( src0, src1, src1, src2,
-                ( mask + 1 ), ( mask + 1 ), src5, src6 );
-    VSHF_B2_UB( src2, src3, src3, src4,
-                ( mask + 1 ), ( mask + 1 ), src7, src8 );
-    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
-    VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
-    DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
-                 coeff_hz_vec, coeff_hz_vec, res_hz0,
-                 res_hz1, res_hz2, res_hz3 );
-    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-          res_vt3 );
-    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
-    SRARI_H2_UH( res_vt0, res_vt1, 6 );
-    SAT_UH2_UH( res_vt0, res_vt1, 7 );
-    PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
-
-    u_out0 = __msa_copy_u_h( res0, 0 );
-    u_out1 = __msa_copy_u_h( res0, 2 );
-    u_out2 = __msa_copy_u_h( res1, 0 );
-    u_out3 = __msa_copy_u_h( res1, 2 );
-
-    SH( u_out0, p_dst_u );
-    p_dst_u += i_dst_stride;
-    SH( u_out1, p_dst_u );
-    p_dst_u += i_dst_stride;
-    SH( u_out2, p_dst_u );
-    p_dst_u += i_dst_stride;
-    SH( u_out3, p_dst_u );
-
-    DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
-                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                 res_hz3 );
-    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-          res_vt3 );
-    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
-    SRARI_H2_UH( res_vt0, res_vt1, 6 );
-    SAT_UH2_UH( res_vt0, res_vt1, 7 );
-    PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
-
-    u_out0 = __msa_copy_u_h( res0, 0 );
-    u_out1 = __msa_copy_u_h( res0, 2 );
-    u_out2 = __msa_copy_u_h( res1, 0 );
-    u_out3 = __msa_copy_u_h( res1, 2 );
-
-    SH( u_out0, p_dst_v );
-    p_dst_v += i_dst_stride;
-    SH( u_out1, p_dst_v );
-    p_dst_v += i_dst_stride;
-    SH( u_out2, p_dst_v );
-    p_dst_v += i_dst_stride;
-    SH( u_out3, p_dst_v );
-}
-
-static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
-                                              int32_t i_src_stride,
-                                              uint8_t *p_dst_u,
-                                              uint8_t *p_dst_v,
-                                              int32_t i_dst_stride,
-                                              uint32_t u_coef_hor0,
-                                              uint32_t u_coef_hor1,
-                                              uint32_t u_coef_ver0,
-                                              uint32_t u_coef_ver1,
-                                              int32_t i_height )
-{
-    if( 2 == i_height )
-    {
-        avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
-                                           p_dst_u, p_dst_v, i_dst_stride,
-                                           u_coef_hor0, u_coef_hor1,
-                                           u_coef_ver0, u_coef_ver1 );
-    }
-    else if( 4 == i_height )
-    {
-        avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
-                                           p_dst_u, p_dst_v, i_dst_stride,
-                                           u_coef_hor0, u_coef_hor1,
-                                           u_coef_ver0, u_coef_ver1 );
-    }
-}
-
-static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
-                                               int32_t i_src_stride,
-                                               uint8_t *p_dst_u,
-                                               uint8_t *p_dst_v,
-                                               int32_t i_dst_stride,
-                                               uint32_t u_coef_hor0,
-                                               uint32_t u_coef_hor1,
-                                               uint32_t u_coef_ver0,
-                                               uint32_t u_coef_ver1 )
-{
-    uint32_t u_out0, u_out1, u_out2, u_out3;
-    v16u8 src0, src1, src2, src3, src4;
-    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
-    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
-    v16i8 mask;
-    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
-    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
-    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
-    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
-    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
-    v4i32 res0, res1;
-
-    mask = LD_SB( &pu_chroma_mask_arr[16] );
-
-    LD_UB3( p_src, i_src_stride, src0, src1, src2 );
-    VSHF_B2_UB( src0, src1, src1, src2,
-                ( mask + 1 ), ( mask + 1 ), src3, src4 );
-    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
-    DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
-                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                 res_hz3 );
-    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-          res_vt3 );
-    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
-    SRARI_H2_UH( res_vt0, res_vt2, 6 );
-    SAT_UH2_UH( res_vt0, res_vt2, 7 );
-    PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
-
-    u_out0 = __msa_copy_u_w( res0, 0 );
-    u_out1 = __msa_copy_u_w( res0, 1 );
-    u_out2 = __msa_copy_u_w( res1, 0 );
-    u_out3 = __msa_copy_u_w( res1, 1 );
-    SW( u_out0, p_dst_u );
-    p_dst_u += i_dst_stride;
-    SW( u_out1, p_dst_u );
-    SW( u_out2, p_dst_v );
-    p_dst_v += i_dst_stride;
-    SW( u_out3, p_dst_v );
-}
-
-static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
-                                                  int32_t i_src_stride,
-                                                  uint8_t *p_dst_u,
-                                                  uint8_t *p_dst_v,
-                                                  int32_t i_dst_stride,
-                                                  uint32_t u_coef_hor0,
-                                                  uint32_t u_coef_hor1,
-                                                  uint32_t u_coef_ver0,
-                                                  uint32_t u_coef_ver1,
-                                                  int32_t i_height )
-{
-    uint32_t u_row;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
-    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
-    v16i8 mask;
-    v4i32 res0, res1;
-    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
-    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
-    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
-    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
-    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
-
-    mask = LD_SB( &pu_chroma_mask_arr[16] );
-
-    src0 = LD_UB( p_src );
-    p_src += i_src_stride;
-
-    for( u_row = ( i_height >> 2 ); u_row--; )
-    {
-        LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
-        p_src += ( 4 * i_src_stride );
-
-        VSHF_B2_UB( src0, src1, src1, src2,
-                    ( mask + 1 ), ( mask + 1 ), src5, src6 );
-        VSHF_B2_UB( src2, src3, src3, src4,
-                    ( mask + 1 ), ( mask + 1 ), src7, src8 );
-        VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
-        VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
-        DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
-                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                     res_hz3 );
-        MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-              res_vt3 );
-        ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
-        SRARI_H2_UH( res_vt0, res_vt1, 6 );
-        SAT_UH2_UH( res_vt0, res_vt1, 7 );
-        PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
-
-        ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
-        p_dst_u += ( 4 * i_dst_stride );
-
-        DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
-                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                     res_hz3 );
-        MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-              res_vt3 );
-        ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
-        SRARI_H2_UH( res_vt0, res_vt1, 6 );
-        SAT_UH2_UH( res_vt0, res_vt1, 7 );
-        PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
-
-        ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
-        p_dst_v += ( 4 * i_dst_stride );
-        src0 = src4;
-    }
-}
-
-static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
-                                              int32_t i_src_stride,
-                                              uint8_t *p_dst_u,
-                                              uint8_t *p_dst_v,
-                                              int32_t i_dst_stride,
-                                              uint32_t u_coef_hor0,
-                                              uint32_t u_coef_hor1,
-                                              uint32_t u_coef_ver0,
-                                              uint32_t u_coef_ver1,
-                                              int32_t i_height )
-{
-    if( 2 == i_height )
-    {
-        avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
-                                           p_dst_u, p_dst_v, i_dst_stride,
-                                           u_coef_hor0, u_coef_hor1,
-                                           u_coef_ver0, u_coef_ver1 );
-    }
-    else
-    {
-        avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
-                                              p_dst_u, p_dst_v, i_dst_stride,
-                                              u_coef_hor0, u_coef_hor1,
-                                              u_coef_ver0, u_coef_ver1,
-                                              i_height );
-    }
-}
-
-static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
-                                              int32_t i_src_stride,
-                                              uint8_t *p_dst_u,
-                                              uint8_t *p_dst_v,
-                                              int32_t i_dst_stride,
-                                              uint32_t u_coef_hor0,
-                                              uint32_t u_coef_hor1,
-                                              uint32_t u_coef_ver0,
-                                              uint32_t u_coef_ver1,
-                                              int32_t i_height )
-{
-    uint32_t u_row;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
-    v16u8 src10, src11, src12, src13, src14;
-    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
-    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
-    v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
-    v16i8 coeff_hz_vec0, coeff_hz_vec1;
-    v16i8 tmp0, tmp1;
-    v16u8 coeff_hz_vec;
-    v8u16 coeff_vt_vec0, coeff_vt_vec1;
-
-    coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
-    coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
-    coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
-    coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
-    coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
-
-    LD_UB2( p_src, 16, src0, src13 );
-    p_src += i_src_stride;
-
-    VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
-    DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );
-
-    for( u_row = ( i_height >> 2 ); u_row--; )
-    {
-        LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
-        LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
-        p_src += ( 4 * i_src_stride );
-
-        VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
-        VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
-        DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
-                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
-                     res_hz4 );
-        MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
-              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-              res_vt3 );
-
-        res_vt0 += ( res_hz0 * coeff_vt_vec1 );
-        res_vt1 += ( res_hz1 * coeff_vt_vec1 );
-        res_vt2 += ( res_hz2 * coeff_vt_vec1 );
-        res_vt3 += ( res_hz3 * coeff_vt_vec1 );
-
-        SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
-        SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
-        PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
-        ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
-        p_dst_u += ( 4 * i_dst_stride );
-        res_hz0 = res_hz4;
-
-        VSHF_B2_UB( src1, src5, src2, src6,
-                    ( mask + 1 ), ( mask + 1 ), src5, src6 );
-        VSHF_B2_UB( src3, src7, src4, src8,
-                    ( mask + 1 ), ( mask + 1 ), src7, src8 );
-        DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
-                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
-                     res_hz4 );
-        MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
-              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-              res_vt3 );
-
-        res_vt0 += ( res_hz5 * coeff_vt_vec1 );
-        res_vt1 += ( res_hz1 * coeff_vt_vec1 );
-        res_vt2 += ( res_hz2 * coeff_vt_vec1 );
-        res_vt3 += ( res_hz3 * coeff_vt_vec1 );
-
-        SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
-        SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
-        PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
-        ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
-        p_dst_v += ( 4 * i_dst_stride );
-        res_hz5 = res_hz4;
-    }
-}
-
-static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
-                                     uint8_t *p_dst, int32_t i_dst_stride,
-                                     int32_t i_log2_denom, int32_t i_weight,
-                                     int32_t i_offset_in )
-{
-    uint32_t u_load0, u_load1, u_out0, u_out1;
-    v16u8 zero = { 0 };
-    v16u8 src0, src1;
-    v4i32 dst0, dst1;
-    v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
-    v8i16 vec0, vec1;
-
-    i_offset_in <<= ( i_log2_denom );
-
-    if( i_log2_denom )
-    {
-        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
-    }
-
-    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
-
-    u_load0 = LW( p_src );
-    p_src += i_src_stride;
-    u_load1 = LW( p_src );
-
-    src0 = ( v16u8 ) __msa_fill_w( u_load0 );
-    src1 = ( v16u8 ) __msa_fill_w( u_load1 );
-
-    ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
-    MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
-    ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
-    MAXI_SH2_SH( vec0, vec1, 0 );
-
-    tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
-    tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );
-
-    SAT_UH2_UH( tp0, tp1, 7 );
-    PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );
-
-    u_out0 = __msa_copy_u_w( dst0, 0 );
-    u_out1 = __msa_copy_u_w( dst1, 0 );
-    SW( u_out0, p_dst );
-    p_dst += i_dst_stride;
-    SW( u_out1, p_dst );
-}
-
-static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
-                                             int32_t i_src_stride,
-                                             uint8_t *p_dst,
-                                             int32_t i_dst_stride,
-                                             int32_t i_height,
-                                             int32_t i_log2_denom,
-                                             int32_t i_weight,
-                                             int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    uint32_t u_load0, u_load1, u_load2, u_load3;
-    v16u8 zero = { 0 };
-    v16u8 src0, src1, src2, src3;
-    v8u16 temp0, temp1, temp2, temp3;
-    v8u16 wgt, denom, offset;
-
-    i_offset_in <<= ( i_log2_denom );
-
-    if( i_log2_denom )
-    {
-        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
-    }
-
-    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
-
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
-        p_src += 4 * i_src_stride;
-
-        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
-        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
-        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
-        src3 = ( v16u8 ) __msa_fill_w( u_load3 );
-
-        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
-                    temp0, temp1, temp2, temp3 );
-        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
-              temp0, temp1, temp2, temp3 );
-        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
-                     temp0, temp1, temp2, temp3 );
-        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
-        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
-        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
-        PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                        uint8_t *p_dst, int32_t i_dst_stride,
-                                        int32_t i_height, int32_t i_log2_denom,
-                                        int32_t i_weight, int32_t i_offset_in )
-{
-    if( 2 == i_height )
-    {
-        avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
-                                 i_log2_denom, i_weight, i_offset_in );
-    }
-    else
-    {
-        avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
-                                         p_dst, i_dst_stride,
-                                         i_height, i_log2_denom,
-                                         i_weight, i_offset_in );
-    }
-}
-
-static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                        uint8_t *p_dst, int32_t i_dst_stride,
-                                        int32_t i_height, int32_t i_log2_denom,
-                                        int32_t i_weight, int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    v16u8 zero = { 0 };
-    v16u8 src0, src1, src2, src3;
-    v8u16 temp0, temp1, temp2, temp3;
-    v8u16 wgt, denom, offset;
-    v16i8 out0, out1;
-
-    i_offset_in <<= ( i_log2_denom );
-
-    if( i_log2_denom )
-    {
-        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
-    }
-
-    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
-
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += 4 * i_src_stride;
-
-        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
-                    temp0, temp1, temp2, temp3 );
-        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
-              temp0, temp1, temp2, temp3 );
-        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
-                     temp0, temp1, temp2, temp3 );
-        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
-        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
-        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
-        PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
-        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                         uint8_t *p_dst, int32_t i_dst_stride,
-                                         int32_t i_height, int32_t i_log2_denom,
-                                         int32_t i_weight, int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    v16i8 zero = { 0 };
-    v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
-    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-    v8u16 wgt, denom, offset;
-
-    i_offset_in <<= ( i_log2_denom );
-
-    if( i_log2_denom )
-    {
-        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
-    }
-
-    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
-
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += 4 * i_src_stride;
-
-        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
-                    temp0, temp2, temp4, temp6 );
-        ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
-                    temp1, temp3, temp5, temp7 );
-        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
-              temp0, temp1, temp2, temp3 );
-        MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
-              temp4, temp5, temp6, temp7 );
-        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
-                     temp0, temp1, temp2, temp3 );
-        ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
-                     temp4, temp5, temp6, temp7 );
-        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
-        MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
-        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
-        SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
-        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
-        SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
-        PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
-                     dst0, dst1, dst2, dst3 );
-
-        ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
-        p_dst += 4 * i_dst_stride;
-    }
-}
-
-static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
-                                          int32_t i_src1_stride,
-                                          uint8_t *p_src2_in,
-                                          int32_t i_src2_stride,
-                                          uint8_t *p_dst,
-                                          int32_t i_dst_stride,
-                                          int32_t i_log2_denom,
-                                          int32_t i_src1_weight,
-                                          int32_t i_src2_weight,
-                                          int32_t i_offset_in )
-{
-    uint32_t u_load0, u_load1, u_out0, u_out1;
-    v8i16 src1_wgt, src2_wgt;
-    v16u8 in0, in1, in2, in3;
-    v8i16 temp0, temp1, temp2, temp3;
-    v16i8 zero = { 0 };
-    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
-
-    src1_wgt = __msa_fill_h( i_src1_weight );
-    src2_wgt = __msa_fill_h( i_src2_weight );
-    u_load0 = LW( p_src1_in );
-    u_load1 = LW( p_src1_in + i_src1_stride );
-    in0 = ( v16u8 ) __msa_fill_w( u_load0 );
-    in1 = ( v16u8 ) __msa_fill_w( u_load1 );
-    u_load0 = LW( p_src2_in );
-    u_load1 = LW( p_src2_in + i_src2_stride );
-    in2 = ( v16u8 ) __msa_fill_w( u_load0 );
-    in3 = ( v16u8 ) __msa_fill_w( u_load1 );
-    ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
-                temp0, temp1, temp2, temp3 );
-    temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
-    temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
-    SRAR_H2_SH( temp0, temp1, denom );
-    CLIP_SH2_0_255( temp0, temp1 );
-    PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
-    u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
-    u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
-    SW( u_out0, p_dst );
-    p_dst += i_dst_stride;
-    SW( u_out1, p_dst );
-}
-
-static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
-                                                  int32_t i_src1_stride,
-                                                  uint8_t *p_src2_in,
-                                                  int32_t i_src2_stride,
-                                                  uint8_t *p_dst,
-                                                  int32_t i_dst_stride,
-                                                  int32_t i_height,
-                                                  int32_t i_log2_denom,
-                                                  int32_t i_src1_weight,
-                                                  int32_t i_src2_weight,
-                                                  int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    uint32_t u_load0, u_load1, u_load2, u_load3;
-    v8i16 src1_wgt, src2_wgt;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-    v16i8 zero = { 0 };
-    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
-
-    src1_wgt = __msa_fill_h( i_src1_weight );
-    src2_wgt = __msa_fill_h( i_src2_weight );
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
-        p_src1_in += ( 4 * i_src1_stride );
-        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
-        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
-        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
-        src3 = ( v16u8 ) __msa_fill_w( u_load3 );
-        LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
-        p_src2_in += ( 4 * i_src2_stride );
-        src4 = ( v16u8 ) __msa_fill_w( u_load0 );
-        src5 = ( v16u8 ) __msa_fill_w( u_load1 );
-        src6 = ( v16u8 ) __msa_fill_w( u_load2 );
-        src7 = ( v16u8 ) __msa_fill_w( u_load3 );
-        ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
-                    temp0, temp1, temp2, temp3 );
-        ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
-                    temp4, temp5, temp6, temp7 );
-        temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
-        temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
-        temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
-        temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
-        SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
-        CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
-        PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
-                                             int32_t i_src1_stride,
-                                             uint8_t *p_src2_in,
-                                             int32_t i_src2_stride,
-                                             uint8_t *p_dst,
-                                             int32_t i_dst_stride,
-                                             int32_t i_height,
-                                             int32_t i_log2_denom,
-                                             int32_t i_src1_weight,
-                                             int32_t i_src2_weight,
-                                             int32_t i_offset_in )
-{
-    if( 2 == i_height )
-    {
-        avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
-                                      p_src2_in, i_src2_stride,
-                                      p_dst, i_dst_stride,
-                                      i_log2_denom, i_src1_weight,
-                                      i_src2_weight, i_offset_in );
-    }
-    else
-    {
-        avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
-                                              p_src2_in, i_src2_stride,
-                                              p_dst, i_dst_stride,
-                                              i_height, i_log2_denom,
-                                              i_src1_weight, i_src2_weight,
-                                              i_offset_in );
-    }
-}
-
-static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
-                                             int32_t i_src1_stride,
-                                             uint8_t *p_src2_in,
-                                             int32_t i_src2_stride,
-                                             uint8_t *p_dst,
-                                             int32_t i_dst_stride,
-                                             int32_t i_height,
-                                             int32_t i_log2_denom,
-                                             int32_t i_src1_weight,
-                                             int32_t i_src2_weight,
-                                             int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    v8i16 src1_wgt, src2_wgt;
-    v16u8 src0, src1, src2, src3;
-    v16u8 dst0, dst1, dst2, dst3;
-    v8i16 temp0, temp1, temp2, temp3;
-    v8i16 res0, res1, res2, res3;
-    v16i8 zero = { 0 };
-    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
-
-    src1_wgt = __msa_fill_h( i_src1_weight );
-    src2_wgt = __msa_fill_h( i_src2_weight );
-
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
-        p_src1_in += ( 4 * i_src1_stride );
-        LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
-        p_src2_in += ( 4 * i_src2_stride );
-        ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
-                    temp0, temp1, temp2, temp3 );
-        ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
-                    res0, res1, res2, res3 );
-        res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
-        res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
-        res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
-        res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
-        SRAR_H4_SH( res0, res1, res2, res3, denom );
-        CLIP_SH4_0_255( res0, res1, res2, res3 );
-        PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
-                     dst0, dst1, dst2, dst3 );
-        ST8x1_UB( dst0, p_dst );
-        p_dst += i_dst_stride;
-        ST8x1_UB( dst1, p_dst );
-        p_dst += i_dst_stride;
-        ST8x1_UB( dst2, p_dst );
-        p_dst += i_dst_stride;
-        ST8x1_UB( dst3, p_dst );
-        p_dst += i_dst_stride;
-    }
-}
-
-static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
-                                              int32_t i_src1_stride,
-                                              uint8_t *p_src2_in,
-                                              int32_t i_src2_stride,
-                                              uint8_t *p_dst,
-                                              int32_t i_dst_stride,
-                                              int32_t i_height,
-                                              int32_t i_log2_denom,
-                                              int32_t i_src1_weight,
-                                              int32_t i_src2_weight,
-                                              int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    v8i16 src1_wgt, src2_wgt;
-    v16u8 src0, src1, src2, src3;
-    v16u8 dst0, dst1, dst2, dst3;
-    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
-    v16i8 zero = { 0 };
-    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
-
-    src1_wgt = __msa_fill_h( i_src1_weight );
-    src2_wgt = __msa_fill_h( i_src2_weight );
-
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
-        p_src1_in += ( 4 * i_src1_stride );
-        LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
-        p_src2_in += ( 4 * i_src2_stride );
-        ILVRL_B2_SH( zero, src0, temp1, temp0 );
-        ILVRL_B2_SH( zero, src1, temp3, temp2 );
-        ILVRL_B2_SH( zero, src2, temp5, temp4 );
-        ILVRL_B2_SH( zero, src3, temp7, temp6 );
-        ILVRL_B2_SH( zero, dst0, res1, res0 );
-        ILVRL_B2_SH( zero, dst1, res3, res2 );
-        ILVRL_B2_SH( zero, dst2, res5, res4 );
-        ILVRL_B2_SH( zero, dst3, res7, res6 );
-        res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
-        res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
-        res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
-        res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
-        res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
-        res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
-        res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
-        res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
-        SRAR_H4_SH( res0, res1, res2, res3, denom );
-        SRAR_H4_SH( res4, res5, res6, res7, denom );
-        CLIP_SH4_0_255( res0, res1, res2, res3 );
-        CLIP_SH4_0_255( res4, res5, res6, res7 );
-        PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
-                     dst0, dst1, dst2, dst3 );
-        ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
-        p_dst += 4 * i_dst_stride;
-    }
-}
-
-static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
-                                       int32_t i_src1_stride,
-                                       uint8_t *p_src2_in,
-                                       int32_t i_src2_stride,
-                                       uint8_t *p_dst, int32_t i_dst_stride,
-                                       int32_t i_log2_denom,
-                                       int32_t i_src1_weight,
-                                       int32_t i_src2_weight,
-                                       int32_t i_offset_in )
-{
-    uint32_t u_load0, u_load1, u_out0, u_out1;
-    v16u8 src1_wgt, src2_wgt, wgt;
-    v16i8 in0, in1, in2, in3;
-    v8u16 temp0, temp1, denom, offset;
-
-    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
-
-    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
-    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
-
-    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
-
-    u_load0 = LW( p_src1_in );
-    u_load1 = LW( p_src1_in + i_src1_stride );
-    in0 = ( v16i8 ) __msa_fill_w( u_load0 );
-    in1 = ( v16i8 ) __msa_fill_w( u_load1 );
-
-    u_load0 = LW( p_src2_in );
-    u_load1 = LW( p_src2_in + i_src2_stride );
-    in2 = ( v16i8 ) __msa_fill_w( u_load0 );
-    in3 = ( v16i8 ) __msa_fill_w( u_load1 );
-
-    ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );
-
-    temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
-    temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );
-    temp0 >>= denom;
-    temp1 >>= denom;
-    MAXI_SH2_UH( temp0, temp1, 0 );
-    SAT_UH2_UH( temp0, temp1, 7 );
-    PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );
-
-    u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
-    u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
-    SW( u_out0, p_dst );
-    p_dst += i_dst_stride;
-    SW( u_out1, p_dst );
-}
-
-static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
-                                               int32_t i_src1_stride,
-                                               uint8_t *p_src2_in,
-                                               int32_t i_src2_stride,
-                                               uint8_t *p_dst,
-                                               int32_t i_dst_stride,
-                                               int32_t i_height,
-                                               int32_t i_log2_denom,
-                                               int32_t i_src1_weight,
-                                               int32_t i_src2_weight,
-                                               int32_t i_offset_in )
-{
-    uint8_t u_cnt;
-    uint32_t u_load0, u_load1, u_load2, u_load3;
-    v16u8 src1_wgt, src2_wgt, wgt;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v16u8 temp0, temp1, temp2, temp3;
-    v8u16 res0, res1, res2, res3;
-    v8u16 denom, offset;
-
-    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
-
-    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
-    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
-
-    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
-
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
-        p_src1_in += ( 4 * i_src1_stride );
-
-        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
-        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
-        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
-        src3 = ( v16u8 ) __msa_fill_w( u_load3 );
-
-        LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
-        p_src2_in += ( 4 * i_src2_stride );
-
-        src4 = ( v16u8 ) __msa_fill_w( u_load0 );
-        src5 = ( v16u8 ) __msa_fill_w( u_load1 );
-        src6 = ( v16u8 ) __msa_fill_w( u_load2 );
-        src7 = ( v16u8 ) __msa_fill_w( u_load3 );
-
-        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                    temp0, temp1, temp2, temp3 );
-        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
-                     res0, res1, res2, res3 );
-        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
-              res0, res1, res2, res3 );
-        SRA_4V( res0, res1, res2, res3, denom );
-        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
-        SAT_UH4_UH( res0, res1, res2, res3, 7 );
-        PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-/*
- * Width-4 bi-weighted prediction entry point. It only selects a kernel:
- * a height of exactly 2 goes to the 4x2 kernel, every other height is
- * forwarded, together with the row count, to the 4x4-multiple kernel.
- * All remaining arguments are passed through unchanged.
- */
-static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
-                                          int32_t i_src1_stride,
-                                          uint8_t *p_src2_in,
-                                          int32_t i_src2_stride,
-                                          uint8_t *p_dst,
-                                          int32_t i_dst_stride,
-                                          int32_t i_height,
-                                          int32_t i_log2_denom,
-                                          int32_t i_src1_weight,
-                                          int32_t i_src2_weight,
-                                          int32_t i_offset_in )
-{
-    if( i_height != 2 )
-    {
-        /* General case: the kernel needs the row count. */
-        avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
-                                           p_src2_in, i_src2_stride,
-                                           p_dst, i_dst_stride,
-                                           i_height, i_log2_denom,
-                                           i_src1_weight,
-                                           i_src2_weight, i_offset_in );
-        return;
-    }
-
-    /* Two-row special case: the kernel takes no height argument. */
-    avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
-                               p_src2_in, i_src2_stride,
-                               p_dst, i_dst_stride,
-                               i_log2_denom, i_src1_weight,
-                               i_src2_weight, i_offset_in );
-}
-
-/*
- * Bi-directional weighted prediction for a block 8 pixels wide, processed
- * four rows per loop iteration.
- *
- * p_src1_in, i_src1_stride   first source block and its row stride
- * p_src2_in, i_src2_stride   second source block and its row stride
- * p_dst, i_dst_stride        destination block and its row stride
- * i_height                   row count; i_height / 4 iterations are run, so
- *                             a remainder of 1-3 rows is never written
- * i_log2_denom               the weighted sums are shifted right by
- *                             i_log2_denom + 1
- * i_src1_weight,
- * i_src2_weight              per-source weights, replicated as bytes
- * i_offset_in                offset, turned into the additive term
- *                             ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom
- *
- * NOTE(review): the helper macros (LD_UB4, ILVR_B4_UB, DOTP_UB4_UH, SRA_4V,
- * MAXI_SH4_UH, SAT_UH4_UH, ...) are defined outside this chunk; the inline
- * notes describe what their names and usage suggest - confirm against the
- * macro definitions.
- */
-static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
-                                          int32_t i_src1_stride,
-                                          uint8_t *p_src2_in,
-                                          int32_t i_src2_stride,
-                                          uint8_t *p_dst,
-                                          int32_t i_dst_stride,
-                                          int32_t i_height,
-                                          int32_t i_log2_denom,
-                                          int32_t i_src1_weight,
-                                          int32_t i_src2_weight,
-                                          int32_t i_offset_in )
-{
-    uint8_t u_cnt; /* 8-bit counter: i_height / 4 is truncated to 0..255 */
-    v16u8 src1_wgt, src2_wgt, wgt;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v16u8 temp0, temp1, temp2, temp3;
-    v8u16 res0, res1, res2, res3;
-    v8u16 denom, offset;
-    v16i8 out0, out1;
-    /* Build the additive term: set bit 0 of ( offset + 1 ), then scale it. */
-    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
-    /* Replicate the scalar parameters across all vector lanes. */
-    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
-    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
-    /* Merge both weight vectors into one byte-interleaved weight vector. */
-    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
-    /* One iteration per group of four rows. */
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
-        p_src1_in += ( 4 * i_src1_stride );
-
-        LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
-        p_src2_in += ( 4 * i_src2_stride );
-        /* Pair each row of source 2 with the same row of source 1. */
-        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                    temp0, temp1, temp2, temp3 );
-        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
-                     res0, res1, res2, res3 ); /* weighted sums (presumably 16-bit) */
-        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
-              res0, res1, res2, res3 );
-        SRA_4V( res0, res1, res2, res3, denom ); /* shift by i_log2_denom + 1 */
-        MAXI_SH4_UH( res0, res1, res2, res3, 0 ); /* presumably clamps negatives to 0 */
-        SAT_UH4_UH( res0, res1, res2, res3, 7 ); /* presumably saturates to 8 bits */
-        PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 ); /* two rows per output vector */
-        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
-        p_dst += 4 * i_dst_stride;
-    }
-}
-/*
- * Bi-directional weighted prediction for a block 16 pixels wide, processed
- * four rows per loop iteration.
- *
- * p_src1_in, i_src1_stride   first source block and its row stride
- * p_src2_in, i_src2_stride   second source block and its row stride
- * p_dst, i_dst_stride        destination block and its row stride
- * i_height                   row count; i_height / 4 iterations are run, so
- *                             a remainder of 1-3 rows is never written
- * i_log2_denom               the weighted sums are shifted right by
- *                             i_log2_denom + 1
- * i_src1_weight,
- * i_src2_weight              per-source weights, replicated as bytes
- * i_offset_in                offset, turned into the additive term
- *                             ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom
- *
- * NOTE(review): the helper macros (LD_UB4, ILVR_B4_UB, ILVL_B4_UB,
- * DOTP_UB4_UH, SRA_4V, MAXI_SH4_UH, SAT_UH4_UH, ...) are defined outside this
- * chunk; the inline notes describe what their names and usage suggest -
- * confirm against the macro definitions.
- */
-static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
-                                           int32_t i_src1_stride,
-                                           uint8_t *p_src2_in,
-                                           int32_t i_src2_stride,
-                                           uint8_t *p_dst,
-                                           int32_t i_dst_stride,
-                                           int32_t i_height,
-                                           int32_t i_log2_denom,
-                                           int32_t i_src1_weight,
-                                           int32_t i_src2_weight,
-                                           int32_t i_offset_in )
-{
-    uint8_t u_cnt; /* 8-bit counter: i_height / 4 is truncated to 0..255 */
-    v16u8 src1_wgt, src2_wgt, wgt;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
-    v8u16 denom, offset;
-    /* Build the additive term: set bit 0 of ( offset + 1 ), then scale it. */
-    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
-    /* Replicate the scalar parameters across all vector lanes. */
-    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
-    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
-    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
-    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
-    /* Merge both weight vectors into one byte-interleaved weight vector. */
-    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
-    /* One iteration per group of four rows. */
-    for( u_cnt = i_height / 4; u_cnt--; )
-    {
-        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
-        p_src1_in += ( 4 * i_src1_stride );
-
-        LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
-        p_src2_in += ( 4 * i_src2_stride );
-        /* Each row yields two interleaved vectors: even temps from ILVR, odd temps from ILVL. */
-        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                    temp0, temp2, temp4, temp6 );
-        ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                    temp1, temp3, temp5, temp7 );
-        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
-                     res0, res1, res2, res3 ); /* rows 0 and 1 */
-        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
-              res0, res1, res2, res3 );
-        DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
-                     res4, res5, res6, res7 ); /* rows 2 and 3 */
-        ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
-              res4, res5, res6, res7 );
-        SRA_4V( res0, res1, res2, res3, denom ); /* shift by i_log2_denom + 1 */
-        SRA_4V( res4, res5, res6, res7, denom );
-        MAXI_SH4_UH( res0, res1, res2, res3, 0 ); /* presumably clamps negatives to 0 */
-        MAXI_SH4_UH( res4, res5, res6, res7, 0 );
-        SAT_UH4_UH( res0, res1, res2, res3, 7 ); /* presumably saturates to 8 bits */
-        SAT_UH4_UH( res4, res5, res6, res7, 7 );
-        PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
-                     temp0, temp1, temp2, temp3 ); /* temp0..3 reused as the four output rows */
-        ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
-        p_dst += 4 * i_dst_stride;
-    }
-}
-
-/*
- * Copy a block 4 bytes wide, two rows per iteration.
- * i_height / 2 iterations are run and each row is moved as one uint32_t
- * through the LW / SW helpers, so an odd trailing row is not copied.
- */
-static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
-                             uint8_t *p_dst, int32_t i_dst_stride,
-                             int32_t i_height )
-{
-    uint32_t u_row_a, u_row_b;
-    int32_t i_pairs_left = i_height / 2;
-
-    while( i_pairs_left-- )
-    {
-        /* Both rows are read before either one is written. */
-        u_row_a = LW( p_src );
-        p_src += i_src_stride;
-        u_row_b = LW( p_src );
-        p_src += i_src_stride;
-
-        SW( u_row_a, p_dst );
-        p_dst += i_dst_stride;
-        SW( u_row_b, p_dst );
-        p_dst += i_dst_stride;
-    }
-}
-/*
- * Copy a block 8 bytes wide, picking the unroll factor from i_height:
- * 12 rows per iteration when i_height is a multiple of 12, otherwise 8,
- * otherwise 4, otherwise 2. An odd i_height matches no branch, so nothing
- * is copied in that case.
- *
- * Every row is loaded into a v16u8 vector; only doubleword element 0 is
- * extracted (__msa_copy_u_d) and written with SD / SD4.
- * NOTE(review): the LD_UB* loads presumably fetch a full 16 bytes per row,
- * i.e. 8 bytes more than are stored - confirm the source buffers allow it.
- */
-static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
-                             uint8_t *p_dst, int32_t i_dst_stride,
-                             int32_t i_height )
-{
-    int32_t i_cnt;
-    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
-    if( 0 == i_height % 12 ) /* 12 rows per iteration: 8 rows, then 4 rows */
-    {
-        for( i_cnt = ( i_height / 12 ); i_cnt--; )
-        {
-            LD_UB8( p_src, i_src_stride,
-                    src0, src1, src2, src3, src4, src5, src6, src7 );
-            p_src += ( 8 * i_src_stride );
-
-            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
-            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
-            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
-            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
-            u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
-            u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
-            u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
-            u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
-
-            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-            SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-            /* Remaining four rows of this 12-row group. */
-            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-            p_src += ( 4 * i_src_stride );
-
-            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
-            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
-            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
-            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
-
-            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-        }
-    }
-    else if( 0 == i_height % 8 ) /* 8 rows per iteration */
-    {
-        for( i_cnt = i_height >> 3; i_cnt--; )
-        {
-            LD_UB8( p_src, i_src_stride,
-                    src0, src1, src2, src3, src4, src5, src6, src7 );
-            p_src += ( 8 * i_src_stride );
-
-            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
-            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
-            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
-            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
-            u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
-            u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
-            u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
-            u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
-
-            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-            SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-        }
-    }
-    else if( 0 == i_height % 4 ) /* 4 rows per iteration */
-    {
-        for( i_cnt = ( i_height / 4 ); i_cnt--; )
-        {
-            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-            p_src += ( 4 * i_src_stride );
-            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
-            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
-            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
-            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
-
-            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-        }
-    }
-    else if( 0 == i_height % 2 ) /* 2 rows per iteration */
-    {
-        for( i_cnt = ( i_height / 2 ); i_cnt--; )
-        {
-            LD_UB2( p_src, i_src_stride, src0, src1 );
-            p_src += ( 2 * i_src_stride );
-            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
-            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
-
-            SD( u_out0, p_dst );
-            p_dst += i_dst_stride;
-            SD( u_out1, p_dst );
-            p_dst += i_dst_stride;
-        }
-    }
-}
-
-
-/*
- * Copy a block in tiles of 16 columns by 8 rows.
- * i_width >> 4 column strips are visited; inside each strip i_height >> 3
- * groups of eight rows are loaded with LD_UB8 and stored with ST_UB8.
- * Columns beyond a multiple of 16 and rows beyond a multiple of 8 are not
- * copied.
- */
-static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
-                                   uint8_t *p_dst, int32_t i_dst_stride,
-                                   int32_t i_height, int32_t i_width )
-{
-    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-    int32_t i_strips_left = i_width >> 4;
-
-    while( i_strips_left-- )
-    {
-        /* Walk down the current 16-column strip with private cursors. */
-        uint8_t *p_in = p_src;
-        uint8_t *p_out = p_dst;
-        int32_t i_groups_left = i_height >> 3;
-
-        while( i_groups_left-- )
-        {
-            LD_UB8( p_in, i_src_stride,
-                    row0, row1, row2, row3, row4, row5, row6, row7 );
-            ST_UB8( row0, row1, row2, row3, row4, row5, row6, row7,
-                    p_out, i_dst_stride );
-
-            p_in += ( 8 * i_src_stride );
-            p_out += ( 8 * i_dst_stride );
-        }
-
-        /* Step to the next strip. */
-        p_src += 16;
-        p_dst += 16;
-    }
-}
-
-/*
- * Copy a block 16 bytes wide, choosing the unroll factor from i_height:
- *   multiple of 12 -> 8 + 4 rows per iteration,
- *   multiple of 8  -> delegated to copy_16multx8mult_msa with width 16,
- *   multiple of 4  -> 4 rows per iteration.
- * Any other height copies nothing.
- */
-static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
-                              uint8_t *p_dst, int32_t i_dst_stride,
-                              int32_t i_height )
-{
-    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-    int32_t i_iters_left;
-
-    if( !( i_height % 12 ) )
-    {
-        i_iters_left = i_height / 12;
-
-        while( i_iters_left-- )
-        {
-            /* First eight rows of the 12-row group. */
-            LD_UB8( p_src, i_src_stride,
-                    row0, row1, row2, row3, row4, row5, row6, row7 );
-            p_src += ( 8 * i_src_stride );
-            ST_UB8( row0, row1, row2, row3, row4, row5, row6, row7,
-                    p_dst, i_dst_stride );
-            p_dst += ( 8 * i_dst_stride );
-
-            /* Last four rows of the 12-row group. */
-            LD_UB4( p_src, i_src_stride, row0, row1, row2, row3 );
-            p_src += ( 4 * i_src_stride );
-            ST_UB4( row0, row1, row2, row3, p_dst, i_dst_stride );
-            p_dst += ( 4 * i_dst_stride );
-        }
-
-        return;
-    }
-
-    if( !( i_height % 8 ) )
-    {
-        copy_16multx8mult_msa( p_src, i_src_stride,
-                               p_dst, i_dst_stride, i_height, 16 );
-        return;
-    }
-
-    if( i_height % 4 )
-    {
-        return;
-    }
-
-    i_iters_left = i_height >> 2;
-
-    while( i_iters_left-- )
-    {
-        LD_UB4( p_src, i_src_stride, row0, row1, row2, row3 );
-        p_src += ( 4 * i_src_stride );
-
-        ST_UB4( row0, row1, row2, row3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-/*
- * Combine two 4-byte-wide sources into p_dst with AVER_UB2_UB, two rows per
- * iteration (i_height / 2 iterations). From each result vector only word
- * element 0 is extracted and written with SW.
- */
-static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
-                                uint8_t *p_src2, int32_t i_src2_stride,
-                                uint8_t *p_dst, int32_t i_dst_stride,
-                                int32_t i_height )
-{
-    v16u8 cur0, cur1;
-    v16u8 ref0, ref1;
-    v16u8 avg0, avg1;
-    uint32_t u_word0, u_word1;
-    int32_t i_pairs_left = i_height / 2;
-
-    while( i_pairs_left-- )
-    {
-        LD_UB2( p_src1, i_src1_stride, cur0, cur1 );
-        LD_UB2( p_src2, i_src2_stride, ref0, ref1 );
-        p_src1 += ( 2 * i_src1_stride );
-        p_src2 += ( 2 * i_src2_stride );
-
-        AVER_UB2_UB( cur0, ref0, cur1, ref1, avg0, avg1 );
-
-        u_word0 = __msa_copy_u_w( ( v4i32 ) avg0, 0 );
-        u_word1 = __msa_copy_u_w( ( v4i32 ) avg1, 0 );
-
-        SW( u_word0, p_dst );
-        p_dst += i_dst_stride;
-        SW( u_word1, p_dst );
-        p_dst += i_dst_stride;
-    }
-}
-
-/*
- * Combine two 8-byte-wide sources into p_dst with AVER_UB4_UB, four rows per
- * iteration (i_height / 4 iterations). From each result vector only
- * doubleword element 0 is extracted and written with SD4.
- */
-static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
-                                uint8_t *p_src2, int32_t i_src2_stride,
-                                uint8_t *p_dst, int32_t i_dst_stride,
-                                int32_t i_height )
-{
-    v16u8 cur0, cur1, cur2, cur3;
-    v16u8 ref0, ref1, ref2, ref3;
-    v16u8 avg0, avg1, avg2, avg3;
-    uint64_t u_dword0, u_dword1, u_dword2, u_dword3;
-    int32_t i_groups_left = i_height / 4;
-
-    while( i_groups_left-- )
-    {
-        LD_UB4( p_src1, i_src1_stride, cur0, cur1, cur2, cur3 );
-        LD_UB4( p_src2, i_src2_stride, ref0, ref1, ref2, ref3 );
-        p_src1 += ( 4 * i_src1_stride );
-        p_src2 += ( 4 * i_src2_stride );
-
-        AVER_UB4_UB( cur0, ref0, cur1, ref1, cur2, ref2, cur3, ref3,
-                     avg0, avg1, avg2, avg3 );
-
-        u_dword0 = __msa_copy_u_d( ( v2i64 ) avg0, 0 );
-        u_dword1 = __msa_copy_u_d( ( v2i64 ) avg1, 0 );
-        u_dword2 = __msa_copy_u_d( ( v2i64 ) avg2, 0 );
-        u_dword3 = __msa_copy_u_d( ( v2i64 ) avg3, 0 );
-
-        SD4( u_dword0, u_dword1, u_dword2, u_dword3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-/*
- * Combine two 16-byte-wide sources into p_dst with AVER_UB4_UB, eight rows
- * per iteration (i_height / 8 iterations). The rows of the second source
- * are loaded into the vectors that afterwards receive the result.
- */
-static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
-                                 uint8_t *p_src2, int32_t i_src2_stride,
-                                 uint8_t *p_dst, int32_t i_dst_stride,
-                                 int32_t i_height )
-{
-    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;
-    v16u8 mix0, mix1, mix2, mix3, mix4, mix5, mix6, mix7;
-    int32_t i_groups_left = i_height / 8;
-
-    while( i_groups_left-- )
-    {
-        LD_UB8( p_src1, i_src1_stride,
-                cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7 );
-        LD_UB8( p_src2, i_src2_stride,
-                mix0, mix1, mix2, mix3, mix4, mix5, mix6, mix7 );
-        p_src1 += ( 8 * i_src1_stride );
-        p_src2 += ( 8 * i_src2_stride );
-
-        /* Results overwrite the second-source vectors in place. */
-        AVER_UB4_UB( cur0, mix0, cur1, mix1, cur2, mix2, cur3, mix3,
-                     mix0, mix1, mix2, mix3 );
-        AVER_UB4_UB( cur4, mix4, cur5, mix5, cur6, mix6, cur7, mix7,
-                     mix4, mix5, mix6, mix7 );
-
-        ST_UB8( mix0, mix1, mix2, mix3, mix4, mix5, mix6, mix7,
-                p_dst, i_dst_stride );
-        p_dst += ( 8 * i_dst_stride );
-    }
-}
-
-/*
- * Zero-fill rows of a buffer, one v16u8 vector store per row.
- *
- * p_src     first row to clear (written to, despite the name)
- * i_stride  distance in bytes between consecutive rows
- * i_height  row count; i_height / 2 iterations clear two rows each, so an
- *           odd trailing row is left untouched
- */
-static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride,
-                                     int32_t i_height )
-{
-    /* Full-width counter, like the other row loops in this file: an int8_t
-     * counter cannot hold i_height / 2 once that exceeds 127. */
-    int32_t i_cnt;
-    v16u8 zero = { 0 };
-
-    for( i_cnt = ( i_height / 2 ); i_cnt--; )
-    {
-        ST_UB( zero, p_src );
-        p_src += i_stride;
-        ST_UB( zero, p_src );
-        p_src += i_stride;
-    }
-}
-/*
- * Interleave two byte planes into one. The scalar tail writes, per pixel,
- * the byte from p_src0 followed by the byte from p_src1, so each output row
- * holds 2 * i_width bytes; the vector paths are presumably equivalent
- * (confirm against the ILVR / ILVL macro definitions).
- *
- * Rows are handled in groups of four (i_h4w rows in total), then the
- * remaining i_height % 4 rows one at a time. Within a row group, columns are
- * handled as 16-pixel vector chunks, then at most one 8-pixel vector chunk,
- * then a scalar loop for the last i_width % 8 pixels.
- *
- * NOTE(review): in the 8-pixel chunks the LD_UB* loads presumably still read
- * 16 bytes per row although only 8 are consumed - confirm the buffers allow
- * the over-read.
- */
-static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride,
-                                       uint8_t *p_src1, int32_t i_src1_stride,
-                                       uint8_t *p_dst, int32_t i_dst_stride,
-                                       int32_t i_width, int32_t i_height )
-{
-    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3;
-    v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3;
-
-    i_w_mul8 = i_width - i_width % 8; /* width rounded down to a multiple of 8 */
-    i_h4w = i_height - i_height % 4; /* height rounded down to a multiple of 4 */
-    /* Main part: four rows at a time. */
-    for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; )
-    {
-        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) /* 16-pixel chunks */
-        {
-            LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
-            LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
-            ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                        vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
-            ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                        vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 );
-            ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
-                    p_dst, i_dst_stride );
-            ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3,
-                    ( p_dst + 16 ), i_dst_stride );
-            p_src0 += 16;
-            p_src1 += 16;
-            p_dst += 32;
-        }
-        /* ( i_width % 16 ) >> 3 is 0 or 1: at most one 8-pixel chunk. */
-        for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
-        {
-            LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
-            LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
-            ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
-                        vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
-            ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
-                    p_dst, i_dst_stride );
-            p_src0 += 8;
-            p_src1 += 8;
-            p_dst += 16;
-        }
-        /* Scalar tail: the last i_width % 8 pixels of all four rows. */
-        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst[0] = p_src0[0];
-            p_dst[1] = p_src1[0];
-            p_dst[i_dst_stride] = p_src0[i_src0_stride];
-            p_dst[i_dst_stride + 1] = p_src1[i_src1_stride];
-            p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride];
-            p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride];
-            p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride];
-            p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride];
-            p_src0 += 1;
-            p_src1 += 1;
-            p_dst += 2;
-        }
-        /* Undo the per-row advance and step four rows down. */
-        p_src0 += ( ( 4 * i_src0_stride ) - i_width );
-        p_src1 += ( ( 4 * i_src1_stride ) - i_width );
-        p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) );
-    }
-    /* Leftover rows ( i_height % 4 ), one row at a time. */
-    for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
-    {
-        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) /* 16-pixel chunks */
-        {
-            src0 = LD_UB( p_src0 );
-            src4 = LD_UB( p_src1 );
-            ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 );
-            ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 ); /* 16 is the gap between the two stores here */
-            p_src0 += 16;
-            p_src1 += 16;
-            p_dst += 32;
-        }
-        /* At most one 8-pixel chunk. */
-        for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
-        {
-            src0 = LD_UB( p_src0 );
-            src4 = LD_UB( p_src1 );
-            vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4,
-                                                 ( v16i8 ) src0 );
-            ST_UB( vec_ilv_r0, p_dst );
-            p_src0 += 8;
-            p_src1 += 8;
-            p_dst += 16;
-        }
-        /* Scalar tail: the last i_width % 8 pixels of this row. */
-        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst[0] = p_src0[0];
-            p_dst[1] = p_src1[0];
-            p_src0 += 1;
-            p_src1 += 1;
-            p_dst += 2;
-        }
-        /* Undo the per-row advance and step one row down. */
-        p_src0 += ( i_src0_stride - i_width );
-        p_src1 += ( i_src1_stride - i_width );
-        p_dst += ( i_dst_stride - ( i_width * 2 ) );
-    }
-}
-/*
- * Split a byte-interleaved plane (two bytes per pixel) into two planes. The
- * scalar tail sends, per pixel, source byte 0 to p_dst0 and source byte 1 to
- * p_dst1; the vector paths are presumably equivalent (confirm against the
- * PCKEV / PCKOD macro definitions).
- *
- * Rows are handled in groups of eight, then the remaining i_height % 8 rows
- * one at a time. Within a row group, columns are handled as 8-pixel vector
- * chunks (16 source bytes), then at most one 4-pixel chunk (8 source bytes),
- * then a scalar loop for the last i_width % 4 pixels.
- *
- * NOTE(review): in the 4-pixel chunks the LD_UB* loads presumably still read
- * 16 bytes per row although only 8 are consumed - confirm the buffers allow
- * the over-read.
- */
-static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride,
-                                         uint8_t *p_dst0, int32_t dst0_stride,
-                                         uint8_t *p_dst1, int32_t dst1_stride,
-                                         int32_t i_width, int32_t i_height )
-{
-    int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w;
-    uint32_t u_res_w0, u_res_w1;
-    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
-    v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3;
-    v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3;
-    uint8_t *p_dst;
-
-    i_w_mul8 = i_width - i_width % 8; /* width rounded down to a multiple of 8 */
-    i_w_mul4 = i_width - i_width % 4; /* width rounded down to a multiple of 4 */
-    i_h4w = i_height - i_height % 8; /* despite the name: height rounded down to a multiple of 8 */
-    /* Main part: eight rows at a time. */
-    for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; )
-    {
-        for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) /* 8-pixel chunks */
-        {
-            LD_UB8( p_src, i_src_stride,
-                    in0, in1, in2, in3, in4, in5, in6, in7 );
-            p_src += 16;
-            PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
-                         vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
-            PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
-                         vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
-            ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride );
-            p_dst = p_dst0 + 4 * dst0_stride; /* rows 4..7 of plane 0 */
-            ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride );
-            ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride );
-            p_dst = p_dst1 + 4 * dst1_stride; /* rows 4..7 of plane 1 */
-            ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride );
-            p_dst0 += 8;
-            p_dst1 += 8;
-        }
-        /* ( i_width % 8 ) >> 2 is 0 or 1: at most one 4-pixel chunk. */
-        for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
-        {
-            LD_UB8( p_src, i_src_stride,
-                    in0, in1, in2, in3, in4, in5, in6, in7 );
-            p_src += 8;
-            PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
-                         vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
-            PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
-                         vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
-            ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride );
-            p_dst = p_dst0 + 4 * dst0_stride; /* rows 4..7 of plane 0 */
-            ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride );
-            ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride );
-            p_dst = p_dst1 + 4 * dst1_stride; /* rows 4..7 of plane 1 */
-            ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride );
-            p_dst0 += 4;
-            p_dst1 += 4;
-        }
-        /* Scalar tail: the last i_width % 4 pixels of all eight rows. */
-        for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst0[0] = p_src[0];
-            p_dst1[0] = p_src[1];
-            p_dst0[dst0_stride] = p_src[i_src_stride];
-            p_dst1[dst1_stride] = p_src[i_src_stride + 1];
-            p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride];
-            p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1];
-            p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride];
-            p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1];
-            p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride];
-            p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1];
-            p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride];
-            p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1];
-            p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride];
-            p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1];
-            p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride];
-            p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1];
-            p_dst0 += 1;
-            p_dst1 += 1;
-            p_src += 2;
-        }
-        /* Undo the per-row advance and step eight rows down. */
-        p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) );
-        p_dst0 += ( ( 8 * dst0_stride ) - i_width );
-        p_dst1 += ( ( 8 * dst1_stride ) - i_width );
-    }
-    /* Leftover rows ( i_height % 8 ), one row at a time. */
-    for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
-    {
-        for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) /* 8-pixel chunks */
-        {
-            in0 = LD_UB( p_src );
-            p_src += 16;
-            vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
-                                                  ( v16i8 ) in0 );
-            vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
-                                                  ( v16i8 ) in0 );
-            ST8x1_UB( vec_pckev0, p_dst0 );
-            ST8x1_UB( vec_pckod0, p_dst1 );
-            p_dst0 += 8;
-            p_dst1 += 8;
-        }
-        /* At most one 4-pixel chunk. */
-        for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
-        {
-            in0 = LD_UB( p_src );
-            p_src += 8;
-            vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
-                                                  ( v16i8 ) in0 );
-            vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
-                                                  ( v16i8 ) in0 );
-            u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 );
-            SW( u_res_w0, p_dst0 );
-            u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 );
-            SW( u_res_w1, p_dst1 );
-            p_dst0 += 4;
-            p_dst1 += 4;
-        }
-        /* Scalar tail: the last i_width % 4 pixels of this row. */
-        for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst0[0] = p_src[0];
-            p_dst1[0] = p_src[1];
-            p_dst0 += 1;
-            p_dst1 += 1;
-            p_src += 2;
-        }
-        /* Undo the per-row advance and step one row down. */
-        p_src += ( ( i_src_stride ) - ( i_width << 1 ) );
-        p_dst0 += ( ( dst0_stride ) - i_width );
-        p_dst1 += ( ( dst1_stride ) - i_width );
-    }
-}
-
-/*
- * Split a packed plane with three bytes per pixel into three planes. The
- * scalar tail sends, per pixel, source byte 0 to p_dst0, byte 1 to p_dst1
- * and byte 2 to p_dst2; the vector paths are presumably equivalent (confirm
- * against the VSHF_B2_SB / __msa_vshf_b semantics).
- *
- * Rows are handled in groups of four, then the remaining i_height % 4 rows
- * one at a time. Within a row group, columns are handled as 8-pixel vector
- * chunks (24 source bytes, loaded as two vectors at p_src and p_src + 16),
- * then a scalar loop for the last i_width % 8 pixels that indexes from the
- * *_orig row pointers.
- *
- * NOTE(review): the second load of each chunk presumably reads 16 bytes from
- * p_src + 16 although only 24 bytes per chunk are consumed - confirm the
- * source buffer allows the over-read.
- */
-static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src,
-                                             int32_t i_src_stride,
-                                             uint8_t *p_dst0,
-                                             int32_t i_dst0_stride,
-                                             uint8_t *p_dst1,
-                                             int32_t i_dst1_stride,
-                                             uint8_t *p_dst2,
-                                             int32_t i_dst2_stride,
-                                             int32_t i_width,
-                                             int32_t i_height )
-{
-    uint8_t *p_src_orig = p_src; /* the *_orig pointers track the start of the current row */
-    uint8_t *p_dst0_orig = p_dst0;
-    uint8_t *p_dst1_orig = p_dst1;
-    uint8_t *p_dst2_orig = p_dst2;
-    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
-    v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
-    v16i8 temp0, temp1, temp2, temp3;
-    v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 }; /* every third index from 0 */
-    v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 }; /* every third index from 1 */
-    v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 }; /* every third index from 2 */
-
-    i_w_mul8 = i_width - i_width % 8; /* width rounded down to a multiple of 8 */
-    i_h_mul4 = i_height - i_height % 4; /* height rounded down to a multiple of 4 */
-    /* Main part: four rows at a time. */
-    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
-    {
-        p_src = p_src_orig;
-        p_dst0 = p_dst0_orig;
-        p_dst1 = p_dst1_orig;
-        p_dst2 = p_dst2_orig;
-
-        for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) /* 8-pixel chunks */
-        {
-            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
-            LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
-            /* Plane 0: bytes selected by mask0, one 8-byte store per row. */
-            VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 );
-            VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 );
-            ST8x1_UB( temp0, p_dst0 );
-            ST8x1_UB( temp1, p_dst0 + i_dst0_stride );
-            ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride );
-            ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride );
-            /* Plane 1: bytes selected by mask1. */
-            VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 );
-            VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 );
-            ST8x1_UB( temp0, p_dst1 );
-            ST8x1_UB( temp1, p_dst1 + i_dst1_stride );
-            ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride );
-            ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride );
-            /* Plane 2: bytes selected by mask2. */
-            VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 );
-            VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 );
-            ST8x1_UB( temp0, p_dst2 );
-            ST8x1_UB( temp1, p_dst2 + i_dst2_stride );
-            ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride );
-            ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride );
-
-            p_src += 8 * 3;
-            p_dst0 += 8;
-            p_dst1 += 8;
-            p_dst2 += 8;
-        }
-        /* Scalar tail: the last i_width % 8 pixels of all four rows. */
-        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width];
-            p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width];
-            p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width];
-
-            p_dst0_orig[i_loop_width + i_dst0_stride] =
-                p_src_orig[0 + i_src_stride + 3 * i_loop_width];
-            p_dst1_orig[i_loop_width + i_dst1_stride] =
-                p_src_orig[1 + i_src_stride + 3 * i_loop_width];
-            p_dst2_orig[i_loop_width + i_dst2_stride] =
-                p_src_orig[2 + i_src_stride + 3 * i_loop_width];
-
-            p_dst0_orig[i_loop_width + 2 * i_dst0_stride] =
-                p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width];
-            p_dst1_orig[i_loop_width + 2 * i_dst1_stride] =
-                p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width];
-            p_dst2_orig[i_loop_width + 2 * i_dst2_stride] =
-                p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width];
-
-            p_dst0_orig[i_loop_width + 3 * i_dst0_stride] =
-                p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width];
-            p_dst1_orig[i_loop_width + 3 * i_dst1_stride] =
-                p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width];
-            p_dst2_orig[i_loop_width + 3 * i_dst2_stride] =
-                p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width];
-        }
-        /* Step the row pointers four rows down. */
-        p_src_orig += ( 4 * i_src_stride );
-        p_dst0_orig += ( 4 * i_dst0_stride );
-        p_dst1_orig += ( 4 * i_dst1_stride );
-        p_dst2_orig += ( 4 * i_dst2_stride );
-    }
-    /* Leftover rows ( i_height % 4 ), one row at a time. */
-    for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
-    {
-        p_src = p_src_orig;
-        p_dst0 = p_dst0_orig;
-        p_dst1 = p_dst1_orig;
-        p_dst2 = p_dst2_orig;
-
-        for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) /* 8-pixel chunks */
-        {
-            in0 = LD_SB( p_src );
-            in4 = LD_SB( p_src + 16 );
-            temp0 = __msa_vshf_b( mask0, in4, in0 );
-            ST8x1_UB( temp0, p_dst0 );
-            temp0 = __msa_vshf_b( mask1, in4, in0 );
-            ST8x1_UB( temp0, p_dst1 );
-            temp0 = __msa_vshf_b( mask2, in4, in0 );
-            ST8x1_UB( temp0, p_dst2 );
-
-            p_src += 8 * 3;
-            p_dst0 += 8;
-            p_dst1 += 8;
-            p_dst2 += 8;
-        }
-        /* Scalar tail: the last i_width % 8 pixels of this row. */
-        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width];
-            p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1];
-            p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2];
-        }
-        /* Step the row pointers one row down. */
-        p_src_orig += ( i_src_stride );
-        p_dst0_orig += ( i_dst0_stride );
-        p_dst1_orig += ( i_dst1_stride );
-        p_dst2_orig += ( i_dst2_stride );
-    }
-}
-
-/* Split packed 4-byte pixels into three separate byte planes.
- *
- * The scalar tail loops define the mapping: for pixel x of a row,
- * p_src[4 * x + 0] goes to p_dst0, p_src[4 * x + 1] to p_dst1 and
- * p_src[4 * x + 2] to p_dst2.  The fourth byte of each pixel is not copied
- * anywhere by those loops.  The vector loops handle 16 and 8 pixels per
- * step and store their three pack results to p_dst0, p_dst2 and p_dst1 in
- * that order.
- *
- * Rows are processed four at a time, then one at a time for the remaining
- * i_height % 4 rows.  Inside each row group the columns are covered by
- * 16-pixel steps, at most one 8-pixel step, and finally a per-pixel loop
- * that starts at i_w_mul8 (i_width rounded down to a multiple of 8).
- *
- * p_src, i_src_stride      packed source and its row stride
- * p_dstN, i_dstN_stride    destination plane N and its row stride
- * i_width, i_height        area to convert, in pixels
- */
-static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src,
-                                              int32_t i_src_stride,
-                                              uint8_t *p_dst0,
-                                              int32_t i_dst0_stride,
-                                              uint8_t *p_dst1,
-                                              int32_t i_dst1_stride,
-                                              uint8_t *p_dst2,
-                                              int32_t i_dst2_stride,
-                                              int32_t i_width,
-                                              int32_t i_height )
-{
-    /* The *_orig pointers track the start of the current row group; the
-     * plain pointers walk along the row inside the vector loops. */
-    uint8_t *p_src_orig = p_src;
-    uint8_t *p_dst0_orig = p_dst0;
-    uint8_t *p_dst1_orig = p_dst1;
-    uint8_t *p_dst2_orig = p_dst2;
-    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
-    v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
-    v16i8 in8, in9, in10, in11, in12, in13, in14, in15;
-    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-    v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
-
-    i_w_mul8 = i_width - i_width % 8;
-    i_h_mul4 = i_height - i_height % 4;
-
-    /* Groups of four rows. */
-    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
-    {
-        p_src = p_src_orig;
-        p_dst0 = p_dst0_orig;
-        p_dst1 = p_dst1_orig;
-        p_dst2 = p_dst2_orig;
-
-        /* 16 pixels (64 source bytes) of each of the four rows per step. */
-        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
-        {
-            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
-            LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
-            LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 );
-            LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 );
-
-            /* First pack at halfword granularity (even / odd halfwords of
-             * each row), then at byte granularity to isolate the planes. */
-            PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
-            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
-            temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
-            PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 );
-            temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
-            temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 );
-            PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 );
-            temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
-            temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 );
-            PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 );
-            temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
-            temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 );
-            PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
-            in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
-            PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 );
-            in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 );
-            PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 );
-            in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 );
-            PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 );
-            in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 );
-            ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride );
-            ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride );
-            ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride );
-
-            p_src += 16 * 4;
-            p_dst0 += 16;
-            p_dst1 += 16;
-            p_dst2 += 16;
-        }
-
-        /* At most one 8-pixel step for the part of the row left over by
-         * the 16-pixel loop; only 8 bytes per row and plane are stored. */
-        for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
-        {
-            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
-            LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 );
-
-            PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 );
-            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
-            temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
-
-            PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 );
-            temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
-            temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
-
-            PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
-            in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
-            PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 );
-            in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 );
-            PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 );
-            in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 );
-            PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 );
-            in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 );
-
-            ST8x1_UB( in0, p_dst0 );
-            ST8x1_UB( in4, p_dst0 + i_dst0_stride );
-            ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride );
-            ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride );
-
-            ST8x1_UB( in1, p_dst2 );
-            ST8x1_UB( in5, p_dst2 + i_dst2_stride );
-            ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride );
-            ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride );
-
-            ST8x1_UB( in2, p_dst1 );
-            ST8x1_UB( in6, p_dst1 + i_dst1_stride );
-            ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride );
-            ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride );
-
-            p_src += 8 * 4;
-            p_dst0 += 8;
-            p_dst1 += 8;
-            p_dst2 += 8;
-        }
-
-        /* Scalar tail: the last i_width % 8 pixels of all four rows,
-         * indexed from the start of the row group. */
-        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
-            p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
-            p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
-
-            p_dst0_orig[i_dst0_stride + i_loop_width] =
-                p_src_orig[i_src_stride + 4 * i_loop_width];
-            p_dst1_orig[i_dst1_stride + i_loop_width] =
-                p_src_orig[i_src_stride + 4 * i_loop_width + 1];
-            p_dst2_orig[i_dst2_stride + i_loop_width] =
-                p_src_orig[i_src_stride + 4 * i_loop_width + 2];
-
-            p_dst0_orig[2 * i_dst0_stride + i_loop_width] =
-                p_src_orig[2 * i_src_stride + 4 * i_loop_width];
-            p_dst1_orig[2 * i_dst1_stride + i_loop_width] =
-                p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1];
-            p_dst2_orig[2 * i_dst2_stride + i_loop_width] =
-                p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2];
-
-            p_dst0_orig[3 * i_dst0_stride + i_loop_width] =
-                p_src_orig[3 * i_src_stride + 4 * i_loop_width];
-            p_dst1_orig[3 * i_dst1_stride + i_loop_width] =
-                p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1];
-            p_dst2_orig[3 * i_dst2_stride + i_loop_width] =
-                p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2];
-        }
-
-        p_src_orig += ( 4 * i_src_stride );
-        p_dst0_orig += ( 4 * i_dst0_stride );
-        p_dst1_orig += ( 4 * i_dst1_stride );
-        p_dst2_orig += ( 4 * i_dst2_stride );
-    }
-
-    /* Remaining i_height % 4 rows, one row per iteration. */
-    for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
-    {
-        p_src = p_src_orig;
-        p_dst0 = p_dst0_orig;
-        p_dst1 = p_dst1_orig;
-        p_dst2 = p_dst2_orig;
-
-        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
-        {
-            /* Here the LD_SB4 "stride" is 16, i.e. four consecutive
-             * vectors of the same row. */
-            LD_SB4( p_src, 16, in0, in4, in8, in12 );
-
-            PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
-            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
-            temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
-            PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
-            in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
-            /* One store per plane is sufficient. */
-            ST_SB( in0, p_dst0 );
-            ST_SB( in1, p_dst2 );
-            ST_SB( in2, p_dst1 );
-
-            p_src += 16 * 4;
-            p_dst0 += 16;
-            p_dst1 += 16;
-            p_dst2 += 16;
-        }
-
-        for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
-        {
-            in0 = LD_SB( p_src );
-            in4 = LD_SB( p_src + 16 );
-
-            temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 );
-            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
-            PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
-            in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
-            ST8x1_UB( in0, p_dst0 );
-            ST8x1_UB( in1, p_dst2 );
-            ST8x1_UB( in2, p_dst1 );
-
-            p_src += 8 * 4;
-            p_dst0 += 8;
-            p_dst1 += 8;
-            p_dst2 += 8;
-        }
-
-        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
-        {
-            p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
-            p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
-            p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
-        }
-
-        p_src_orig += ( i_src_stride );
-        p_dst0_orig += ( i_dst0_stride );
-        p_dst1_orig += ( i_dst1_stride );
-        p_dst2_orig += ( i_dst2_stride );
-    }
-}
-
-/* Interleave two byte planes row by row into p_dst.
- *
- * For every row, one vector is loaded from p_src0 and one from p_src1, the
- * two are combined with the MSA "interleave right" byte operation, and the
- * result is written to p_dst as a single full-vector store.  Rows are
- * handled four at a time first, then singly for the i_height % 4 rest.
- */
-static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride,
-                                         uint8_t *p_src1, int32_t i_src1_stride,
-                                         uint8_t *p_dst, int32_t i_dst_stride,
-                                         int32_t i_height )
-{
-    v16u8 a0, a1, a2, a3;
-    v16u8 b0, b1, b2, b3;
-    v16u8 mix0, mix1, mix2, mix3;
-    int32_t i_quads = i_height >> 2;
-    int32_t i_singles = i_height % 4;
-
-    /* Four rows per pass. */
-    while( i_quads-- )
-    {
-        LD_UB4( p_src0, i_src0_stride, a0, a1, a2, a3 );
-        LD_UB4( p_src1, i_src1_stride, b0, b1, b2, b3 );
-        ILVR_B4_UB( b0, a0, b1, a1, b2, a2, b3, a3, mix0, mix1, mix2, mix3 );
-        ST_UB4( mix0, mix1, mix2, mix3, p_dst, i_dst_stride );
-
-        p_src0 += ( 4 * i_src0_stride );
-        p_src1 += ( 4 * i_src1_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-
-    /* Leftover rows, one per pass. */
-    while( i_singles-- )
-    {
-        a0 = LD_UB( p_src0 );
-        b0 = LD_UB( p_src1 );
-        mix0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) b0, ( v16i8 ) a0 );
-        ST_UB( mix0, p_dst );
-
-        p_src0 += ( i_src0_stride );
-        p_src1 += ( i_src1_stride );
-        p_dst += ( i_dst_stride );
-    }
-}
-
-/* Produce four reduced-size planes (p_dst0..p_dst3) from one source plane.
- *
- * Each output row is computed from three consecutive source rows (loaded
- * with LD_UB3 at i_src_stride), and p_src moves down by two source rows
- * per output row.  Horizontally, 32 source bytes yield 16 output bytes.
- *
- * p_dst0 / p_dst2 are built from the even/odd byte packs of the loaded
- * rows; p_dst1 / p_dst3 use the same pipeline on data shifted by one byte
- * (SLDI_B2_UB with a shift of 1).  Within each pair, the first result
- * combines the first and second loaded row and the second result combines
- * the second and third row -- this reading follows the AVER_* argument
- * order; confirm against the macro definitions.
- *
- * i_width and i_height are the dimensions of the output planes.
- * NOTE(review): the end-of-row pointer adjustments subtract exactly
- * i_width (and 2 * i_width + 16 for the source), which appears to assume
- * that i_width is a multiple of 8 -- confirm with the callers.
- */
-static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride,
-                                        uint8_t *p_dst0, int32_t dst0_stride,
-                                        uint8_t *p_dst1, int32_t dst1_stride,
-                                        uint8_t *p_dst2, int32_t dst2_stride,
-                                        uint8_t *p_dst3, int32_t dst3_stride,
-                                        int32_t i_width, int32_t i_height )
-{
-    int32_t i_loop_width, i_loop_height, i_w16_mul;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5;
-    v16u8 pckev_vec0, pckev_vec1, pckev_vec2;
-    v16u8 pckod_vec0, pckod_vec1, pckod_vec2;
-    v16u8 tmp0, tmp1, tmp2, tmp3;
-    v16u8 res0, res1;
-
-    /* i_width rounded down to a multiple of 16. */
-    i_w16_mul = i_width - i_width % 16;
-    for( i_loop_height = i_height; i_loop_height--; )
-    {
-        /* Prime src0..src2 with the first 16 bytes of the three rows; the
-         * loops below always load the vectors that follow them. */
-        LD_UB3( p_src, i_src_stride, src0, src1, src2 );
-        p_src += 16;
-        for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ )
-        {
-            /* src3..src5: next 16 bytes of each row; src6..src8: the 16
-             * bytes after that (needed for the shifted variant and carried
-             * over as src0..src2 of the next iteration). */
-            LD_UB3( p_src, i_src_stride, src3, src4, src5 );
-            p_src += 16;
-            LD_UB3( p_src, i_src_stride, src6, src7, src8 );
-            p_src += 16;
-            PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
-            PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
-            pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
-                                                  ( v16i8 ) src2 );
-            pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
-                                                  ( v16i8 ) src2 );
-            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
-                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
-                         tmp0, tmp1, tmp2, tmp3 );
-            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
-            ST_UB( res0, p_dst0 );
-            ST_UB( res1, p_dst2 );
-
-            /* Same computation on the rows shifted by one byte.  Note that
-             * pckev_vec0..2 are overwritten here with PCKOD results of the
-             * shifted vectors, while pckod_vec0..2 still hold the values
-             * computed above. */
-            SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
-            SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 );
-            SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 );
-            PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
-                         pckev_vec0, pckev_vec1 )
-            pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
-                                                  ( v16i8 ) sld1_vec2 );
-            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
-                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
-                         tmp0, tmp1, tmp2, tmp3 );
-            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
-            ST_UB( res0, p_dst1 );
-            ST_UB( res1, p_dst3 );
-
-            /* Carry the last loaded vectors into the next iteration. */
-            src0 = src6;
-            src1 = src7;
-            src2 = src8;
-            p_dst0 += 16;
-            p_dst1 += 16;
-            p_dst2 += 16;
-            p_dst3 += 16;
-        }
-
-        /* Remainder of the row in steps of 8 output bytes: only one more
-         * vector per row is loaded and only 8 bytes per plane are stored. */
-        for( i_loop_width = i_w16_mul; i_loop_width < i_width;
-             i_loop_width += 8 )
-        {
-            LD_UB3( p_src, i_src_stride, src3, src4, src5 );
-            p_src += 16;
-            PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
-            PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
-            pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
-                                                  ( v16i8 ) src2 );
-            pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
-                                                  ( v16i8 ) src2 );
-            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
-                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
-                         tmp0, tmp1, tmp2, tmp3 );
-            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
-            ST8x1_UB( res0, p_dst0 );
-            ST8x1_UB( res1, p_dst2 );
-
-            /* No src6..src8 exist in this loop, so the second and third
-             * shift use src3..src5 for both operands instead. */
-            SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
-            SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 );
-            SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 );
-            PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
-                         pckev_vec0, pckev_vec1 )
-            pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
-                                                  ( v16i8 ) sld1_vec2 );
-            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
-                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
-                         tmp0, tmp1, tmp2, tmp3 );
-            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
-            ST8x1_UB( res0, p_dst1 );
-            ST8x1_UB( res1, p_dst3 );
-            p_dst0 += 8;
-            p_dst1 += 8;
-            p_dst2 += 8;
-            p_dst3 += 8;
-        }
-
-        /* Rewind to the row start (the row consumed 2 * i_width bytes plus
-         * the 16-byte priming load) and step two source rows down; each
-         * destination moves to the start of its next row. */
-        p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) );
-        p_dst0 += ( dst0_stride - i_width );
-        p_dst1 += ( dst1_stride - i_width );
-        p_dst2 += ( dst2_stride - i_width );
-        p_dst3 += ( dst3_stride - i_width );
-    }
-}
-
-/* Exported copy entry point taking (dst, src) argument order.
- * It only forwards to copy_width16_msa(), which expects the source pointer
- * and stride first, followed by the destination pair and the row count. */
-void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                           uint8_t *p_src, intptr_t i_src_stride,
-                           int32_t i_height )
-{
-    copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
-}
-
-/* Exported copy entry point taking (dst, src) argument order.
- * It only forwards to copy_width8_msa(), which expects the source pointer
- * and stride first, followed by the destination pair and the row count. */
-void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
-                          intptr_t i_src_stride, int32_t i_height )
-{
-    copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
-}
-
-/* Exported copy entry point taking (dst, src) argument order.
- * It only forwards to copy_width4_msa(), which expects the source pointer
- * and stride first, followed by the destination pair and the row count. */
-void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
-                          intptr_t i_src_stride, int32_t i_height )
-{
-    copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (16-wide helpers,
- * 16 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_16width_nw_msa()
- *   - exactly 32:      avg_src_width16_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_16width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                               uint8_t *p_pix2, intptr_t pix2_stride,
-                               uint8_t *p_pix3, intptr_t pix3_stride,
-                               int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
-                                          p_pix3, pix3_stride,
-                                          p_pix1, pix1_stride,
-                                          16, 5, i_weight,
-                                          ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
-                                       p_pix3, pix3_stride,
-                                       p_pix1, pix1_stride,
-                                       16, 5, i_weight,
-                                       ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                         p_pix1, pix1_stride, 16 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (16-wide helpers,
- * 8 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_16width_nw_msa()
- *   - exactly 32:      avg_src_width16_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_16width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                              uint8_t *p_pix2, intptr_t pix2_stride,
-                              uint8_t *p_pix3, intptr_t pix3_stride,
-                              int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
-                                          p_pix3, pix3_stride,
-                                          p_pix1, pix1_stride,
-                                          8, 5, i_weight,
-                                          ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
-                                       p_pix3, pix3_stride,
-                                       p_pix1, pix1_stride,
-                                       8, 5, i_weight,
-                                       ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                         p_pix1, pix1_stride, 8 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (8-wide helpers,
- * 16 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_8width_nw_msa()
- *   - exactly 32:      avg_src_width8_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_8width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                              uint8_t *p_pix2, intptr_t pix2_stride,
-                              uint8_t *p_pix3, intptr_t pix3_stride,
-                              int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
-                                         p_pix3, pix3_stride,
-                                         p_pix1, pix1_stride, 16, 5, i_weight,
-                                         ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 16, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 16 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (8-wide helpers,
- * 8 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_8width_nw_msa()
- *   - exactly 32:      avg_src_width8_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_8width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                             uint8_t *p_pix2, intptr_t pix2_stride,
-                             uint8_t *p_pix3, intptr_t pix3_stride,
-                             int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
-                                         p_pix3, pix3_stride,
-                                         p_pix1, pix1_stride, 8, 5, i_weight,
-                                         ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 8, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 8 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (8-wide helpers,
- * 4 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_8width_nw_msa()
- *   - exactly 32:      avg_src_width8_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_8width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                             uint8_t *p_pix2, intptr_t pix2_stride,
-                             uint8_t *p_pix3, intptr_t pix3_stride,
-                             int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
-                                         p_pix3, pix3_stride,
-                                         p_pix1, pix1_stride, 4, 5, i_weight,
-                                         ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 4, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 4 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (4-wide helpers,
- * 16 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_4width_nw_msa()
- *   - exactly 32:      avg_src_width4_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_4width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                              uint8_t *p_pix2, intptr_t pix2_stride,
-                              uint8_t *p_pix3, intptr_t pix3_stride,
-                              int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
-                                         p_pix3, pix3_stride,
-                                         p_pix1, pix1_stride, 16, 5, i_weight,
-                                         ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 16, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 16 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (4-wide helpers,
- * 8 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_4width_nw_msa()
- *   - exactly 32:      avg_src_width4_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_4width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                             uint8_t *p_pix2, intptr_t pix2_stride,
-                             uint8_t *p_pix3, intptr_t pix3_stride,
-                             int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
-                                         p_pix3, pix3_stride,
-                                         p_pix1, pix1_stride, 8, 5, i_weight,
-                                         ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 8, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 8 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (4-wide helpers,
- * 4 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_4width_nw_msa()
- *   - exactly 32:      avg_src_width4_msa(), no weight arguments
- *   - anything else:   avc_biwgt_opscale_4width_msa()
- * Both weighted helpers receive 5, i_weight, 64 - i_weight and 0 as their
- * trailing arguments.
- */
-void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                             uint8_t *p_pix2, intptr_t pix2_stride,
-                             uint8_t *p_pix3, intptr_t pix3_stride,
-                             int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
-                                         p_pix3, pix3_stride,
-                                         p_pix1, pix1_stride, 4, 5, i_weight,
-                                         ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 4, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 4 );
-}
-
-/* Combine the p_pix2 and p_pix3 blocks into p_pix1 (4 wide, 2 rows).
- *
- * The helper is chosen from i_weight:
- *   - outside [0, 63]: avc_biwgt_opscale_4x2_nw_msa()
- *   - exactly 32:      avg_src_width4_msa() with a row count of 2
- *   - anything else:   avc_biwgt_opscale_4x2_msa()
- * The dedicated 4x2 weighted helpers take no row-count argument; they
- * receive 5, i_weight, 64 - i_weight and 0 after the three pointer/stride
- * pairs.
- */
-void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
-                             uint8_t *p_pix2, intptr_t pix2_stride,
-                             uint8_t *p_pix3, intptr_t pix3_stride,
-                             int32_t i_weight )
-{
-    if( i_weight < 0 || i_weight > 63 )
-    {
-        avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
-                                      p_pix3, pix3_stride,
-                                      p_pix1, pix1_stride, 5, i_weight,
-                                      ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    if( 32 != i_weight )
-    {
-        avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
-                                   p_pix3, pix3_stride,
-                                   p_pix1, pix1_stride, 5, i_weight,
-                                   ( 64 - i_weight ), 0 );
-        return;
-    }
-
-    avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
-                        p_pix1, pix1_stride, 2 );
-}
-
-
-/* Zero n bytes starting at p_dst.
- *
- * The vector helper is asked to clear n / 16 rows of 16 bytes each (row
- * stride 16).  Whatever lies beyond the last whole multiple of 32 bytes is
- * then cleared with memset() -- presumably because the helper stores rows
- * in pairs; confirm against memset_zero_16width_msa().
- *
- * The byte offset is applied through a uint8_t pointer: arithmetic
- * directly on a void pointer is a GNU extension, not ISO C.
- */
-void x264_memzero_aligned_msa( void *p_dst, size_t n )
-{
-    uint8_t *p_bytes = ( uint8_t * ) p_dst;
-    uint32_t u_tot32_mul_lines = n >> 5;
-    uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 );
-
-    memset_zero_16width_msa( p_bytes, 16, ( n / 16 ) );
-
-    if( u_remaining )
-    {
-        memset( p_bytes + ( u_tot32_mul_lines << 5 ), 0, u_remaining );
-    }
-}
-
-/* Weighted copy entry point for the 4-wide helper.
- * Reads i_denom, i_scale and i_offset from pWeight and hands them to
- * avc_wgt_opscale_4width_msa() as log2 denominator, weight and offset,
- * after the (src, dst) pointer/stride pairs and the row count. */
-void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                            uint8_t *p_src, intptr_t i_src_stride,
-                            const x264_weight_t *pWeight, int32_t i_height )
-{
-    avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
-                                i_height,
-                                ( int32_t ) pWeight->i_denom,
-                                ( int32_t ) pWeight->i_scale,
-                                ( int32_t ) pWeight->i_offset );
-}
-
-/* Weighted copy entry point for the 8-wide helper.
- * Reads i_denom, i_scale and i_offset from pWeight and hands them to
- * avc_wgt_opscale_8width_msa() as log2 denominator, weight and offset,
- * after the (src, dst) pointer/stride pairs and the row count. */
-void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                            uint8_t *p_src, intptr_t i_src_stride,
-                            const x264_weight_t *pWeight, int32_t i_height )
-{
-    avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
-                                i_height,
-                                ( int32_t ) pWeight->i_denom,
-                                ( int32_t ) pWeight->i_scale,
-                                ( int32_t ) pWeight->i_offset );
-}
-
-/* Weighted copy entry point for the 16-wide helper.
- * Reads i_denom, i_scale and i_offset from pWeight and hands them to
- * avc_wgt_opscale_16width_msa() as log2 denominator, weight and offset,
- * after the (src, dst) pointer/stride pairs and the row count. */
-void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                             uint8_t *p_src, intptr_t i_src_stride,
-                             const x264_weight_t *pWeight, int32_t i_height )
-{
-    avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
-                                 i_height,
-                                 ( int32_t ) pWeight->i_denom,
-                                 ( int32_t ) pWeight->i_scale,
-                                 ( int32_t ) pWeight->i_offset );
-}
-
-/* Weighted copy for a 20-column area, composed from the two narrower
- * entry points: x264_mc_weight_w16_msa() at the given pointers, then
- * x264_mc_weight_w4_msa() at p_dst + 16 / p_src + 16.  Both calls use the
- * same strides, weight descriptor and row count. */
-void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                             uint8_t *p_src, intptr_t i_src_stride,
-                             const x264_weight_t *pWeight, int32_t i_height )
-{
-    x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
-                            pWeight, i_height );
-    x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
-                           pWeight, i_height );
-}
-
-/* Luma motion compensation into p_dst.
- *
- * The low two bits of m_vy and m_vx form a 4-bit index that selects, via
- * x264_hpel_ref0[] / x264_hpel_ref1[], which of the four p_src planes to
- * read; the remaining bits give the offset into the plane.
- *
- *   - index & 5 set:   the two selected references are averaged into p_dst
- *                      and, if pWeight->weightfn is set, p_dst is then
- *                      weighted in place.
- *   - index & 5 clear: the first reference is weighted into p_dst when
- *                      pWeight->weightfn is set, otherwise plainly copied.
- *
- * Only widths 16, 8 and 4 are handled; any other i_width leaves p_dst
- * untouched.
- */
-void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                       uint8_t *p_src[4], intptr_t i_src_stride,
-                       int32_t m_vx, int32_t m_vy,
-                       int32_t i_width, int32_t i_height,
-                       const x264_weight_t *pWeight )
-{
-    const int32_t i_frac_x = m_vx & 3;
-    const int32_t i_frac_y = m_vy & 3;
-    const int32_t i_qpel_idx = ( i_frac_y << 2 ) + i_frac_x;
-    const int32_t i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
-    uint8_t *p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
-                      ( 3 == i_frac_y ) * i_src_stride;
-    uint8_t *p_src2;
-
-    if( !( i_qpel_idx & 5 ) )
-    {
-        /* Single reference: weight it or copy it straight to p_dst. */
-        if( pWeight->weightfn )
-        {
-            switch( i_width )
-            {
-                case 16:
-                    x264_mc_weight_w16_msa( p_dst, i_dst_stride,
-                                            p_src1, i_src_stride,
-                                            pWeight, i_height );
-                    break;
-                case 8:
-                    x264_mc_weight_w8_msa( p_dst, i_dst_stride,
-                                           p_src1, i_src_stride,
-                                           pWeight, i_height );
-                    break;
-                case 4:
-                    x264_mc_weight_w4_msa( p_dst, i_dst_stride,
-                                           p_src1, i_src_stride,
-                                           pWeight, i_height );
-                    break;
-            }
-        }
-        else
-        {
-            switch( i_width )
-            {
-                case 16:
-                    copy_width16_msa( p_src1, i_src_stride,
-                                      p_dst, i_dst_stride, i_height );
-                    break;
-                case 8:
-                    copy_width8_msa( p_src1, i_src_stride,
-                                     p_dst, i_dst_stride, i_height );
-                    break;
-                case 4:
-                    copy_width4_msa( p_src1, i_src_stride,
-                                     p_dst, i_dst_stride, i_height );
-                    break;
-            }
-        }
-        return;
-    }
-
-    /* Two references: average them into p_dst first. */
-    p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
-             i_offset + ( 3 == i_frac_x );
-
-    switch( i_width )
-    {
-        case 16:
-            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
-                                 p_dst, i_dst_stride, i_height );
-            break;
-        case 8:
-            avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
-                                p_dst, i_dst_stride, i_height );
-            break;
-        case 4:
-            avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
-                                p_dst, i_dst_stride, i_height );
-            break;
-    }
-
-    if( !pWeight->weightfn )
-    {
-        return;
-    }
-
-    /* Apply the weight to the averaged block in place. */
-    switch( i_width )
-    {
-        case 16:
-            x264_mc_weight_w16_msa( p_dst, i_dst_stride,
-                                    p_dst, i_dst_stride,
-                                    pWeight, i_height );
-            break;
-        case 8:
-            x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
-                                   pWeight, i_height );
-            break;
-        case 4:
-            x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
-                                   pWeight, i_height );
-            break;
-    }
-}
-
-void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
-                         intptr_t i_dst_stride,
-                         uint8_t *p_src, intptr_t i_src_stride,
-                         int32_t m_vx, int32_t m_vy,
-                         int32_t i_width, int32_t i_height )
-{
-    int32_t i_d8x = m_vx & 0x07;
-    int32_t i_d8y = m_vy & 0x07;
-    int32_t i_coeff_horiz1 = ( 8 - i_d8x );
-    int32_t i_coeff_vert1 = ( 8 - i_d8y );
-    int32_t i_coeff_horiz0 = i_d8x;
-    int32_t i_coeff_vert0 = i_d8y;
-
-    p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;
-
-    if( 2 == i_width )
-    {
-        avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
-                                          p_dst_u, p_dst_v, i_dst_stride,
-                                          i_coeff_horiz0, i_coeff_horiz1,
-                                          i_coeff_vert0, i_coeff_vert1,
-                                          i_height );
-    }
-    else if( 4 == i_width )
-    {
-        avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
-                                          p_dst_u, p_dst_v, i_dst_stride,
-                                          i_coeff_horiz0, i_coeff_horiz1,
-                                          i_coeff_vert0, i_coeff_vert1,
-                                          i_height );
-    }
-    else if( 8 == i_width )
-    {
-        avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
-                                          p_dst_u, p_dst_v, i_dst_stride,
-                                          i_coeff_horiz0, i_coeff_horiz1,
-                                          i_coeff_vert0, i_coeff_vert1,
-                                          i_height );
-    }
-}
-
-void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
-                           uint8_t *p_dstc, uint8_t *p_src,
-                           intptr_t i_stride, int32_t i_width,
-                           int32_t i_height, int16_t *p_buf )
-{
-    for( int32_t i = 0; i < ( i_width / 16 ); i++ )
-    {
-        avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
-                             p_dst_v - 2, i_stride, i_height );
-        avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ) , i_stride,
-                              p_dstc, i_stride, i_height );
-        avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height );
-
-        p_src += 16;
-        p_dst_v += 16;
-        p_dsth += 16;
-        p_dstc += 16;
-    }
-}
-
-void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                                     uint8_t *p_src0, intptr_t i_src_stride0,
-                                     uint8_t *p_src1, intptr_t i_src_stride1,
-                                     int32_t i_width, int32_t i_height )
-{
-    plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1,
-                               p_dst, i_dst_stride, i_width, i_height );
-}
-
-void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
-                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
-                                       uint8_t *p_src, intptr_t i_src_stride,
-                                       int32_t i_width, int32_t i_height )
-{
-    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0,
-                                 p_dst1, i_dst_stride1, i_width, i_height );
-}
-
-void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
-                                           intptr_t i_dst_stride0,
-                                           uint8_t *p_dst1,
-                                           intptr_t i_dst_stride1,
-                                           uint8_t *p_dst2,
-                                           intptr_t i_dst_stride2,
-                                           uint8_t *p_src,
-                                           intptr_t i_src_stride,
-                                           int32_t i_src_width,
-                                           int32_t i_width,
-                                           int32_t i_height )
-{
-    if( 3 == i_src_width )
-    {
-        plane_copy_deinterleave_rgb_msa( p_src, i_src_stride,
-                                         p_dst0, i_dst_stride0,
-                                         p_dst1, i_dst_stride1,
-                                         p_dst2, i_dst_stride2,
-                                         i_width, i_height );
-    }
-    else if( 4 == i_src_width )
-    {
-        plane_copy_deinterleave_rgba_msa( p_src, i_src_stride,
-                                          p_dst0, i_dst_stride0,
-                                          p_dst1, i_dst_stride1,
-                                          p_dst2, i_dst_stride2,
-                                          i_width, i_height );
-    }
-}
-
-void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
-                                       uint8_t *p_src0, uint8_t *p_src1,
-                                       int32_t i_height )
-{
-    store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE,
-                                 p_dst, i_dst_stride, i_height );
-}
-
-void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
-                                             intptr_t i_src_stride,
-                                             int32_t i_height )
-{
-    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE,
-                                 ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE,
-                                 8, i_height );
-}
-
-void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
-                                             intptr_t i_src_stride,
-                                             int32_t i_height )
-{
-    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE,
-                                 ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE,
-                                 8, i_height );
-}
-
-void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
-                                      uint8_t *p_dst1, uint8_t *p_dst2,
-                                      uint8_t *p_dst3, intptr_t i_src_stride,
-                                      intptr_t i_dst_stride, int32_t i_width,
-                                      int32_t i_height )
-{
-    frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride,
-                                p_dst1, i_dst_stride, p_dst2, i_dst_stride,
-                                p_dst3, i_dst_stride, i_width, i_height );
-}
-
-uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
-                           uint8_t *p_src[4], intptr_t i_src_stride,
-                           int32_t m_vx, int32_t m_vy,
-                           int32_t i_width, int32_t i_height,
-                           const x264_weight_t *pWeight )
-{
-    int32_t i_qpel_idx, i_cnt, i_h4w;
-    int32_t i_offset;
-    uint8_t *p_src1, *src1_org;
-
-    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
-    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
-    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
-           ( 3 == ( m_vy & 3 ) ) * i_src_stride;
-
-    i_h4w = i_height - i_height%4;
-
-    if( i_qpel_idx & 5 )
-    {
-        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
-                          i_offset + ( 3 == ( m_vx & 3 ) );
-
-        if( 16 == i_width )
-        {
-            avg_src_width16_msa( p_src1, i_src_stride,
-                                 p_src2, i_src_stride,
-                                 p_dst, *p_dst_stride, i_h4w );
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                v16u8 src_vec1, src_vec2;
-                v16u8 dst_vec0;
-
-                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
-                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
-
-                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
-
-                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
-            }
-        }
-        else if( 20 == i_width )
-        {
-            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
-                                 p_dst, *p_dst_stride, i_h4w );
-            avg_src_width4_msa( p_src1 + 16, i_src_stride,
-                                p_src2 + 16, i_src_stride,
-                                p_dst + 16, *p_dst_stride, i_h4w );
-
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                v16u8 src_vec1, src_vec2, src_vec3, src_vec4;
-                v16u8 dst_vec0, dst_vec1;
-                uint32_t temp0;
-
-                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
-                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
-                src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 );
-                src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 );
-
-                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
-                dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 );
-
-                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 );
-
-                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
-                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
-            }
-        }
-        else if( 12 == i_width )
-        {
-            avg_src_width8_msa( p_src1, i_src_stride,
-                                p_src2, i_src_stride,
-                                p_dst, *p_dst_stride, i_h4w );
-            avg_src_width4_msa( p_src1 + 8, i_src_stride,
-                                p_src2 + 8, i_src_stride,
-                                p_dst + 8, *p_dst_stride, i_h4w );
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                uint32_t temp0;
-                uint64_t dst0;
-                v16u8 src_vec1, src_vec2;
-                v16u8 dst_vec0;
-
-                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
-                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
-
-                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
-
-                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
-                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 );
-
-                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
-                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 );
-            }
-        }
-        else if( 8 == i_width )
-        {
-            avg_src_width8_msa( p_src1, i_src_stride,
-                                p_src2, i_src_stride,
-                                p_dst, *p_dst_stride, i_h4w );
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                uint64_t dst0;
-                v16u8 src_vec1, src_vec2;
-                v16u8 dst_vec0;
-
-                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
-                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
-
-                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
-
-                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
-
-                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
-            }
-        }
-        else if( 4 == i_width )
-        {
-            avg_src_width4_msa( p_src1, i_src_stride,
-                                p_src2, i_src_stride,
-                                p_dst, *p_dst_stride, i_h4w );
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                uint32_t temp0;
-                v16u8 src_vec1, src_vec2;
-                v16u8 dst_vec0;
-
-                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
-                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
-
-                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
-                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 );
-
-                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
-            }
-        }
-
-        if( pWeight->weightfn )
-        {
-            int32_t i_log2_denom;
-            int32_t i_offset_val;
-            int32_t i_weight;
-
-            i_log2_denom = pWeight->i_denom;
-            i_offset_val = pWeight->i_offset;
-            i_weight = pWeight->i_scale;
-
-            if( 16 == i_width || 12 == i_width )
-            {
-                x264_mc_weight_w16_msa( p_dst, *p_dst_stride,
-                                        p_dst, *p_dst_stride,
-                                        pWeight, i_h4w );
-                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-                {
-                    v16i8 zero = {0};
-                    v16u8 src_vec0;
-                    v16i8 tmp0;
-                    v8u16 temp_vec0, temp_vec1;
-                    v8u16 wgt, offset_val0;
-                    v8i16 denom;
-
-                    i_offset_val <<= ( i_log2_denom );
-
-                    if( i_log2_denom )
-                    {
-                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                    }
-
-                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                    offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val );
-                    denom = __msa_fill_h( i_log2_denom );
-
-                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
-
-                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-
-                    temp_vec0 = wgt * temp_vec0;
-                    temp_vec1 = wgt * temp_vec1;
-
-                    temp_vec0 =
-                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                  ( v8i16 ) offset_val0 );
-                    temp_vec1 =
-                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
-                                                  ( v8i16 ) offset_val0 );
-
-                    temp_vec0 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                    temp_vec1 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
-
-                    temp_vec0 =
-                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                    temp_vec1 =
-                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
-
-                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
-
-                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
-                                          ( v16i8 ) temp_vec0 );
-                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
-                }
-            }
-            else if( 20 == i_width )
-            {
-                x264_mc_weight_w20_msa( p_dst, *p_dst_stride,
-                                        p_dst, *p_dst_stride,
-                                        pWeight, i_h4w );
-                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-                {
-                    uint32_t temp0;
-                    v16i8 zero = {0};
-                    v16u8 src_vec0;
-                    v16i8 tmp0;
-                    v8u16 temp_vec0, temp_vec1;
-                    v8u16 wgt;
-                    v8i16 denom, offset_val0;
-
-                    i_offset_val <<= ( i_log2_denom );
-
-                    if( i_log2_denom )
-                    {
-                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                    }
-
-                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                    offset_val0 = __msa_fill_h( i_offset_val );
-                    denom = __msa_fill_h( i_log2_denom );
-
-                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
-                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 );
-
-                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-
-                    temp_vec0 = wgt * temp_vec0;
-                    temp_vec1 = wgt * temp_vec1;
-
-                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                          offset_val0 );
-                    temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
-                                                          offset_val0 );
-
-                    temp_vec0 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                    temp_vec1 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
-
-                    temp_vec0 =
-                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                    temp_vec1 =
-                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
-
-                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
-
-                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
-                                          ( v16i8 ) temp_vec0 );
-                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
-
-                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
-                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-                    temp_vec0 = wgt * temp_vec0;
-
-                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                          offset_val0 );
-                    temp_vec0 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
-                                                       denom );
-                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-
-                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
-                                          ( v16i8 ) temp_vec0 );
-                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
-                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
-                }
-            }
-            else if( 8 == i_width )
-            {
-                x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
-                                       p_dst, *p_dst_stride,
-                                       pWeight, i_h4w );
-                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-                {
-                    uint64_t temp0;
-                    v16i8 zero = {0};
-                    v16u8 src_vec0;
-                    v16i8 tmp0;
-                    v8u16 temp_vec0;
-                    v8u16 wgt;
-                    v8i16 denom, offset_val0;
-
-                    i_offset_val = i_offset_val << i_log2_denom;
-
-                    if( i_log2_denom )
-                    {
-                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                    }
-
-                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                    offset_val0 = __msa_fill_h( i_offset_val );
-                    denom = __msa_fill_h( i_log2_denom );
-
-                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
-
-                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-                    temp_vec0 = wgt * temp_vec0;
-
-                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                          offset_val0 );
-                    temp_vec0 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                    temp_vec0 =
-                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-
-                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
-                                          ( v16i8 ) temp_vec0 );
-                    temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
-                    SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
-                }
-            }
-            else if( 4 == i_width )
-            {
-                x264_mc_weight_w4_msa( p_dst, *p_dst_stride,
-                                       p_dst, *p_dst_stride,
-                                       pWeight, i_h4w );
-                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-                {
-                    uint32_t temp0;
-                    v16i8 zero = {0};
-                    v16u8 src_vec0;
-                    v16i8 tmp0;
-                    v8u16 temp_vec0;
-                    v8u16 wgt;
-                    v8i16 denom, offset_val0;
-
-                    i_offset_val <<= ( i_log2_denom );
-
-                    if( i_log2_denom )
-                    {
-                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                    }
-
-                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                    offset_val0 = __msa_fill_h( i_offset_val );
-                    denom = __msa_fill_h( i_log2_denom );
-
-                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) );
-
-                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
-
-                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
-                                                        ( v16i8 ) src_vec0 );
-                    temp_vec0 = wgt * temp_vec0;
-
-                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                          offset_val0 );
-                    temp_vec0 =
-                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
-                                                       denom );
-                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-
-                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
-                                          ( v16i8 ) temp_vec0 );
-                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
-                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
-                }
-            }
-        }
-
-        return p_dst;
-    }
-    else if( pWeight->weightfn )
-    {
-        int32_t i_offset_val, i_log2_denom, i_weight;
-
-        i_log2_denom = pWeight->i_denom;
-        i_offset_val = pWeight->i_offset;
-        i_weight = pWeight->i_scale;
-
-        i_h4w = i_height - i_height%4;
-
-        src1_org = p_src1;
-
-        if( 16 == i_width || 12 == i_width )
-        {
-            x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
-                                    pWeight, i_h4w );
-            p_src1 = src1_org + i_h4w * i_src_stride;
-
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                v16i8 zero = {0};
-                v16u8 src_vec0;
-                v16i8 tmp0;
-                v8u16 temp_vec0, temp_vec1;
-                v8u16 wgt;
-                v8i16 denom, offset_val0;
-
-                i_offset_val <<= ( i_log2_denom );
-
-                if( i_log2_denom )
-                {
-                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                }
-
-                wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                offset_val0 = __msa_fill_h( i_offset_val );
-                denom = __msa_fill_h( i_log2_denom );
-
-                src_vec0 = LD_UB( p_src1 );
-                p_src1 += i_src_stride;
-
-                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
-                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
-
-                temp_vec0 = wgt * temp_vec0;
-                temp_vec1 = wgt * temp_vec1;
-
-                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                      offset_val0 );
-                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
-                                                      offset_val0 );
-
-                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
-
-                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
-
-                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
-
-                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
-                                      ( v16i8 ) temp_vec0 );
-                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
-            }
-        }
-        else if( 20 == i_width )
-        {
-            x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
-                                    pWeight, i_h4w );
-            p_src1 = src1_org + i_h4w * i_src_stride;
-
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                uint32_t temp0;
-                v16i8 zero = {0};
-                v16u8 src_vec0;
-                v16i8 tmp0;
-                v8u16 temp_vec0, temp_vec1;
-                v8u16 wgt;
-                v8i16 denom, offset_val0;
-
-                i_offset_val <<= ( i_log2_denom );
-
-                if( i_log2_denom )
-                {
-                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                }
-
-                wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                offset_val0 = __msa_fill_h( i_offset_val );
-                denom = __msa_fill_h( i_log2_denom );
-
-                src_vec0 = LD_UB( p_src1 );
-                temp0 = LW( p_src1 + 16 );
-                p_src1 += i_src_stride;
-
-                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
-                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
-
-                temp_vec0 = wgt * temp_vec0;
-                temp_vec1 = wgt * temp_vec1;
-
-                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                      offset_val0 );
-                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
-                                                      offset_val0 );
-
-                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
-
-                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
-
-                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
-
-                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
-                                      ( v16i8 ) temp_vec0 );
-                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
-
-                src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
-                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
-                temp_vec0 = wgt * temp_vec0;
-
-                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                      offset_val0 );
-                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-
-                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
-                                      ( v16i8 ) temp_vec0 );
-                temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
-                SW( temp0,p_dst + i_cnt * ( *p_dst_stride ) + 16 );
-            }
-        }
-        else if( 8 == i_width )
-        {
-            x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
-                                   pWeight, i_h4w );
-            p_src1 = src1_org + i_h4w * i_src_stride;
-
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                uint64_t u_temp0;
-                v16i8 zero = {0};
-                v16u8 src_vec0;
-                v16i8 tmp0;
-                v8u16 temp_vec0;
-                v8u16 wgt;
-                v8i16 denom, offset_val0;
-
-                i_offset_val = i_offset_val << i_log2_denom;
-
-                if( i_log2_denom )
-                {
-                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                }
-
-                wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                offset_val0 = __msa_fill_h( i_offset_val );
-                denom = __msa_fill_h( i_log2_denom );
-
-                src_vec0 = LD_UB( p_src1 );
-                p_src1 += i_src_stride;
-
-                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
-                temp_vec0 = wgt * temp_vec0;
-
-                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                      offset_val0 );
-                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-
-                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
-                                      ( v16i8 ) temp_vec0 );
-                u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
-                SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
-            }
-        }
-        else if( 4 == i_width )
-        {
-            x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
-                                   pWeight, i_h4w );
-            p_src1 = src1_org + i_h4w * i_src_stride;
-
-            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
-            {
-                uint32_t u_temp0;
-                v16i8 zero = {0};
-                v16u8 src_vec0;
-                v16i8 tmp0;
-                v8u16 temp_vec0;
-                v8u16 wgt;
-                v8i16 denom, offset_val0;
-
-                i_offset_val <<= ( i_log2_denom );
-
-                if( i_log2_denom )
-                {
-                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
-                }
-
-                wgt = ( v8u16 ) __msa_fill_h( i_weight );
-                offset_val0 = __msa_fill_h( i_offset_val );
-                denom = __msa_fill_h( i_log2_denom );
-
-                u_temp0 = LW( p_src1 );
-                p_src1 += i_src_stride;
-
-                src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 );
-
-                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
-                temp_vec0 = wgt * temp_vec0;
-
-                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
-                                                      offset_val0 );
-                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
-                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
-                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
-
-                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
-                                      ( v16i8 ) temp_vec0 );
-                u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
-                SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
-            }
-        }
-
-        return p_dst;
-    }
-    else
-    {
-        *p_dst_stride = i_src_stride;
-        return p_src1;
-    }
-}
-#endif // !HIGH_BIT_DEPTH
-
-void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf  )
-{
-#if !HIGH_BIT_DEPTH
-    if( cpu & X264_CPU_MSA )
-    {
-        pf->mc_luma = x264_mc_luma_msa;
-        pf->mc_chroma = x264_mc_chroma_msa;
-        pf->get_ref = x264_get_ref_msa;
-
-        pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_msa;
-        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa;
-        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa;
-        pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa;
-        pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa;
-        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa;
-        pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa;
-        pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa;
-        pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa;
-
-        pf->weight = x264_mc_weight_wtab_msa;
-        pf->offsetadd = x264_mc_weight_wtab_msa;
-        pf->offsetsub = x264_mc_weight_wtab_msa;
-
-        pf->copy_16x16_unaligned = x264_mc_copy_w16_msa;
-        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa;
-        pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa;
-        pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa;
-
-        pf->store_interleave_chroma = x264_store_interleave_chroma_msa;
-        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa;
-        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa;
-
-        pf->plane_copy_interleave = x264_plane_copy_interleave_msa;
-        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa;
-        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa;
-
-        pf->hpel_filter = x264_hpel_filter_msa;
-
-        pf->memcpy_aligned = memcpy;
-        pf->memzero_aligned = x264_memzero_aligned_msa;
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
-    }
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/mips/mc.h b/android/src/main/libenc/jni/libx264/common/mips/mc.h
deleted file mode 100755
index 692a180..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/mc.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*****************************************************************************
- * mc.h: msa motion compensation
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Neha Rana <neha.rana@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MIPS_MC_H
-#define X264_MIPS_MC_H
-
-void x264_mc_init_mips( int cpu, x264_mc_functions_t *pf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/pixel-c.c b/android/src/main/libenc/jni/libx264/common/mips/pixel-c.c
deleted file mode 100755
index d583d79..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/pixel-c.c
+++ /dev/null
@@ -1,1491 +0,0 @@
-/*****************************************************************************
- * pixel-c.c: msa pixel metrics
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macros.h"
-#include "pixel.h"
-#include "predict.h"
-
-#if !HIGH_BIT_DEPTH
-#define CALC_MSE_B( src, ref, var )                                    \
-{                                                                      \
-    v16u8 src_l0_m, src_l1_m;                                          \
-    v8i16 res_l0_m, res_l1_m;                                          \
-                                                                       \
-    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m );                       \
-    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m );             \
-    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var );  \
-}
-
-#define CALC_MSE_AVG_B( src, ref, var, sub )                           \
-{                                                                      \
-    v16u8 src_l0_m, src_l1_m;                                          \
-    v8i16 res_l0_m, res_l1_m;                                          \
-                                                                       \
-    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m );                       \
-    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m );             \
-    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var );  \
-                                                                       \
-    sub += res_l0_m + res_l1_m;                                        \
-}
-
-#define VARIANCE_WxH( sse, diff, shift )                                \
-    ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) )
-
-static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref, int32_t i_ref_stride,
-                                int32_t i_height )
-{
-    int32_t i_ht_cnt;
-    uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3;
-    v16u8 src = { 0 };
-    v16u8 ref = { 0 };
-    v16u8 diff;
-    v8u16 sad = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
-        p_src += ( 4 * i_src_stride );
-        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
-        p_ref += ( 4 * i_ref_stride );
-
-        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
-        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );
-
-        diff = __msa_asub_u_b( src, ref );
-        sad += __msa_hadd_u_h( diff, diff );
-    }
-
-    return ( HADD_UH_U32( sad ) );
-}
-
-static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref, int32_t i_ref_stride,
-                                int32_t i_height )
-{
-    int32_t i_ht_cnt;
-    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-    v8u16 sad = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += ( 4 * i_src_stride );
-        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
-        p_ref += ( 4 * i_ref_stride );
-
-        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
-                     src0, src1, ref0, ref1 );
-        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
-    }
-
-    return ( HADD_UH_U32( sad ) );
-}
-
-static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_ref, int32_t i_ref_stride,
-                                 int32_t i_height )
-{
-    int32_t i_ht_cnt;
-    v16u8 src0, src1, ref0, ref1;
-    v8u16 sad = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LD_UB2( p_src, i_src_stride, src0, src1 );
-        p_src += ( 2 * i_src_stride );
-        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
-        p_ref += ( 2 * i_ref_stride );
-        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
-
-        LD_UB2( p_src, i_src_stride, src0, src1 );
-        p_src += ( 2 * i_src_stride );
-        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
-        p_ref += ( 2 * i_ref_stride );
-        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
-    }
-
-    return ( HADD_UH_U32( sad ) );
-}
-
-static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref0, uint8_t *p_ref1,
-                                uint8_t *p_ref2, int32_t i_ref_stride,
-                                int32_t i_height, uint32_t *pu_sad_array )
-{
-    int32_t i_ht_cnt;
-    v16u8 src = { 0 };
-    uint32_t src0, src1, src2, src3, load0, load1, load2, load3;
-    v16u8 ref0 = { 0 };
-    v16u8 ref1 = { 0 };
-    v16u8 ref2 = { 0 };
-    v16u8 diff;
-    v8u16 sad0 = { 0 };
-    v8u16 sad1 = { 0 };
-    v8u16 sad2 = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
-        INSERT_W4_UB( src0, src1, src2, src3, src );
-        p_src += ( 4 * i_src_stride );
-
-        LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 );
-        INSERT_W4_UB( load0, load1, load2, load3, ref0 );
-        p_ref0 += ( 4 * i_ref_stride );
-
-        LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 );
-        INSERT_W4_UB( load0, load1, load2, load3, ref1 );
-        p_ref1 += ( 4 * i_ref_stride );
-
-        LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 );
-        INSERT_W4_UB( load0, load1, load2, load3, ref2 );
-        p_ref2 += ( 4 * i_ref_stride );
-
-        diff = __msa_asub_u_b( src, ref0 );
-        sad0 += __msa_hadd_u_h( diff, diff );
-
-        diff = __msa_asub_u_b( src, ref1 );
-        sad1 += __msa_hadd_u_h( diff, diff );
-
-        diff = __msa_asub_u_b( src, ref2 );
-        sad2 += __msa_hadd_u_h( diff, diff );
-    }
-
-    pu_sad_array[0] = HADD_UH_U32( sad0 );
-    pu_sad_array[1] = HADD_UH_U32( sad1 );
-    pu_sad_array[2] = HADD_UH_U32( sad2 );
-}
-
-static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref0, uint8_t *p_ref1,
-                                uint8_t *p_ref2, int32_t i_ref_stride,
-                                int32_t i_height, uint32_t *pu_sad_array )
-{
-    int32_t i_ht_cnt;
-    v16u8 src0, src1, src2, src3;
-    v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-    v8u16 sad0 = { 0 };
-    v8u16 sad1 = { 0 };
-    v8u16 sad2 = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += ( 4 * i_src_stride );
-        LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 );
-        p_ref0 += ( 4 * i_ref_stride );
-
-        PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22,
-                     src0, src1, ref0, ref1 );
-        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-
-        LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 );
-        p_ref1 += ( 4 * i_ref_stride );
-
-        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
-        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-
-        LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 );
-        p_ref2 += ( 4 * i_ref_stride );
-
-        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
-        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-    }
-
-    pu_sad_array[0] = HADD_UH_U32( sad0 );
-    pu_sad_array[1] = HADD_UH_U32( sad1 );
-    pu_sad_array[2] = HADD_UH_U32( sad2 );
-}
-
-static void sad_16width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_ref0, uint8_t *p_ref1,
-                                 uint8_t *p_ref2, int32_t i_ref_stride,
-                                 int32_t i_height, uint32_t *pu_sad_array )
-{
-    int32_t i_ht_cnt;
-    v16u8 src, ref;
-    v16u8 diff;
-    v8u16 sad0 = { 0 };
-    v8u16 sad1 = { 0 };
-    v8u16 sad2 = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
-    {
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-
-        ref = LD_UB( p_ref0 );
-        p_ref0 += i_ref_stride;
-        diff = __msa_asub_u_b( src, ref );
-        sad0 += __msa_hadd_u_h( diff, diff );
-
-        ref = LD_UB( p_ref1 );
-        p_ref1 += i_ref_stride;
-        diff = __msa_asub_u_b( src, ref );
-        sad1 += __msa_hadd_u_h( diff, diff );
-
-        ref = LD_UB( p_ref2 );
-        p_ref2 += i_ref_stride;
-        diff = __msa_asub_u_b( src, ref );
-        sad2 += __msa_hadd_u_h( diff, diff );
-
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-
-        ref = LD_UB( p_ref0 );
-        p_ref0 += i_ref_stride;
-        diff = __msa_asub_u_b( src, ref );
-        sad0 += __msa_hadd_u_h( diff, diff );
-
-        ref = LD_UB( p_ref1 );
-        p_ref1 += i_ref_stride;
-        diff = __msa_asub_u_b( src, ref );
-        sad1 += __msa_hadd_u_h( diff, diff );
-
-        ref = LD_UB( p_ref2 );
-        p_ref2 += i_ref_stride;
-        diff = __msa_asub_u_b( src, ref );
-        sad2 += __msa_hadd_u_h( diff, diff );
-    }
-
-    pu_sad_array[0] = HADD_UH_U32( sad0 );
-    pu_sad_array[1] = HADD_UH_U32( sad1 );
-    pu_sad_array[2] = HADD_UH_U32( sad2 );
-}
-
-static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_aref[], int32_t i_ref_stride,
-                                int32_t i_height, uint32_t *pu_sad_array )
-{
-    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
-    int32_t i_ht_cnt;
-    uint32_t src0, src1, src2, src3;
-    uint32_t ref0, ref1, ref2, ref3;
-    v16u8 src = { 0 };
-    v16u8 ref = { 0 };
-    v16u8 diff;
-    v8u16 sad0 = { 0 };
-    v8u16 sad1 = { 0 };
-    v8u16 sad2 = { 0 };
-    v8u16 sad3 = { 0 };
-
-    p_ref0 = p_aref[0];
-    p_ref1 = p_aref[1];
-    p_ref2 = p_aref[2];
-    p_ref3 = p_aref[3];
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
-        INSERT_W4_UB( src0, src1, src2, src3, src );
-        p_src += ( 4 * i_src_stride );
-
-        LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
-        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
-        p_ref0 += ( 4 * i_ref_stride );
-
-        diff = __msa_asub_u_b( src, ref );
-        sad0 += __msa_hadd_u_h( diff, diff );
-
-        LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 );
-        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
-        p_ref1 += ( 4 * i_ref_stride );
-
-        diff = __msa_asub_u_b( src, ref );
-        sad1 += __msa_hadd_u_h( diff, diff );
-
-        LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 );
-        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
-        p_ref2 += ( 4 * i_ref_stride );
-
-        diff = __msa_asub_u_b( src, ref );
-        sad2 += __msa_hadd_u_h( diff, diff );
-
-        LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 );
-        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
-        p_ref3 += ( 4 * i_ref_stride );
-
-        diff = __msa_asub_u_b( src, ref );
-        sad3 += __msa_hadd_u_h( diff, diff );
-    }
-
-    pu_sad_array[0] = HADD_UH_U32( sad0 );
-    pu_sad_array[1] = HADD_UH_U32( sad1 );
-    pu_sad_array[2] = HADD_UH_U32( sad2 );
-    pu_sad_array[3] = HADD_UH_U32( sad3 );
-}
-
-static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_aref[], int32_t i_ref_stride,
-                                int32_t i_height, uint32_t *pu_sad_array )
-{
-    int32_t i_ht_cnt;
-    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
-    v16u8 src0, src1, src2, src3;
-    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
-    v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
-    v8u16 sad0 = { 0 };
-    v8u16 sad1 = { 0 };
-    v8u16 sad2 = { 0 };
-    v8u16 sad3 = { 0 };
-
-    p_ref0 = p_aref[0];
-    p_ref1 = p_aref[1];
-    p_ref2 = p_aref[2];
-    p_ref3 = p_aref[3];
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += ( 4 * i_src_stride );
-        LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
-        p_ref0 += ( 4 * i_ref_stride );
-        LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 );
-        p_ref1 += ( 4 * i_ref_stride );
-        LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 );
-        p_ref2 += ( 4 * i_ref_stride );
-        LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 );
-        p_ref3 += ( 4 * i_ref_stride );
-
-        PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 );
-        PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 );
-        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-
-        PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 );
-        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-
-        PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 );
-        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-
-        PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 );
-        sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 );
-    }
-
-    pu_sad_array[0] = HADD_UH_U32( sad0 );
-    pu_sad_array[1] = HADD_UH_U32( sad1 );
-    pu_sad_array[2] = HADD_UH_U32( sad2 );
-    pu_sad_array[3] = HADD_UH_U32( sad3 );
-}
-
-static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_aref[], int32_t i_ref_stride,
-                                 int32_t i_height, uint32_t *pu_sad_array )
-{
-    int32_t i_ht_cnt;
-    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
-    v16u8 src, ref0, ref1, ref2, ref3, diff;
-    v8u16 sad0 = { 0 };
-    v8u16 sad1 = { 0 };
-    v8u16 sad2 = { 0 };
-    v8u16 sad3 = { 0 };
-
-    p_ref0 = p_aref[0];
-    p_ref1 = p_aref[1];
-    p_ref2 = p_aref[2];
-    p_ref3 = p_aref[3];
-
-    for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
-    {
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-        ref0 = LD_UB( p_ref0 );
-        p_ref0 += i_ref_stride;
-        ref1 = LD_UB( p_ref1 );
-        p_ref1 += i_ref_stride;
-        ref2 = LD_UB( p_ref2 );
-        p_ref2 += i_ref_stride;
-        ref3 = LD_UB( p_ref3 );
-        p_ref3 += i_ref_stride;
-
-        diff = __msa_asub_u_b( src, ref0 );
-        sad0 += __msa_hadd_u_h( diff, diff );
-        diff = __msa_asub_u_b( src, ref1 );
-        sad1 += __msa_hadd_u_h( diff, diff );
-        diff = __msa_asub_u_b( src, ref2 );
-        sad2 += __msa_hadd_u_h( diff, diff );
-        diff = __msa_asub_u_b( src, ref3 );
-        sad3 += __msa_hadd_u_h( diff, diff );
-
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-        ref0 = LD_UB( p_ref0 );
-        p_ref0 += i_ref_stride;
-        ref1 = LD_UB( p_ref1 );
-        p_ref1 += i_ref_stride;
-        ref2 = LD_UB( p_ref2 );
-        p_ref2 += i_ref_stride;
-        ref3 = LD_UB( p_ref3 );
-        p_ref3 += i_ref_stride;
-
-        diff = __msa_asub_u_b( src, ref0 );
-        sad0 += __msa_hadd_u_h( diff, diff );
-        diff = __msa_asub_u_b( src, ref1 );
-        sad1 += __msa_hadd_u_h( diff, diff );
-        diff = __msa_asub_u_b( src, ref2 );
-        sad2 += __msa_hadd_u_h( diff, diff );
-        diff = __msa_asub_u_b( src, ref3 );
-        sad3 += __msa_hadd_u_h( diff, diff );
-    }
-
-    pu_sad_array[0] = HADD_UH_U32( sad0 );
-    pu_sad_array[1] = HADD_UH_U32( sad1 );
-    pu_sad_array[2] = HADD_UH_U32( sad2 );
-    pu_sad_array[3] = HADD_UH_U32( sad3 );
-}
-
-static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride,
-                                          uint8_t i_height )
-{
-    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
-    v16i8 pix, zero = { 0 };
-    v8u16 add, pix_r, pix_l;
-    v4u32 sqr = { 0 };
-
-    for ( u_cnt = i_height; u_cnt--; )
-    {
-        pix = LD_SB( p_pix );
-        p_pix += i_stride;
-        add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix );
-        u_sum += HADD_UH_U32( add );
-        ILVRL_B2_UH( zero, pix, pix_r, pix_l );
-        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
-        sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l );
-    }
-
-    u_sqr_out = HADD_SW_S32( sqr );
-
-    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
-}
-
-static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride,
-                                         uint8_t i_height )
-{
-    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
-    v16i8 pix, zero = { 0 };
-    v8u16 add, pix_r;
-    v4u32 sqr = { 0 };
-
-    for ( u_cnt = i_height; u_cnt--; )
-    {
-        pix = LD_SB( p_pix );
-        p_pix += i_stride;
-        pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix );
-        add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r );
-        u_sum += HADD_UH_U32( add );
-        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
-    }
-
-    u_sqr_out = HADD_SW_S32( sqr );
-
-    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
-}
-
-static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                     uint8_t *p_ref, int32_t i_ref_stride,
-                                     int32_t i_height, int32_t *p_diff )
-{
-    int32_t i_ht_cnt;
-    uint32_t u_sse;
-    v16u8 src0, src1, src2, src3;
-    v16u8 ref0, ref1, ref2, ref3;
-    v8i16 avg = { 0 };
-    v4i32 vec, var = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += ( 4 * i_src_stride );
-        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
-        p_ref += ( 4 * i_ref_stride );
-
-        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
-                     src0, src1, ref0, ref1 );
-        CALC_MSE_AVG_B( src0, ref0, var, avg );
-        CALC_MSE_AVG_B( src1, ref1, var, avg );
-    }
-
-    vec = __msa_hadd_s_w( avg, avg );
-    *p_diff = HADD_SW_S32( vec );
-    u_sse = HADD_SW_S32( var );
-
-    return u_sse;
-}
-
-static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref, int32_t i_ref_stride,
-                                int32_t i_height )
-{
-    int32_t i_ht_cnt;
-    uint32_t u_sse;
-    uint32_t u_src0, u_src1, u_src2, u_src3;
-    uint32_t u_ref0, u_ref1, u_ref2, u_ref3;
-    v16u8 src = { 0 };
-    v16u8 ref = { 0 };
-    v4i32 var = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
-        p_src += ( 4 * i_src_stride );
-        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
-        p_ref += ( 4 * i_ref_stride );
-
-        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
-        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );
-        CALC_MSE_B( src, ref, var );
-    }
-
-    u_sse = HADD_SW_S32( var );
-
-    return u_sse;
-}
-
-static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                uint8_t *p_ref, int32_t i_ref_stride,
-                                int32_t i_height )
-{
-    int32_t i_ht_cnt;
-    uint32_t u_sse;
-    v16u8 src0, src1, src2, src3;
-    v16u8 ref0, ref1, ref2, ref3;
-    v4i32 var = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += ( 4 * i_src_stride );
-        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
-        p_ref += ( 4 * i_ref_stride );
-
-        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
-                     src0, src1, ref0, ref1 );
-        CALC_MSE_B( src0, ref0, var );
-        CALC_MSE_B( src1, ref1, var );
-    }
-
-    u_sse = HADD_SW_S32( var );
-
-    return u_sse;
-}
-
-static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                 uint8_t *p_ref, int32_t i_ref_stride,
-                                 int32_t i_height )
-{
-    int32_t i_ht_cnt;
-    uint32_t u_sse;
-    v16u8 src, ref;
-    v4i32 var = { 0 };
-
-    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
-    {
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-        ref = LD_UB( p_ref );
-        p_ref += i_ref_stride;
-        CALC_MSE_B( src, ref, var );
-
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-        ref = LD_UB( p_ref );
-        p_ref += i_ref_stride;
-        CALC_MSE_B( src, ref, var );
-
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-        ref = LD_UB( p_ref );
-        p_ref += i_ref_stride;
-        CALC_MSE_B( src, ref, var );
-
-        src = LD_UB( p_src );
-        p_src += i_src_stride;
-        ref = LD_UB( p_ref );
-        p_ref += i_ref_stride;
-        CALC_MSE_B( src, ref, var );
-    }
-
-    u_sse = HADD_SW_S32( var );
-
-    return u_sse;
-}
-
-static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride,
-                                 const uint8_t *p_ref, int32_t i_ref_stride,
-                                 int32_t pi_sum_array[2][4] )
-{
-    v16i8 zero = { 0 };
-    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-    v8u16 temp0, temp1, temp2, temp3;
-    v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-    v4u32 tmp0;
-    v4i32 tmp2, tmp3;
-
-    LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
-    p_src += ( 4 * i_src_stride );
-    LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
-    p_ref += ( 4 * i_ref_stride );
-
-    ILVR_D2_UB( src1, src0, src3, src2, src0, src2 );
-    ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 );
-    HADD_UB2_UH( src0, src2, temp0, temp1 );
-
-    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
-    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
-
-    pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 );
-    pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 );
-
-    HADD_UB2_UH( ref0, ref2, temp0, temp1 );
-
-    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
-    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
-
-    pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 );
-    pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 );
-
-    ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2,
-                vec4, vec6 );
-    ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3,
-                vec5, vec7 );
-
-    tmp0 = __msa_dotp_u_w( vec0, vec0 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 );
-
-    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
-    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
-    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
-    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );
-
-    pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 );
-    pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 );
-
-    tmp0 = __msa_dotp_u_w( vec4, vec0 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 );
-    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 );
-
-    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
-    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
-    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
-    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );
-
-    pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 );
-    pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 );
-}
-
-static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                      uint8_t *p_ref, int32_t i_ref_stride,
-                                      uint8_t i_height )
-{
-    int32_t cnt;
-    uint32_t u_sum = 0;
-    v16i8 src0, src1, src2, src3;
-    v16i8 ref0, ref1, ref2, ref3;
-    v8i16 zero = { 0 };
-    v8i16 diff0, diff1, diff2, diff3;
-    v8i16 temp0, temp1, temp2, temp3;
-
-    for ( cnt = i_height >> 2; cnt--; )
-    {
-        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += 4 * i_src_stride;
-        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
-        p_ref += 4 * i_ref_stride;
-
-        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
-                    diff0, diff1, diff2, diff3 );
-        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );
-        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
-                            diff0, diff1, diff2, diff3 );
-        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
-                            diff0, diff1, diff2, diff3 );
-        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-
-        diff0 = __msa_add_a_h( diff0, zero );
-        diff1 = __msa_add_a_h( diff1, zero );
-        diff2 = __msa_add_a_h( diff2, zero );
-        diff3 = __msa_add_a_h( diff3, zero );
-        diff0 = ( diff0 + diff1 + diff2 + diff3 );
-        diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 );
-        diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 );
-        u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 );
-    }
-
-    return ( u_sum >> 1 );
-}
-
-static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride,
-                                      uint8_t *p_ref, int32_t i_ref_stride,
-                                      uint8_t i_height )
-{
-    int32_t cnt;
-    uint32_t u_sum = 0;
-    v16i8 src0, src1, src2, src3;
-    v16i8 ref0, ref1, ref2, ref3;
-    v8i16 zero = { 0 };
-    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
-    v8i16 temp0, temp1, temp2, temp3;
-
-    for ( cnt = i_height >> 2; cnt--; )
-    {
-        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
-        p_src += 4 * i_src_stride;
-        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
-        p_ref += 4 * i_ref_stride;
-
-        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
-                    diff0, diff1, diff2, diff3 );
-        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );
-        TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3,
-                            diff0, diff2, diff4, diff6 );
-
-        diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 );
-        diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 );
-        diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 );
-        diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 );
-
-        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-        BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
-        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
-        TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6,
-                            diff7, diff0, diff1, diff2, diff3, diff4, diff5,
-                            diff6, diff7 );
-        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-
-        diff0 = __msa_add_a_h( diff0, zero );
-        diff1 = __msa_add_a_h( diff1, zero );
-        diff2 = __msa_add_a_h( diff2, zero );
-        diff3 = __msa_add_a_h( diff3, zero );
-        diff0 = ( diff0 + diff1 + diff2 + diff3 );
-        u_sum += HADD_UH_U32( diff0 );
-    }
-
-    return ( u_sum >> 1 );
-}
-
-static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
-                             uint8_t *p_ref, int32_t i_ref_stride )
-{
-    uint32_t u_sum = 0;
-    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
-    v8i16 zero = { 0 };
-    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
-    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
-    v8i16 temp0, temp1, temp2, temp3;
-
-    LD_SB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 );
-    LD_SB8( p_ref, i_ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 );
-    ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1,
-                sub2, sub3 );
-    ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5,
-               sub6, sub7 );
-    HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 );
-    HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 );
-    TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );
-    BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 );
-    BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 );
-    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
-    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
-                        diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 );
-    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
-
-    temp0 = diff0 + diff4;
-    temp1 = diff1 + diff5;
-    temp2 = diff2 + diff6;
-    temp3 = diff3 + diff7;
-
-    temp0 = __msa_add_a_h( temp0, zero );
-    temp1 = __msa_add_a_h( temp1, zero );
-    temp2 = __msa_add_a_h( temp2, zero );
-    temp3 = __msa_add_a_h( temp3, zero );
-
-    diff0 = temp0 + __msa_asub_s_h( diff0, diff4 );
-    diff1 = temp1 + __msa_asub_s_h( diff1, diff5 );
-    diff2 = temp2 + __msa_asub_s_h( diff2, diff6 );
-    diff3 = temp3 + __msa_asub_s_h( diff3, diff7 );
-    diff0 = ( diff0 + diff1 + diff2 + diff3 );
-
-    u_sum = HADD_UH_U32( diff0 );
-
-    return u_sum;
-}
-
-static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride )
-{
-    int16_t tmp0, tmp1, tmp2, tmp3;
-    uint32_t u_sum4 = 0, u_sum8 = 0, u_dc;
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-    v8i16 zero = { 0 };
-    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
-    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
-    v8i16 temp0, temp1, temp2, temp3;
-
-    LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 );
-
-    ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1,
-                diff2, diff3 );
-    ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5,
-                diff6, diff7 );
-    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
-                        diff4, diff5, diff6, diff7,
-                        diff0, diff1, diff2, diff3,
-                        diff4, diff5, diff6, diff7 );
-    BUTTERFLY_4( diff0, diff2, diff3, diff1,
-                 temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2,
-                 diff0, diff1, diff3, diff2 );
-    BUTTERFLY_4( diff4, diff6, diff7, diff5,
-                 temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2,
-                 diff4, diff5, diff7, diff6 );
-    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
-                        diff4, diff5, diff6, diff7,
-                        diff0, diff1, diff2, diff3,
-                        diff4, diff5, diff6, diff7 );
-    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
-
-    tmp0 = diff0[0];
-    tmp1 = diff0[4];
-    tmp2 = diff4[0];
-    tmp3 = diff4[4];
-
-    sub0 = __msa_add_a_h( diff0, zero );
-    sub1 = __msa_add_a_h( diff1, zero );
-    sub2 = __msa_add_a_h( diff2, zero );
-    sub3 = __msa_add_a_h( diff3, zero );
-    sub4 = __msa_add_a_h( diff4, zero );
-    sub5 = __msa_add_a_h( diff5, zero );
-    sub6 = __msa_add_a_h( diff6, zero );
-    sub7 = __msa_add_a_h( diff7, zero );
-
-    sub0 = ( sub0 + sub1 + sub2 + sub3 );
-    sub1 = ( sub4 + sub5 + sub6 + sub7 );
-    sub0 += sub1;
-
-    u_sum4 += HADD_UH_U32( sub0 );
-
-    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
-                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );
-
-    ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 );
-    ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 );
-
-    diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 );
-    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 );
-    diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 );
-    diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 );
-
-    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
-    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
-    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
-
-    sub0 = __msa_add_a_h( diff0, zero );
-    sub1 = __msa_add_a_h( diff1, zero );
-    sub2 = __msa_add_a_h( diff2, zero );
-    sub3 = __msa_add_a_h( diff3, zero );
-    sub4 = __msa_add_a_h( diff4, zero );
-    sub5 = __msa_add_a_h( diff5, zero );
-    sub6 = __msa_add_a_h( diff6, zero );
-    sub7 = __msa_add_a_h( diff7, zero );
-
-    sub0 = ( sub0 + sub1 + sub2 + sub3 );
-    sub1 = ( sub4 + sub5 + sub6 + sub7 );
-    sub0 += sub1;
-
-    u_sum8 += HADD_UH_U32( sub0 );
-
-    u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 );
-    u_sum4 = u_sum4 - u_dc;
-    u_sum8 = u_sum8 - u_dc;
-
-    return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4;
-}
-
-int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                  uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
-}
-
-int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
-}
-
-int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
-}
-
-int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
-}
-
-int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
-}
-
-int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
-}
-
-int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
-}
-
-int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
-}
-
-void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                  uint8_t *p_ref1, uint8_t *p_ref2,
-                                  uint8_t *p_ref3, intptr_t i_ref_stride,
-                                  int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
-                         ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 uint8_t *p_ref3, intptr_t i_ref_stride,
-                                 int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
-                         ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 uint8_t *p_ref3, intptr_t i_ref_stride,
-                                 int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
-                        ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
-                        ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
-                        ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
-                        ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] )
-{
-    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };
-
-    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
-                        ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                  uint8_t *p_ref1, uint8_t *p_ref2,
-                                  intptr_t i_ref_stride,
-                                  int32_t p_sad_array[3] )
-{
-    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                         i_ref_stride, 16, ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 intptr_t i_ref_stride,
-                                 int32_t p_sad_array[3] )
-{
-    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                         i_ref_stride, 8, ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 intptr_t i_ref_stride,
-                                 int32_t p_sad_array[3] )
-{
-    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                        i_ref_stride, 16, ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] )
-{
-    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] )
-{
-    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] )
-{
-    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
-}
-
-void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] )
-{
-    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
-                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
-}
-
-int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                  uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
-}
-
-int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
-}
-
-int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
-}
-
-int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
-}
-
-int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
-}
-
-int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
-}
-
-int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
-}
-
-int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride )
-{
-    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
-}
-
-void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                int32_t p_sad_array[3] )
-{
-    x264_intra_predict_vert_4x4_msa( p_dec );
-    p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-
-    x264_intra_predict_hor_4x4_msa( p_dec );
-    p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-
-    x264_intra_predict_dc_4x4_msa( p_dec );
-    p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-}
-
-void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                  int32_t p_sad_array[3] )
-{
-    x264_intra_predict_vert_16x16_msa( p_dec );
-    p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
-                                               p_enc, FENC_STRIDE );
-
-    x264_intra_predict_hor_16x16_msa( p_dec );
-    p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
-                                               p_enc, FENC_STRIDE );
-
-    x264_intra_predict_dc_16x16_msa( p_dec );
-    p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
-                                               p_enc, FENC_STRIDE );
-}
-
-void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
-                                int32_t p_sad_array[3] )
-{
-    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
-
-    x264_intra_predict_v_8x8_msa( pix, p_edge );
-    p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-
-    x264_intra_predict_h_8x8_msa( pix, p_edge );
-    p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-
-    x264_intra_predict_dc_8x8_msa( pix, p_edge );
-    p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-}
-
-void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                 int32_t p_sad_array[3] )
-{
-    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
-    p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-
-    x264_intra_predict_hor_8x8_msa( p_dec );
-    p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-
-    x264_intra_predict_vert_8x8_msa( p_dec );
-    p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
-                                             p_enc, FENC_STRIDE );
-}
-
-void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
-                               const uint8_t *p_pix2, intptr_t i_stride2,
-                               int32_t i_sums[2][4] )
-{
-    ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums );
-}
-
-uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    uint64_t u_sum;
-
-    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
-
-    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
-}
-
-uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    uint64_t u_sum;
-
-    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
-    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );
-
-    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
-}
-
-uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    uint64_t u_sum;
-
-    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
-    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );
-
-    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
-}
-
-uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    uint64_t u_sum;
-
-    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
-    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );
-    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );
-    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride );
-
-    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
-}
-
-int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
-}
-
-int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
-}
-
-int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                  uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
-}
-
-int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
-}
-
-int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
-}
-
-int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                  uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
-}
-
-int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                  uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    uint32_t u32Sum = 0;
-
-    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
-    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
-                                     p_pix2 + 8, i_stride2, 8 );
-
-    return u32Sum;
-}
-
-int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                   uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    uint32_t u32Sum = 0;
-
-    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
-    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
-                                     p_pix2 + 8, i_stride2, 16 );
-
-    return u32Sum;
-}
-
-int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 );
-
-    return ( i32Sum + 2 ) >> 2;
-}
-
-int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                   uint8_t *p_pix2, intptr_t i_stride2 )
-{
-    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) +
-                     sa8d_8x8_msa( p_pix1 + 8, i_stride,
-                                   p_pix2 + 8, i_stride2 ) +
-                     sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride,
-                                   p_pix2 + 8 * i_stride2, i_stride2 ) +
-                     sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride,
-                                   p_pix2 + 8 + 8 * i_stride2, i_stride2 );
-
-    return ( i32Sum + 2 ) >> 2;
-}
-
-void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                 int32_t p_sad_array[3] )
-{
-    x264_intra_predict_vert_4x4_msa( p_dec );
-    p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-
-    x264_intra_predict_hor_4x4_msa( p_dec );
-    p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-
-    x264_intra_predict_dc_4x4_msa( p_dec );
-    p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-}
-
-void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                   int32_t p_sad_array[3] )
-{
-    x264_intra_predict_vert_16x16_msa( p_dec );
-    p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
-                                                p_enc, FENC_STRIDE );
-
-    x264_intra_predict_hor_16x16_msa( p_dec );
-    p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
-                                                p_enc, FENC_STRIDE );
-
-    x264_intra_predict_dc_16x16_msa( p_dec );
-    p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
-                                                p_enc, FENC_STRIDE );
-}
-
-void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
-                                 int32_t p_sad_array[3] )
-{
-    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
-
-    x264_intra_predict_v_8x8_msa( pix, p_edge );
-    p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-
-    x264_intra_predict_h_8x8_msa( pix, p_edge );
-    p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-
-    x264_intra_predict_dc_8x8_msa( pix, p_edge );
-    p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-}
-
-void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                  int32_t p_sad_array[3] )
-{
-    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
-    p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-
-    x264_intra_predict_hor_8x8_msa( p_dec );
-    p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-
-    x264_intra_predict_vert_8x8_msa( p_dec );
-    p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
-                                              p_enc, FENC_STRIDE );
-}
-
-uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    return avc_pixel_var16width_msa( p_pix, i_stride, 16 );
-}
-
-uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    return avc_pixel_var8width_msa( p_pix, i_stride, 16 );
-}
-
-uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
-{
-    return avc_pixel_var8width_msa( p_pix, i_stride, 8 );
-}
-
-int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
-                                  uint8_t *p_pix2, intptr_t i_stride2,
-                                  int32_t *p_ssd )
-{
-    int32_t i_var = 0, i_diff = 0, i_sqr = 0;
-
-    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16,
-                                 &i_diff );
-    i_var = VARIANCE_WxH( i_sqr, i_diff, 7 );
-    *p_ssd = i_sqr;
-
-    return i_var;
-}
-
-int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
-                                 uint8_t *p_pix2, intptr_t i_stride2,
-                                 int32_t *p_ssd )
-{
-    int32_t i_var = 0, i_diff = 0, i_sqr = 0;
-
-    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1,
-                                 p_pix2, i_stride2, 8, &i_diff );
-    i_var = VARIANCE_WxH( i_sqr, i_diff, 6 );
-    *p_ssd = i_sqr;
-
-    return i_var;
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/pixel.h b/android/src/main/libenc/jni/libx264/common/mips/pixel.h
deleted file mode 100755
index 33ffd48..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/pixel.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*****************************************************************************
- * pixel.h: msa pixel metrics
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MIPS_SAD_H
-#define X264_MIPS_SAD_H
-
-int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                  uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                  uint8_t *p_ref1, uint8_t *p_ref2,
-                                  uint8_t *p_ref3, intptr_t i_ref_stride,
-                                  int32_t p_sad_array[4] );
-void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 uint8_t *p_ref3, intptr_t i_ref_stride,
-                                 int32_t p_sad_array[4] );
-void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 uint8_t *p_ref3, intptr_t i_ref_stride,
-                                 int32_t p_sad_array[4] );
-void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] );
-void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] );
-void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] );
-void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                uint8_t *p_ref3, intptr_t i_ref_stride,
-                                int32_t p_sad_array[4] );
-void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                  uint8_t *p_ref1, uint8_t *p_ref2,
-                                  intptr_t i_ref_stride,
-                                  int32_t p_sad_array[3] );
-void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 intptr_t i_ref_stride,
-                                 int32_t p_sad_array[3] );
-void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                 uint8_t *p_ref1, uint8_t *p_ref2,
-                                 intptr_t i_ref_stride,
-                                 int32_t p_sad_array[3] );
-void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] );
-void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] );
-void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] );
-void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
-                                uint8_t *p_ref1, uint8_t *p_ref2,
-                                intptr_t i_ref_stride,
-                                int32_t p_sad_array[3] );
-int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                  uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                 uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
-                                uint8_t *p_ref, intptr_t i_ref_stride );
-void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                int32_t p_sad_array[3] );
-void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                  int32_t p_sad_array[3] );
-void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
-                                int32_t p_sad_array[3] );
-void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                 int32_t p_sad_array[3] );
-void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
-                               const uint8_t *p_pix2, intptr_t i_stride2,
-                               int32_t i_sums[2][4] );
-uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
-uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
-uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride );
-uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
-int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                  uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                  uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                  uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                   uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                 uint8_t *p_pix2, intptr_t i_stride2 );
-int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
-                                   uint8_t *p_pix2, intptr_t i_stride2 );
-void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                 int32_t p_sad_array[3] );
-void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                   int32_t p_sad_array[3] );
-void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
-                                 int32_t p_sad_array[3] );
-void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
-                                  int32_t p_sad_array[3] );
-uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
-uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
-uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
-int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
-                                  uint8_t *p_pix2, intptr_t i_stride2,
-                                  int32_t *p_ssd );
-int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
-                                 uint8_t *p_pix2, intptr_t i_stride2,
-                                 int32_t *p_ssd );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/predict-c.c b/android/src/main/libenc/jni/libx264/common/mips/predict-c.c
deleted file mode 100755
index d8183d6..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/predict-c.c
+++ /dev/null
@@ -1,607 +0,0 @@
-/*****************************************************************************
- * predict-c.c: msa intra prediction
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macros.h"
-
-#if !HIGH_BIT_DEPTH
-static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
-                                        int32_t i_dst_stride )
-{
-    uint32_t u_src_data;
-
-    u_src_data = LW( p_src );
-
-    SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride );
-}
-
-static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
-                                        int32_t i_dst_stride )
-{
-    uint64_t u_out;
-
-    u_out = LD( p_src );
-
-    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
-    p_dst += ( 4 * i_dst_stride );
-    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
-}
-
-static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
-                                          int32_t i_dst_stride )
-{
-    v16u8 src0 = LD_UB( p_src );
-
-    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
-            i_dst_stride );
-    p_dst += ( 8 * i_dst_stride );
-    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
-            i_dst_stride );
-}
-
-static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
-                                         uint8_t *p_dst, int32_t i_dst_stride )
-{
-    uint32_t u_out0, u_out1, u_out2, u_out3;
-
-    u_out0 = p_src[0 * i_src_stride] * 0x01010101;
-    u_out1 = p_src[1 * i_src_stride] * 0x01010101;
-    u_out2 = p_src[2 * i_src_stride] * 0x01010101;
-    u_out3 = p_src[3 * i_src_stride] * 0x01010101;
-
-    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-}
-
-static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
-                                         uint8_t *p_dst, int32_t i_dst_stride )
-{
-    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
-
-    u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull;
-    u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull;
-    u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull;
-    u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull;
-    u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull;
-    u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull;
-    u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull;
-    u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull;
-
-    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-    p_dst += ( 4 * i_dst_stride );
-    SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
-}
-
-static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
-                                           uint8_t *p_dst,
-                                           int32_t i_dst_stride )
-{
-    uint32_t u_row;
-    uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
-    v16u8 src0, src1, src2, src3;
-
-    for ( u_row = 4; u_row--; )
-    {
-        u_inp0 = p_src[0];
-        p_src += i_src_stride;
-        u_inp1 = p_src[0];
-        p_src += i_src_stride;
-        u_inp2 = p_src[0];
-        p_src += i_src_stride;
-        u_inp3 = p_src[0];
-        p_src += i_src_stride;
-
-        src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
-        src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
-        src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
-        src3 = ( v16u8 ) __msa_fill_b( u_inp3 );
-
-        ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
-        p_dst += ( 4 * i_dst_stride );
-    }
-}
-
-static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
-                                      int32_t i_src_stride_left,
-                                      uint8_t *p_dst, int32_t i_dst_stride,
-                                      uint8_t is_above, uint8_t is_left )
-{
-    uint32_t u_row;
-    uint32_t u_out, u_addition = 0;
-    v16u8 src_above, store;
-    v8u16 sum_above;
-    v4u32 sum;
-
-    if ( is_left && is_above )
-    {
-        src_above = LD_UB( p_src_top );
-
-        sum_above = __msa_hadd_u_h( src_above, src_above );
-        sum = __msa_hadd_u_w( sum_above, sum_above );
-        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
-
-        for ( u_row = 0; u_row < 4; u_row++ )
-        {
-            u_addition += p_src_left[u_row * i_src_stride_left];
-        }
-
-        u_addition = ( u_addition + 4 ) >> 3;
-        store = ( v16u8 ) __msa_fill_b( u_addition );
-    }
-    else if ( is_left )
-    {
-        for ( u_row = 0; u_row < 4; u_row++ )
-        {
-            u_addition += p_src_left[u_row * i_src_stride_left];
-        }
-
-        u_addition = ( u_addition + 2 ) >> 2;
-        store = ( v16u8 ) __msa_fill_b( u_addition );
-    }
-    else if ( is_above )
-    {
-        src_above = LD_UB( p_src_top );
-
-        sum_above = __msa_hadd_u_h( src_above, src_above );
-        sum = __msa_hadd_u_w( sum_above, sum_above );
-        sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
-        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
-    }
-    else
-    {
-        store = ( v16u8 ) __msa_ldi_b( 128 );
-    }
-
-    u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );
-
-    SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
-}
-
-static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
-                                      uint8_t *p_dst, int32_t i_dst_stride )
-{
-    uint64_t u_val0, u_val1;
-    v16i8 store;
-    v16u8 src = { 0 };
-    v8u16 sum_h;
-    v4u32 sum_w;
-    v2u64 sum_d;
-
-    u_val0 = LD( p_src_top );
-    u_val1 = LD( p_src_left );
-    INSERT_D2_UB( u_val0, u_val1, src );
-    sum_h = __msa_hadd_u_h( src, src );
-    sum_w = __msa_hadd_u_w( sum_h, sum_h );
-    sum_d = __msa_hadd_u_d( sum_w, sum_w );
-    sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
-    sum_d = __msa_hadd_u_d( sum_w, sum_w );
-    sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
-    store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
-    u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );
-
-    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
-    p_dst += ( 4 * i_dst_stride );
-    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
-}
-
-static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
-                                        int32_t i_src_stride_left,
-                                        uint8_t *p_dst, int32_t i_dst_stride,
-                                        uint8_t is_above, uint8_t is_left )
-{
-    uint32_t u_row;
-    uint32_t u_addition = 0;
-    v16u8 src_above, store;
-    v8u16 sum_above;
-    v4u32 sum_top;
-    v2u64 sum;
-
-    if ( is_left && is_above )
-    {
-        src_above = LD_UB( p_src_top );
-
-        sum_above = __msa_hadd_u_h( src_above, src_above );
-        sum_top = __msa_hadd_u_w( sum_above, sum_above );
-        sum = __msa_hadd_u_d( sum_top, sum_top );
-        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
-        sum = __msa_hadd_u_d( sum_top, sum_top );
-        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
-
-        for ( u_row = 0; u_row < 16; u_row++ )
-        {
-            u_addition += p_src_left[u_row * i_src_stride_left];
-        }
-
-        u_addition = ( u_addition + 16 ) >> 5;
-        store = ( v16u8 ) __msa_fill_b( u_addition );
-    }
-    else if ( is_left )
-    {
-        for ( u_row = 0; u_row < 16; u_row++ )
-        {
-            u_addition += p_src_left[u_row * i_src_stride_left];
-        }
-
-        u_addition = ( u_addition + 8 ) >> 4;
-        store = ( v16u8 ) __msa_fill_b( u_addition );
-    }
-    else if ( is_above )
-    {
-        src_above = LD_UB( p_src_top );
-
-        sum_above = __msa_hadd_u_h( src_above, src_above );
-        sum_top = __msa_hadd_u_w( sum_above, sum_above );
-        sum = __msa_hadd_u_d( sum_top, sum_top );
-        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
-        sum = __msa_hadd_u_d( sum_top, sum_top );
-        sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
-        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
-    }
-    else
-    {
-        store = ( v16u8 ) __msa_ldi_b( 128 );
-    }
-
-    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
-            i_dst_stride );
-    p_dst += ( 8 * i_dst_stride );
-    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
-            i_dst_stride );
-}
-
-static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
-{
-    uint8_t u_lpcnt;
-    int32_t i_res, i_res0, i_res1, i_res2, i_res3;
-    uint64_t u_out0, u_out1;
-    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
-    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
-    v4i32 int_multiplier = { 0, 1, 2, 3 };
-    v16u8 p_src_top;
-    v8i16 vec9, vec10, vec11;
-    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
-    v2i64 sum;
-
-    p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
-    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
-                                        ( v16i8 ) p_src_top );
-
-    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
-    vec9 *= short_multiplier;
-    vec8 = __msa_hadd_s_w( vec9, vec9 );
-    sum = __msa_hadd_s_d( vec8, vec8 );
-
-    i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );
-
-    i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
-             2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
-             3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
-             4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );
-
-    i_res0 *= 17;
-    i_res1 *= 17;
-    i_res0 = ( i_res0 + 16 ) >> 5;
-    i_res1 = ( i_res1 + 16 ) >> 5;
-
-    i_res3 = 3 * ( i_res0 + i_res1 );
-    i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
-    i_res = i_res2 - i_res3;
-
-    vec8 = __msa_fill_w( i_res0 );
-    vec4 = __msa_fill_w( i_res );
-    vec2 = __msa_fill_w( i_res1 );
-    vec5 = vec8 * int_multiplier;
-    vec3 = vec8 * 4;
-
-    for ( u_lpcnt = 4; u_lpcnt--; )
-    {
-        vec0 = vec5;
-        vec0 += vec4;
-        vec1 = vec0 + vec3;
-        vec6 = vec5;
-        vec4 += vec2;
-        vec6 += vec4;
-        vec7 = vec6 + vec3;
-
-        SRA_4V( vec0, vec1, vec6, vec7, 5 );
-        PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
-        CLIP_SH2_0_255( vec10, vec11 );
-        PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );
-
-        u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
-        u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
-        SD( u_out0, p_src );
-        p_src += i_stride;
-        SD( u_out1, p_src );
-        p_src += i_stride;
-
-        vec4 += vec2;
-    }
-}
-
-static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
-{
-    uint8_t u_lpcnt;
-    int32_t i_res0, i_res1, i_res2, i_res3;
-    uint64_t u_load0, u_load1;
-    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
-    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
-    v4i32 int_multiplier = { 0, 1, 2, 3 };
-    v16u8 p_src_top = { 0 };
-    v8i16 vec9, vec10;
-    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
-
-    u_load0 = LD( p_src - ( i_stride + 1 ) );
-    u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );
-
-    INSERT_D2_UB( u_load0, u_load1, p_src_top );
-
-    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
-                                        ( v16i8 ) p_src_top );
-
-    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
-    vec9 *= short_multiplier;
-    vec8 = __msa_hadd_s_w( vec9, vec9 );
-    res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );
-
-    i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );
-
-    i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
-             2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
-             3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
-             4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
-             5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
-             6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
-             7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
-             8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );
-
-    i_res0 *= 5;
-    i_res1 *= 5;
-    i_res0 = ( i_res0 + 32 ) >> 6;
-    i_res1 = ( i_res1 + 32 ) >> 6;
-
-    i_res3 = 7 * ( i_res0 + i_res1 );
-    i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
-    i_res2 -= i_res3;
-
-    vec8 = __msa_fill_w( i_res0 );
-    vec4 = __msa_fill_w( i_res2 );
-    vec5 = __msa_fill_w( i_res1 );
-    vec6 = vec8 * 4;
-    vec7 = vec8 * int_multiplier;
-
-    for ( u_lpcnt = 16; u_lpcnt--; )
-    {
-        vec0 = vec7;
-        vec0 += vec4;
-        vec1 = vec0 + vec6;
-        vec2 = vec1 + vec6;
-        vec3 = vec2 + vec6;
-
-        SRA_4V( vec0, vec1, vec2, vec3, 5 );
-        PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
-        CLIP_SH2_0_255( vec9, vec10 );
-        PCKEV_ST_SB( vec9, vec10, p_src );
-        p_src += i_stride;
-
-        vec4 += vec5;
-    }
-}
-
-static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
-{
-    uint8_t u_lp_cnt;
-    uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
-    uint32_t u_out0, u_out1, u_out2, u_out3;
-    v16u8 p_src_top;
-    v8u16 add;
-    v4u32 sum;
-
-    p_src_top = LD_UB( p_src - i_stride );
-    add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
-    sum = __msa_hadd_u_w( add, add );
-    u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
-    u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );
-
-    for ( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
-    {
-        u_src0 += p_src[u_lp_cnt * i_stride - 1];
-        u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
-    }
-
-    u_src0 = ( u_src0 + 4 ) >> 3;
-    u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
-    u_src1 = ( u_src1 + 2 ) >> 2;
-    u_src2 = ( u_src2 + 2 ) >> 2;
-
-    u_out0 = u_src0 * 0x01010101;
-    u_out1 = u_src1 * 0x01010101;
-    u_out2 = u_src2 * 0x01010101;
-    u_out3 = u_src3 * 0x01010101;
-
-    for ( u_lp_cnt = 4; u_lp_cnt--; )
-    {
-        SW( u_out0, p_src );
-        SW( u_out1, ( p_src + 4 ) );
-        SW( u_out2, ( p_src + 4 * i_stride ) );
-        SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
-        p_src += i_stride;
-    }
-}
-
-static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
-                                       int32_t i_dst_stride )
-{
-    uint8_t u_src_val = p_src[15];
-    uint64_t u_out0, u_out1, u_out2, u_out3;
-    v16u8 src, vec4, vec5, res0;
-    v8u16 vec0, vec1, vec2, vec3;
-    v2i64 res1, res2, res3;
-
-    src = LD_UB( p_src );
-
-    vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
-    vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
-    vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
-    ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
-    ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
-    HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );
-
-    vec0 += vec1;
-    vec2 += vec3;
-    vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
-    vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );
-
-    res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
-    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
-    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
-    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
-
-    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
-    u_out1 = __msa_copy_u_d( res1, 0 );
-    u_out2 = __msa_copy_u_d( res2, 0 );
-    u_out3 = __msa_copy_u_d( res3, 0 );
-    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-    p_dst += ( 4 * i_dst_stride );
-
-    res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
-    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
-    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
-    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
-
-    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
-    u_out1 = __msa_copy_u_d( res1, 0 );
-    u_out2 = __msa_copy_u_d( res2, 0 );
-    u_out3 = __msa_copy_u_d( res3, 0 );
-    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
-}
-
-static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
-                                           int32_t i_dst_stride )
-{
-    v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );
-
-    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
-    p_dst += ( 8 * i_dst_stride );
-    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
-}
-
-void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
-                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
-}
-
-void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
-                                FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
-}
-
-void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
-                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
-}
-
-void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
-                                   p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
-{
-    intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
-{
-    intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
-{
-    intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
-                                 p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
-{
-    intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
-{
-    intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
-{
-    intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
-{
-    intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
-                              p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
-{
-    intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
-{
-    intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
-{
-    intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
-                              FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
-}
-
-void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
-{
-    intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
-                                 p_src, FDEC_STRIDE );
-}
-
-void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
-{
-    intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/predict.h b/android/src/main/libenc/jni/libx264/common/mips/predict.h
deleted file mode 100755
index 36b0841..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/predict.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*****************************************************************************
- * predict.h: msa intra prediction
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Rishikesh More <rishikesh.more@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MIPS_PREDICT_H
-#define X264_MIPS_PREDICT_H
-
-void x264_intra_predict_dc_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_hor_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_vert_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_plane_16x16_msa( uint8_t *p_src );
-void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src );
-void x264_intra_predict_hor_8x8_msa( uint8_t *p_src );
-void x264_intra_predict_vert_8x8_msa( uint8_t *p_src );
-void x264_intra_predict_plane_8x8_msa( uint8_t *p_src );
-void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
-void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
-void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
-void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
-void x264_intra_predict_dc_4x4_msa( uint8_t *p_src );
-void x264_intra_predict_hor_4x4_msa( uint8_t *p_src );
-void x264_intra_predict_vert_4x4_msa( uint8_t *p_src );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/quant-c.c b/android/src/main/libenc/jni/libx264/common/mips/quant-c.c
deleted file mode 100755
index 2f7c5bc..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/quant-c.c
+++ /dev/null
@@ -1,630 +0,0 @@
-/*****************************************************************************
- * quant-c.c: msa quantization and level-run
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Rishikesh More <rishikesh.more@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macros.h"
-
-#if !HIGH_BIT_DEPTH
-static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
-                                 int32_t i_qp )
-{
-    const int32_t i_mf = i_qp % 6;
-    const int32_t q_bits = i_qp / 6 - 4;
-    v8i16 dct0, dct1;
-    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
-
-    LD_SH2( p_dct, 8, dct0, dct1 );
-
-    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
-    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
-
-    if ( q_bits >= 0 )
-    {
-        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;
-
-        q_bits_vec = __msa_fill_h( q_bits );
-
-        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
-                     dequant_mf_h0, dequant_mf_h1 );
-
-        dct0 *= dequant_mf_h0;
-        dct1 *= dequant_mf_h1;
-        dct0 <<= q_bits_vec;
-        dct1 <<= q_bits_vec;
-        ST_SH2( dct0, dct1, p_dct, 8 );
-    }
-    else
-    {
-        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
-        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
-        v4i32 q_bits_vec, q_bits_vec_add;
-
-        q_bits_vec_add = __msa_fill_w( q_bits_add );
-        q_bits_vec = __msa_fill_w( -q_bits );
-
-        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-
-        dct_signed_w0 *= dequant_m_f0;
-        dct_signed_w1 *= dequant_m_f1;
-        dct_signed_w2 *= dequant_m_f2;
-        dct_signed_w3 *= dequant_m_f3;
-        dct_signed_w0 += q_bits_vec_add;
-        dct_signed_w1 += q_bits_vec_add;
-        dct_signed_w2 += q_bits_vec_add;
-        dct_signed_w3 += q_bits_vec_add;
-
-        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
-                q_bits_vec );
-        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
-                     dct0, dct1 );
-        ST_SH2( dct0, dct1, p_dct, 8 );
-    }
-}
-
-static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
-                                 int32_t i_qp )
-{
-    const int32_t i_mf = i_qp % 6;
-    const int32_t q_bits = i_qp / 6 - 6;
-    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
-    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
-    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
-    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
-    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;
-
-    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );
-
-    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
-    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
-    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
-    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
-    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
-    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
-    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
-    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );
-
-    if ( q_bits >= 0 )
-    {
-        v8i16 q_bits_vec;
-        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
-        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;
-
-        q_bits_vec = __msa_fill_h( q_bits );
-
-        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
-                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
-                     dequant_mf_h0, dequant_mf_h1,
-                     dequant_mf_h2, dequant_mf_h3 );
-        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
-                     dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
-                     dequant_mf_h4, dequant_mf_h5,
-                     dequant_mf_h6, dequant_mf_h7 );
-
-        dct0 *= dequant_mf_h0;
-        dct1 *= dequant_mf_h1;
-        dct2 *= dequant_mf_h2;
-        dct3 *= dequant_mf_h3;
-        dct4 *= dequant_mf_h4;
-        dct5 *= dequant_mf_h5;
-        dct6 *= dequant_mf_h6;
-        dct7 *= dequant_mf_h7;
-
-        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
-        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );
-
-        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
-    }
-    else
-    {
-        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
-        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
-        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
-        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
-        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
-        v4i32 q_bits_vec, q_bits_vec_add;
-
-        q_bits_vec_add = __msa_fill_w( q_bits_add );
-        q_bits_vec = __msa_fill_w( -q_bits );
-
-        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
-        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
-        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
-        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
-        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
-        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );
-
-        dct_signed_w0 *= dequant_m_f0;
-        dct_signed_w1 *= dequant_m_f1;
-        dct_signed_w2 *= dequant_m_f2;
-        dct_signed_w3 *= dequant_m_f3;
-        dct_signed_w4 *= dequant_m_f4;
-        dct_signed_w5 *= dequant_m_f5;
-        dct_signed_w6 *= dequant_m_f6;
-        dct_signed_w7 *= dequant_m_f7;
-        dct_signed_w8 *= dequant_m_f8;
-        dct_signed_w9 *= dequant_m_f9;
-        dct_signed_w10 *= dequant_m_f10;
-        dct_signed_w11 *= dequant_m_f11;
-        dct_signed_w12 *= dequant_m_f12;
-        dct_signed_w13 *= dequant_m_f13;
-        dct_signed_w14 *= dequant_m_f14;
-        dct_signed_w15 *= dequant_m_f15;
-
-        dct_signed_w0 += q_bits_vec_add;
-        dct_signed_w1 += q_bits_vec_add;
-        dct_signed_w2 += q_bits_vec_add;
-        dct_signed_w3 += q_bits_vec_add;
-        dct_signed_w4 += q_bits_vec_add;
-        dct_signed_w5 += q_bits_vec_add;
-        dct_signed_w6 += q_bits_vec_add;
-        dct_signed_w7 += q_bits_vec_add;
-        dct_signed_w8 += q_bits_vec_add;
-        dct_signed_w9 += q_bits_vec_add;
-        dct_signed_w10 += q_bits_vec_add;
-        dct_signed_w11 += q_bits_vec_add;
-        dct_signed_w12 += q_bits_vec_add;
-        dct_signed_w13 += q_bits_vec_add;
-        dct_signed_w14 += q_bits_vec_add;
-        dct_signed_w15 += q_bits_vec_add;
-
-        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
-                q_bits_vec );
-        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
-                q_bits_vec );
-        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
-                q_bits_vec );
-        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
-                q_bits_vec );
-        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
-                     dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
-                     dct0, dct1, dct2, dct3 );
-        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
-                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
-                     dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
-        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
-    }
-}
-
-static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
-                                    int32_t pi_dequant_mf[6][16],
-                                    int32_t i_qp )
-{
-    const int32_t q_bits = i_qp / 6 - 6;
-    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
-    v8i16 dct0, dct1, dequant_mf_h;
-
-    LD_SH2( p_dct, 8, dct0, dct1 );
-
-    if ( q_bits >= 0 )
-    {
-        i_dmf <<= q_bits;
-
-        dequant_mf_h = __msa_fill_h( i_dmf );
-        dct0 = dct0 * dequant_mf_h;
-        dct1 = dct1 * dequant_mf_h;
-
-        ST_SH2( dct0, dct1, p_dct, 8 );
-    }
-    else
-    {
-        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
-        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
-        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
-
-        q_bits_vec_add = __msa_fill_w( q_bits_add );
-        q_bits_vec = __msa_fill_w( -q_bits );
-
-        dequant_m_f = __msa_fill_w( i_dmf );
-
-        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-
-        dct_signed_w0 *= dequant_m_f;
-        dct_signed_w1 *= dequant_m_f;
-        dct_signed_w2 *= dequant_m_f;
-        dct_signed_w3 *= dequant_m_f;
-
-        dct_signed_w0 += q_bits_vec_add;
-        dct_signed_w1 += q_bits_vec_add;
-        dct_signed_w2 += q_bits_vec_add;
-        dct_signed_w3 += q_bits_vec_add;
-
-        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
-                q_bits_vec );
-        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
-                     dct0, dct1 );
-        ST_SH2( dct0, dct1, p_dct, 8 );
-    }
-}
-
-static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
-                                  uint16_t *p_bias )
-{
-    int32_t non_zero = 0;
-    v8i16 dct0, dct1;
-    v8i16 zero = { 0 };
-    v8i16 dct0_mask, dct1_mask;
-    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
-    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
-    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
-    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
-    v4i32 bias0, bias1, bias2, bias3;
-
-    LD_SH2( p_dct, 8, dct0, dct1 );
-    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
-    LD_SH2( p_mf, 8, mf_h0, mf_h1 );
-
-    dct0_mask = __msa_clei_s_h( dct0, 0 );
-    dct1_mask = __msa_clei_s_h( dct1, 0 );
-
-    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
-    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
-    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
-    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );
-
-    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
-    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
-    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
-    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
-
-    dct_w0 *= mf_vec0;
-    dct_w1 *= mf_vec1;
-    dct_w2 *= mf_vec2;
-    dct_w3 *= mf_vec3;
-
-    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
-    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
-
-    dct0 = zero - dct_h0;
-    dct1 = zero - dct_h1;
-
-    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
-                                   ( v16u8 ) dct0_mask );
-    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
-                                   ( v16u8 ) dct1_mask );
-    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
-    ST_SH2( dct0, dct1, p_dct, 8 );
-
-    return !!non_zero;
-}
-
-static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
-                                  uint16_t *p_bias )
-{
-    int32_t non_zero = 0;
-    v8i16 dct0, dct1, dct2, dct3;
-    v8i16 zero = { 0 };
-    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
-    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
-    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
-    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
-    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
-    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
-    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
-    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
-    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;
-
-    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );
-
-    dct0_mask = __msa_clei_s_h( dct0, 0 );
-    dct1_mask = __msa_clei_s_h( dct1, 0 );
-    dct2_mask = __msa_clei_s_h( dct2, 0 );
-    dct3_mask = __msa_clei_s_h( dct3, 0 );
-
-    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
-    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
-    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
-    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
-                bias0, bias2, bias4, bias6 );
-    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
-                bias1, bias3, bias5, bias7 );
-    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
-    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
-                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
-    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
-                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );
-
-    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
-    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
-    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
-    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
-    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
-    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
-    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
-    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );
-
-    dct_w0 *= mf_vec0;
-    dct_w1 *= mf_vec1;
-    dct_w2 *= mf_vec2;
-    dct_w3 *= mf_vec3;
-    dct_w4 *= mf_vec4;
-    dct_w5 *= mf_vec5;
-    dct_w6 *= mf_vec6;
-    dct_w7 *= mf_vec7;
-
-    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
-    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
-    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
-                 dct_h0, dct_h1, dct_h2, dct_h3 );
-    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
-          dct0, dct1, dct2, dct3 );
-
-    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
-                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
-    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
-                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
-    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
-                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
-    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
-                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );
-
-    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
-    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
-    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );
-
-    dct0_mask = __msa_clei_s_h( dct0, 0 );
-    dct1_mask = __msa_clei_s_h( dct1, 0 );
-    dct2_mask = __msa_clei_s_h( dct2, 0 );
-    dct3_mask = __msa_clei_s_h( dct3, 0 );
-
-    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
-    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
-    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
-    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
-                bias0, bias2, bias4, bias6 );
-    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
-                bias1, bias3, bias5, bias7 );
-    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
-    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
-                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
-    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
-                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );
-
-    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
-    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
-    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
-    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
-    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
-    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
-    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
-    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );
-
-    dct_w0 *= mf_vec0;
-    dct_w1 *= mf_vec1;
-    dct_w2 *= mf_vec2;
-    dct_w3 *= mf_vec3;
-    dct_w4 *= mf_vec4;
-    dct_w5 *= mf_vec5;
-    dct_w6 *= mf_vec6;
-    dct_w7 *= mf_vec7;
-
-    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
-    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
-    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
-    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
-    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
-          dct0, dct1, dct2, dct3 );
-
-    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
-                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
-    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
-                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
-    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
-                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
-    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
-                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );
-
-    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
-    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );
-
-    return !!non_zero;
-}
-
-static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
-                                     int32_t i_bias )
-{
-    int32_t non_zero = 0;
-    v8i16 dct0, dct1, dct0_mask, dct1_mask;
-    v8i16 zero = { 0 };
-    v8i16 dct_h0, dct_h1;
-    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
-    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
-    v4i32 mf_vec, bias_vec;
-
-    LD_SH2( p_dct, 8, dct0, dct1 );
-
-    dct0_mask = __msa_clei_s_h( dct0, 0 );
-    dct1_mask = __msa_clei_s_h( dct1, 0 );
-
-    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
-    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
-
-    bias_vec = __msa_fill_w( i_bias );
-    mf_vec = __msa_fill_w( i_mf );
-
-    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
-    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
-    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
-    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );
-
-    dct_w0 *= mf_vec;
-    dct_w1 *= mf_vec;
-    dct_w2 *= mf_vec;
-    dct_w3 *= mf_vec;
-
-    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
-    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
-
-    dct0 = zero - dct_h0;
-    dct1 = zero - dct_h1;
-    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
-                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
-    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
-                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
-    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
-
-    ST_SH2( dct0, dct1, p_dct, 8 );
-
-    return !!non_zero;
-}
-
-static int32_t avc_coeff_last64_msa( int16_t *p_src )
-{
-    uint32_t u_res;
-    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
-    v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
-    v16u8 tmp0, tmp1, tmp2, tmp3;
-    v8u16 vec0, vec1, vec2, vec3;
-    v4i32 out0;
-    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
-
-    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
-
-    tmp_h0 = __msa_ceqi_h( src0, 0 );
-    tmp_h1 = __msa_ceqi_h( src1, 0 );
-    tmp_h2 = __msa_ceqi_h( src2, 0 );
-    tmp_h3 = __msa_ceqi_h( src3, 0 );
-    tmp_h4 = __msa_ceqi_h( src4, 0 );
-    tmp_h5 = __msa_ceqi_h( src5, 0 );
-    tmp_h6 = __msa_ceqi_h( src6, 0 );
-    tmp_h7 = __msa_ceqi_h( src7, 0 );
-
-    PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
-                 tmp0, tmp1, tmp2, tmp3 );
-
-    tmp0 = tmp0 & mask;
-    tmp1 = tmp1 & mask;
-    tmp2 = tmp2 & mask;
-    tmp3 = tmp3 & mask;
-
-    HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
-    PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
-    HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );
-
-    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
-    vec0 = __msa_hadd_u_h( tmp0, tmp0 );
-    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
-    out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
-    u_res = __msa_copy_u_w( out0, 0 );
-
-    return ( 63 - u_res );
-}
-
-static int32_t avc_coeff_last16_msa( int16_t *p_src )
-{
-    uint32_t u_res;
-    v8i16 src0, src1;
-    v8u16 tmp_h0;
-    v16u8 tmp0;
-    v8i16 out0, out1;
-    v16i8 res0;
-    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
-
-    LD_SH2( p_src, 8, src0, src1 );
-
-    out0 = __msa_ceqi_h( src0, 0 );
-    out1 = __msa_ceqi_h( src1, 0 );
-
-    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
-    tmp0 = tmp0 & mask;
-    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
-    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
-    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
-    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
-    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
-    res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
-    out0 = __msa_nloc_h( ( v8i16 ) res0 );
-    u_res = __msa_copy_u_h( out0, 0 );
-
-    return ( 15 - u_res );
-}
-
-void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
-                           int32_t i_qp )
-{
-    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
-}
-
-void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
-                           int32_t i_qp )
-{
-    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
-}
-
-void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
-                              int32_t i_qp )
-{
-    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
-}
-
-int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
-{
-    return avc_quant_4x4_msa( p_dct, p_mf, p_bias );
-}
-
-int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
-                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
-{
-    int32_t i_non_zero, i_non_zero_acc = 0;
-
-    for( int32_t j = 0; j < 4; j++  )
-    {
-        i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias );
-
-        i_non_zero_acc |= ( !!i_non_zero ) << j;
-    }
-
-    return i_non_zero_acc;
-}
-
-int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
-{
-    return avc_quant_8x8_msa( p_dct, p_mf, p_bias );
-}
-
-int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
-{
-    return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );
-}
-
-int32_t x264_coeff_last64_msa( int16_t *p_src )
-{
-    return avc_coeff_last64_msa( p_src );
-}
-
-int32_t x264_coeff_last16_msa( int16_t *p_src )
-{
-    return avc_coeff_last16_msa( p_src );
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mips/quant.h b/android/src/main/libenc/jni/libx264/common/mips/quant.h
deleted file mode 100755
index 4467cd2..0000000
--- a/android/src/main/libenc/jni/libx264/common/mips/quant.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*****************************************************************************
- * quant.h: msa quantization and level-run
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Rishikesh More <rishikesh.more@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MIPS_QUANT_H
-#define X264_MIPS_QUANT_H
-
-void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
-                           int32_t i_qp );
-void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
-                           int32_t i_qp );
-void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
-                              int32_t i_qp );
-int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
-int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
-                              uint16_t pu_mf[16], uint16_t pu_bias[16] );
-int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
-int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias );
-int32_t x264_coeff_last64_msa( int16_t *p_src );
-int32_t x264_coeff_last16_msa( int16_t *p_src );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/mvpred.c b/android/src/main/libenc/jni/libx264/common/mvpred.c
deleted file mode 100755
index 067b93a..0000000
--- a/android/src/main/libenc/jni/libx264/common/mvpred.c
+++ /dev/null
@@ -1,607 +0,0 @@
-/*****************************************************************************
- * mvpred.c: motion vector prediction
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
-{
-    const int i8 = x264_scan8[idx];
-    const int i_ref= h->mb.cache.ref[i_list][i8];
-    int     i_refa = h->mb.cache.ref[i_list][i8 - 1];
-    int16_t *mv_a  = h->mb.cache.mv[i_list][i8 - 1];
-    int     i_refb = h->mb.cache.ref[i_list][i8 - 8];
-    int16_t *mv_b  = h->mb.cache.mv[i_list][i8 - 8];
-    int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
-    int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];
-
-    // Partitions not yet reached in scan order are unavailable.
-    if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
-    {
-        i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
-        mv_c   = h->mb.cache.mv[i_list][i8 - 8 - 1];
-
-        if( SLICE_MBAFF
-            && h->mb.cache.ref[i_list][x264_scan8[0]-1] != -2
-            && MB_INTERLACED != h->mb.field[h->mb.i_mb_left_xy[0]] )
-        {
-            if( idx == 2 )
-            {
-                mv_c = h->mb.cache.topright_mv[i_list][0];
-                i_refc = h->mb.cache.topright_ref[i_list][0];
-            }
-            else if( idx == 8 )
-            {
-                mv_c = h->mb.cache.topright_mv[i_list][1];
-                i_refc = h->mb.cache.topright_ref[i_list][1];
-            }
-            else if( idx == 10 )
-            {
-                mv_c = h->mb.cache.topright_mv[i_list][2];
-                i_refc = h->mb.cache.topright_ref[i_list][2];
-            }
-        }
-    }
-    if( h->mb.i_partition == D_16x8 )
-    {
-        if( idx == 0 )
-        {
-            if( i_refb == i_ref )
-            {
-                CP32( mvp, mv_b );
-                return;
-            }
-        }
-        else
-        {
-            if( i_refa == i_ref )
-            {
-                CP32( mvp, mv_a );
-                return;
-            }
-        }
-    }
-    else if( h->mb.i_partition == D_8x16 )
-    {
-        if( idx == 0 )
-        {
-            if( i_refa == i_ref )
-            {
-                CP32( mvp, mv_a );
-                return;
-            }
-        }
-        else
-        {
-            if( i_refc == i_ref )
-            {
-                CP32( mvp, mv_c );
-                return;
-            }
-        }
-    }
-
-    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
-    if( i_count > 1 )
-    {
-median:
-        x264_median_mv( mvp, mv_a, mv_b, mv_c );
-    }
-    else if( i_count == 1 )
-    {
-        if( i_refa == i_ref )
-            CP32( mvp, mv_a );
-        else if( i_refb == i_ref )
-            CP32( mvp, mv_b );
-        else
-            CP32( mvp, mv_c );
-    }
-    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        CP32( mvp, mv_a );
-    else
-        goto median;
-}
-
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
-{
-    int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
-    int16_t *mv_a  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
-    int     i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
-    int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
-    int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
-    int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
-    if( i_refc == -2 )
-    {
-        i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
-        mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
-    }
-
-    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
-    if( i_count > 1 )
-    {
-median:
-        x264_median_mv( mvp, mv_a, mv_b, mv_c );
-    }
-    else if( i_count == 1 )
-    {
-        if( i_refa == i_ref )
-            CP32( mvp, mv_a );
-        else if( i_refb == i_ref )
-            CP32( mvp, mv_b );
-        else
-            CP32( mvp, mv_c );
-    }
-    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        CP32( mvp, mv_a );
-    else
-        goto median;
-}
-
-
-void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
-{
-    int     i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
-    int     i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
-    int16_t *mv_a  = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
-    int16_t *mv_b  = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
-
-    if( i_refa == -2 || i_refb == -2 ||
-        !( i_refa | M32( mv_a ) ) ||
-        !( i_refb | M32( mv_b ) ) )
-    {
-        M32( mv ) = 0;
-    }
-    else
-        x264_mb_predict_mv_16x16( h, 0, 0, mv );
-}
-
-static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
-{
-    int mb_x = h->mb.i_mb_x;
-    int mb_y = h->mb.i_mb_y;
-    int mb_xy = h->mb.i_mb_xy;
-    int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
-    int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
-    int preshift = MB_INTERLACED;
-    int postshift = MB_INTERLACED;
-    int offset = 1;
-    int yshift = 1;
-    h->mb.i_partition = partition_col[0];
-    if( PARAM_INTERLACED && h->fref[1][0]->field[mb_xy] != MB_INTERLACED )
-    {
-        if( MB_INTERLACED )
-        {
-            mb_y = h->mb.i_mb_y&~1;
-            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
-            type_col[0] = h->fref[1][0]->mb_type[mb_xy];
-            type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
-            partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
-            partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
-            preshift = 0;
-            yshift = 0;
-
-            if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) &&
-                (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) &&
-                partition_col[0] != D_8x8 )
-                h->mb.i_partition = D_16x8;
-            else
-                h->mb.i_partition = D_8x8;
-        }
-        else
-        {
-            int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1];
-            int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc)
-                          >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc);
-            mb_y = (h->mb.i_mb_y&~1) + col_parity;
-            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
-            type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
-            partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
-            preshift = 1;
-            yshift = 2;
-            h->mb.i_partition = partition_col[0];
-        }
-        offset = 0;
-    }
-    int i_mb_4x4 = 16 * h->mb.i_mb_stride * mb_y + 4 * mb_x;
-    int i_mb_8x8 =  4 * h->mb.i_mb_stride * mb_y + 2 * mb_x;
-
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-
-    /* Don't do any checks other than the ones we have to, based
-     * on the size of the colocated partitions.
-     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
-    int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
-    int step = (h->mb.i_partition == D_16x8) + 1;
-    int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
-    int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);
-    for( int i8 = 0; i8 < max_i8; i8 += step )
-    {
-        int x8 = i8&1;
-        int y8 = i8>>1;
-        int ypart = (SLICE_MBAFF && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ?
-                    MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 :
-                    3*y8;
-
-        if( IS_INTRA( type_col[y8] ) )
-        {
-            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, 0 );
-            x264_macroblock_cache_mv(  h, 2*x8, 2*y8, width, height, 0, 0 );
-            x264_macroblock_cache_mv(  h, 2*x8, 2*y8, width, height, 1, 0 );
-            continue;
-        }
-
-        int i_part_8x8 = i_mb_8x8 + x8 + (ypart>>1) * h->mb.i_b8_stride;
-        int i_ref1_ref = h->fref[1][0]->ref[0][i_part_8x8];
-        int i_ref = (map_col_to_list0(i_ref1_ref>>preshift) << postshift) + (offset&i_ref1_ref&MB_INTERLACED);
-
-        if( i_ref >= 0 )
-        {
-            int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
-            int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + ypart * h->mb.i_b4_stride];
-            int16_t mv_y = (mv_col[1]<<yshift)/2;
-            int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-            int l0y = ( dist_scale_factor * mv_y + 128 ) >> 8;
-            if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_y > h->mb.mv_max_spel[1]) )
-                return 0;
-            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_y) );
-        }
-        else
-        {
-            /* the collocated ref isn't in the current list0 */
-            /* FIXME: we might still be able to use direct_8x8 on some partitions */
-            /* FIXME: with B-pyramid + extensive ref list reordering
-             *   (not currently used), we would also have to check
-             *   l1mv1 like in spatial mode */
-            return 0;
-        }
-    }
-
-    return 1;
-}
-
-static ALWAYS_INLINE int x264_mb_predict_mv_direct16x16_spatial( x264_t *h, int b_interlaced )
-{
-    int8_t ref[2];
-    ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
-    for( int i_list = 0; i_list < 2; i_list++ )
-    {
-        int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
-        int16_t *mv_a  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
-        int     i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
-        int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
-        int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
-        int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
-        if( i_refc == -2 )
-        {
-            i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
-            mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
-        }
-
-        int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
-        if( i_ref < 0 )
-        {
-            i_ref = -1;
-            M32( mv[i_list] ) = 0;
-        }
-        else
-        {
-            /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
-             * not relevant to spatial direct. */
-            int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
-            if( i_count > 1 )
-                x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
-            else
-            {
-                if( i_refa == i_ref )
-                    CP32( mv[i_list], mv_a );
-                else if( i_refb == i_ref )
-                    CP32( mv[i_list], mv_b );
-                else
-                    CP32( mv[i_list], mv_c );
-            }
-        }
-
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
-        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
-        ref[i_list] = i_ref;
-    }
-
-    int mb_x = h->mb.i_mb_x;
-    int mb_y = h->mb.i_mb_y;
-    int mb_xy = h->mb.i_mb_xy;
-    int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] };
-    int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] };
-    h->mb.i_partition = partition_col[0];
-    if( b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED )
-    {
-        if( MB_INTERLACED )
-        {
-            mb_y = h->mb.i_mb_y&~1;
-            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
-            type_col[0] = h->fref[1][0]->mb_type[mb_xy];
-            type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride];
-            partition_col[0] = h->fref[1][0]->mb_partition[mb_xy];
-            partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride];
-
-            if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) &&
-                (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) &&
-                partition_col[0] != D_8x8 )
-                h->mb.i_partition = D_16x8;
-            else
-                h->mb.i_partition = D_8x8;
-        }
-        else
-        {
-            int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1];
-            int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc)
-                          >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc);
-            mb_y = (h->mb.i_mb_y&~1) + col_parity;
-            mb_xy = mb_x + h->mb.i_mb_stride * mb_y;
-            type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy];
-            partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy];
-            h->mb.i_partition = partition_col[0];
-        }
-    }
-    int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy;
-    int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy;
-
-    int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8];
-    int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8];
-    int16_t (*l1mv[2])[2] = { (int16_t (*)[2]) &h->fref[1][0]->mv[0][i_mb_4x4],
-                              (int16_t (*)[2]) &h->fref[1][0]->mv[1][i_mb_4x4] };
-
-    if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
-    {
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-        return 1;
-    }
-
-    if( h->param.i_threads > 1
-        && ( mv[0][1] > h->mb.mv_max_spel[1]
-          || mv[1][1] > h->mb.mv_max_spel[1] ) )
-    {
-#if 0
-        fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
-                mv[0][0], mv[0][1], mv[1][0], mv[1][1],
-                h->mb.mv_max_spel[1]);
-#endif
-        return 0;
-    }
-
-    if( !M64( mv ) || (!b_interlaced && IS_INTRA( type_col[0] )) || (ref[0]&&ref[1]) )
-        return 1;
-
-    /* Don't do any checks other than the ones we have to, based
-     * on the size of the colocated partitions.
-     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
-    int max_i8 = (D_16x16 - h->mb.i_partition) + 1;
-    int step = (h->mb.i_partition == D_16x8) + 1;
-    int width = 4 >> ((D_16x16 - h->mb.i_partition)&1);
-    int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1);
-
-    /* col_zero_flag */
-    for( int i8 = 0; i8 < max_i8; i8 += step )
-    {
-        const int x8 = i8&1;
-        const int y8 = i8>>1;
-        int ypart = (b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ?
-                    MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 :
-                    3*y8;
-        int o8 = x8 + (ypart>>1) * h->mb.i_b8_stride;
-        int o4 = 3*x8 + ypart * h->mb.i_b4_stride;
-
-        if( b_interlaced && IS_INTRA( type_col[y8] ) )
-            continue;
-
-        int idx;
-        if( l1ref0[o8] == 0 )
-            idx = 0;
-        else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
-            idx = 1;
-        else
-            continue;
-
-        if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
-        {
-            if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
-            if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
-        }
-    }
-
-    return 1;
-}
-
-
-static int x264_mb_predict_mv_direct16x16_spatial_interlaced( x264_t *h )
-{
-    return x264_mb_predict_mv_direct16x16_spatial( h, 1 );
-}
-
-static int x264_mb_predict_mv_direct16x16_spatial_progressive( x264_t *h )
-{
-    return x264_mb_predict_mv_direct16x16_spatial( h, 0 );
-}
-
-int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
-{
-    int b_available;
-    if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
-        return 0;
-    else if( h->sh.b_direct_spatial_mv_pred )
-    {
-        if( SLICE_MBAFF )
-            b_available = x264_mb_predict_mv_direct16x16_spatial_interlaced( h );
-        else
-            b_available = x264_mb_predict_mv_direct16x16_spatial_progressive( h );
-    }
-    else
-        b_available = x264_mb_predict_mv_direct16x16_temporal( h );
-
-    if( b_changed != NULL && b_available )
-    {
-        int changed;
-
-        changed  = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
-        changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
-        changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
-        changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
-        if( !changed && h->mb.i_partition != D_16x16 )
-        {
-            changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
-            changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
-            changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
-            changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
-        }
-        if( !changed && h->mb.i_partition == D_8x8 )
-        {
-            changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
-            changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
-            changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
-            changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
-            changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
-            changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
-            changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
-            changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
-        }
-        *b_changed = changed;
-        if( !changed )
-            return b_available;
-    }
-
-    /* cache ref & mv */
-    if( b_available )
-        for( int l = 0; l < 2; l++ )
-        {
-            CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
-            CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
-            CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
-            CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
-            h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
-            h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
-            h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
-            h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
-            h->mb.cache.direct_partition = h->mb.i_partition;
-        }
-
-    return b_available;
-}
-
-/* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
-{
-    int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
-    int i = 0;
-
-#define SET_MVP(mvp) \
-    { \
-        CP32( mvc[i], mvp ); \
-        i++; \
-    }
-
-#define SET_IMVP(xy) \
-    if( xy >= 0 ) \
-    { \
-        int shift = 1 + MB_INTERLACED - h->mb.field[xy]; \
-        int16_t *mvp = h->mb.mvr[i_list][i_ref<<1>>shift][xy]; \
-        mvc[i][0] = mvp[0]; \
-        mvc[i][1] = mvp[1]<<1>>shift; \
-        i++; \
-    }
-
-    /* b_direct */
-    if( h->sh.i_type == SLICE_TYPE_B
-        && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref )
-    {
-        SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
-    }
-
-    if( i_ref == 0 && h->frames.b_have_lowres )
-    {
-        int idx = i_list ? h->fref[1][0]->i_frame-h->fenc->i_frame-1
-                         : h->fenc->i_frame-h->fref[0][0]->i_frame-1;
-        if( idx <= h->param.i_bframe )
-        {
-            int16_t (*lowres_mv)[2] = h->fenc->lowres_mvs[i_list][idx];
-            if( lowres_mv[0][0] != 0x7fff )
-            {
-                M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
-                i++;
-            }
-        }
-    }
-
-    /* spatial predictors */
-    if( SLICE_MBAFF )
-    {
-        SET_IMVP( h->mb.i_mb_left_xy[0] );
-        SET_IMVP( h->mb.i_mb_top_xy );
-        SET_IMVP( h->mb.i_mb_topleft_xy );
-        SET_IMVP( h->mb.i_mb_topright_xy );
-    }
-    else
-    {
-        SET_MVP( mvr[h->mb.i_mb_left_xy[0]] );
-        SET_MVP( mvr[h->mb.i_mb_top_xy] );
-        SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
-        SET_MVP( mvr[h->mb.i_mb_topright_xy] );
-    }
-#undef SET_IMVP
-#undef SET_MVP
-
-    /* temporal predictors */
-    if( h->fref[0][0]->i_ref[0] > 0 )
-    {
-        x264_frame_t *l0 = h->fref[0][0];
-        int field = h->mb.i_mb_y&1;
-        int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
-        int refpoc = h->fref[i_list][i_ref>>SLICE_MBAFF]->i_poc;
-        refpoc += l0->i_delta_poc[field^(i_ref&1)];
-
-#define SET_TMVP( dx, dy ) \
-        { \
-            int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \
-            int scale = (curpoc - refpoc) * l0->inv_ref_poc[MB_INTERLACED&field]; \
-            mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \
-            mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \
-            i++; \
-        }
-
-        SET_TMVP(0,0);
-        if( h->mb.i_mb_x < h->mb.i_mb_width-1 )
-            SET_TMVP(1,0);
-        if( h->mb.i_mb_y < h->mb.i_mb_height-1 )
-            SET_TMVP(0,1);
-#undef SET_TMVP
-    }
-
-    *i_mvc = i;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl.c b/android/src/main/libenc/jni/libx264/common/opencl.c
deleted file mode 100755
index eac7f8e..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl.c
+++ /dev/null
@@ -1,719 +0,0 @@
-/*****************************************************************************
- * opencl.c: OpenCL initialization and kernel compilation
- *****************************************************************************
- * Copyright (C) 2012-2016 x264 project
- *
- * Authors: Steve Borho <sborho@multicorewareinc.com>
- *          Anton Mitrofanov <BugMaster@narod.ru>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#ifdef _WIN32
-#include <windows.h>
-#define ocl_open LoadLibraryW( L"OpenCL" )
-#define ocl_close FreeLibrary
-#define ocl_address GetProcAddress
-#else
-#include <dlfcn.h> //dlopen, dlsym, dlclose
-#if SYS_MACOSX
-#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW )
-#else
-#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW )
-#endif
-#define ocl_close dlclose
-#define ocl_address dlsym
-#endif
-
-#define LOAD_OCL_FUNC(name, continue_on_fail)\
-{\
-    ocl->name = (void*)ocl_address( ocl->library, #name );\
-    if( !continue_on_fail && !ocl->name )\
-        goto fail;\
-}
-
-/* load the library and functions we require from it */
-x264_opencl_function_t *x264_opencl_load_library( void )
-{
-    x264_opencl_function_t *ocl;
-#undef fail
-#define fail fail0
-    CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) );
-#undef fail
-#define fail fail1
-    ocl->library = ocl_open;
-    if( !ocl->library )
-        goto fail;
-#undef fail
-#define fail fail2
-    LOAD_OCL_FUNC( clBuildProgram, 0 );
-    LOAD_OCL_FUNC( clCreateBuffer, 0 );
-    LOAD_OCL_FUNC( clCreateCommandQueue, 0 );
-    LOAD_OCL_FUNC( clCreateContext, 0 );
-    LOAD_OCL_FUNC( clCreateImage2D, 0 );
-    LOAD_OCL_FUNC( clCreateKernel, 0 );
-    LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 );
-    LOAD_OCL_FUNC( clCreateProgramWithSource, 0 );
-    LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 );
-    LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 );
-    LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 );
-    LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 );
-    LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 );
-    LOAD_OCL_FUNC( clFinish, 0 );
-    LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 );
-    LOAD_OCL_FUNC( clGetDeviceIDs, 0 );
-    LOAD_OCL_FUNC( clGetDeviceInfo, 0 );
-    LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 );
-    LOAD_OCL_FUNC( clGetPlatformIDs, 0 );
-    LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 );
-    LOAD_OCL_FUNC( clGetProgramInfo, 0 );
-    LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 );
-    LOAD_OCL_FUNC( clReleaseCommandQueue, 0 );
-    LOAD_OCL_FUNC( clReleaseContext, 0 );
-    LOAD_OCL_FUNC( clReleaseKernel, 0 );
-    LOAD_OCL_FUNC( clReleaseMemObject, 0 );
-    LOAD_OCL_FUNC( clReleaseProgram, 0 );
-    LOAD_OCL_FUNC( clSetKernelArg, 0 );
-    return ocl;
-#undef fail
-fail2:
-    ocl_close( ocl->library );
-fail1:
-    x264_free( ocl );
-fail0:
-    return NULL;
-}
-
-void x264_opencl_close_library( x264_opencl_function_t *ocl )
-{
-    if( !ocl )
-        return;
-    ocl_close( ocl->library );
-    x264_free( ocl );
-}
-
-/* define from recent cl_ext.h, copied here in case headers are old */
-#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
-
-/* Requires full include path in case of out-of-tree builds */
-#include "common/oclobj.h"
-
-static int x264_detect_switchable_graphics( void );
-
-/* Try to load the cached compiled program binary, verify the device context is
- * still valid before reuse */
-static cl_program x264_opencl_cache_load( x264_t *h, const char *dev_name, const char *dev_vendor, const char *driver_version )
-{
-    /* try to load cached program binary */
-    FILE *fp = x264_fopen( h->param.psz_clbin_file, "rb" );
-    if( !fp )
-        return NULL;
-
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    cl_program program = NULL;
-    uint8_t *binary = NULL;
-
-    fseek( fp, 0, SEEK_END );
-    size_t size = ftell( fp );
-    rewind( fp );
-    CHECKED_MALLOC( binary, size );
-
-    if ( fread( binary, 1, size, fp ) != size )
-        goto fail;
-    const uint8_t *ptr = (const uint8_t*)binary;
-
-#define CHECK_STRING( STR )\
-    do {\
-        size_t len = strlen( STR );\
-        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
-            goto fail;\
-        else {\
-            size -= (len+1); ptr += (len+1);\
-        }\
-    } while( 0 )
-
-    CHECK_STRING( dev_name );
-    CHECK_STRING( dev_vendor );
-    CHECK_STRING( driver_version );
-    CHECK_STRING( x264_opencl_source_hash );
-#undef CHECK_STRING
-
-    cl_int status;
-    program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
-    if( status != CL_SUCCESS )
-        program = NULL;
-
-fail:
-    fclose( fp );
-    x264_free( binary );
-    return program;
-}
-
-/* Save the compiled program binary to a file for later reuse.  Device context
- * is also saved in the cache file so we do not reuse stale binaries */
-static void x264_opencl_cache_save( x264_t *h, cl_program program, const char *dev_name, const char *dev_vendor, const char *driver_version )
-{
-    FILE *fp = x264_fopen( h->param.psz_clbin_file, "wb" );
-    if( !fp )
-    {
-        x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" );
-        return;
-    }
-
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    uint8_t *binary = NULL;
-
-    size_t size = 0;
-    cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
-    if( status != CL_SUCCESS || !size )
-    {
-        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" );
-        goto fail;
-    }
-
-    CHECKED_MALLOC( binary, size );
-    status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
-    if( status != CL_SUCCESS )
-    {
-        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" );
-        goto fail;
-    }
-
-    fputs( dev_name, fp );
-    fputc( '\n', fp );
-    fputs( dev_vendor, fp );
-    fputc( '\n', fp );
-    fputs( driver_version, fp );
-    fputc( '\n', fp );
-    fputs( x264_opencl_source_hash, fp );
-    fputc( '\n', fp );
-    fwrite( binary, 1, size, fp );
-
-fail:
-    fclose( fp );
-    x264_free( binary );
-    return;
-}
-
-/* The OpenCL source under common/opencl will be merged into common/oclobj.h by
- * the Makefile. It defines a x264_opencl_source byte array which we will pass
- * to clCreateProgramWithSource().  We also attempt to use a cache file for the
- * compiled binary, stored in the current working folder. */
-static cl_program x264_opencl_compile( x264_t *h )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    cl_program program = NULL;
-    char *build_log = NULL;
-
-    char dev_name[64];
-    char dev_vendor[64];
-    char driver_version[64];
-    cl_int status;
-    status  = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME,    sizeof(dev_name), dev_name, NULL );
-    status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR,  sizeof(dev_vendor), dev_vendor, NULL );
-    status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL );
-    if( status != CL_SUCCESS )
-        return NULL;
-
-    // Most AMD GPUs have vector registers
-    int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." );
-    h->opencl.b_device_AMD_SI = 0;
-
-    if( vectorize )
-    {
-        /* Disable OpenCL on Intel/AMD switchable graphics devices */
-        if( x264_detect_switchable_graphics() )
-        {
-            x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" );
-            return NULL;
-        }
-
-        /* Detect AMD SouthernIsland or newer device (single-width registers) */
-        cl_uint simdwidth = 4;
-        status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL );
-        if( status == CL_SUCCESS && simdwidth == 1 )
-        {
-            vectorize = 0;
-            h->opencl.b_device_AMD_SI = 1;
-        }
-    }
-
-    x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
-
-    program = x264_opencl_cache_load( h, dev_name, dev_vendor, driver_version );
-    if( !program )
-    {
-        /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
-        x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
-        const char *strptr = (const char*)x264_opencl_source;
-        size_t size = sizeof(x264_opencl_source);
-        program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
-        if( status != CL_SUCCESS || !program )
-        {
-            x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
-            return NULL;
-        }
-    }
-
-    /* Build the program binary for the OpenCL device */
-    const char *buildopts = vectorize ? "-DVECTORIZE=1" : "";
-    status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
-    if( status == CL_SUCCESS )
-    {
-        x264_opencl_cache_save( h, program, dev_name, dev_vendor, driver_version );
-        return program;
-    }
-
-    /* Compile failure, should not happen with production code. */
-
-    size_t build_log_len = 0;
-    status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len );
-    if( status != CL_SUCCESS || !build_log_len )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
-        goto fail;
-    }
-
-    build_log = x264_malloc( build_log_len );
-    if( !build_log )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
-        goto fail;
-    }
-
-    status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
-    if( status != CL_SUCCESS )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
-        goto fail;
-    }
-
-    FILE *log_file = x264_fopen( "x264_kernel_build_log.txt", "w" );
-    if( !log_file )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" );
-        goto fail;
-    }
-    fwrite( build_log, 1, build_log_len, log_file );
-    fclose( log_file );
-    x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
-
-fail:
-    x264_free( build_log );
-    if( program )
-        ocl->clReleaseProgram( program );
-    return NULL;
-}
-
-static int x264_opencl_lookahead_alloc( x264_t *h )
-{
-    if( !h->param.rc.i_lookahead )
-        return -1;
-
-    static const char *kernelnames[] = {
-        "mb_intra_cost_satd_8x8",
-        "sum_intra_cost",
-        "downscale_hpel",
-        "downscale1",
-        "downscale2",
-        "memset_int16",
-        "weightp_scaled_images",
-        "weightp_hpel",
-        "hierarchical_motion",
-        "subpel_refine",
-        "mode_selection",
-        "sum_inter_cost"
-    };
-
-    cl_kernel *kernels[] = {
-        &h->opencl.intra_kernel,
-        &h->opencl.rowsum_intra_kernel,
-        &h->opencl.downscale_hpel_kernel,
-        &h->opencl.downscale_kernel1,
-        &h->opencl.downscale_kernel2,
-        &h->opencl.memset_kernel,
-        &h->opencl.weightp_scaled_images_kernel,
-        &h->opencl.weightp_hpel_kernel,
-        &h->opencl.hme_kernel,
-        &h->opencl.subpel_refine_kernel,
-        &h->opencl.mode_select_kernel,
-        &h->opencl.rowsum_inter_kernel
-    };
-
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    cl_int status;
-
-    h->opencl.lookahead_program = x264_opencl_compile( h );
-    if( !h->opencl.lookahead_program )
-        goto fail;
-
-    for( int i = 0; i < ARRAY_SIZE(kernelnames); i++ )
-    {
-        *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
-        if( status != CL_SUCCESS )
-        {
-            x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
-            goto fail;
-        }
-    }
-
-    h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
-    if( status != CL_SUCCESS )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
-        goto fail;
-    }
-    h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                         0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
-    if( status != CL_SUCCESS )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
-        goto fail;
-    }
-
-    return 0;
-fail:
-    x264_opencl_lookahead_delete( h );
-    return -1;
-}
-
-static void CL_CALLBACK x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
-{
-    /* Any error notification can be assumed to be fatal to the OpenCL context.
-     * We need to stop using it immediately to prevent further damage. */
-    x264_t *h = (x264_t*)user_data;
-    h->param.b_opencl = 0;
-    h->opencl.b_fatal_error = 1;
-    x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
-    x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" );
-}
-
-int x264_opencl_lookahead_init( x264_t *h )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    cl_platform_id *platforms = NULL;
-    cl_device_id *devices = NULL;
-    cl_image_format *imageType = NULL;
-    cl_context context = NULL;
-    int ret = -1;
-
-    cl_uint numPlatforms = 0;
-    cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms );
-    if( status != CL_SUCCESS || !numPlatforms )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" );
-        goto fail;
-    }
-    platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms );
-    if( !platforms )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" );
-        goto fail;
-    }
-    status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL );
-    if( status != CL_SUCCESS )
-    {
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" );
-        goto fail;
-    }
-
-    /* Select the first OpenCL platform with a GPU device that supports our
-     * required image (texture) formats */
-    for( cl_uint i = 0; i < numPlatforms; i++ )
-    {
-        cl_uint gpu_count = 0;
-        status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
-        if( status != CL_SUCCESS || !gpu_count )
-            continue;
-
-        x264_free( devices );
-        devices = x264_malloc( sizeof(cl_device_id) * gpu_count );
-        if( !devices )
-            continue;
-
-        status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL );
-        if( status != CL_SUCCESS )
-            continue;
-
-        /* Find a GPU device that supports our image formats */
-        for( cl_uint gpu = 0; gpu < gpu_count; gpu++ )
-        {
-            h->opencl.device = devices[gpu];
-
-            /* if the user has specified an exact device ID, skip all other
-             * GPUs.  If this device matches, allow it to continue through the
-             * checks for supported images, etc.  */
-            if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id )
-                continue;
-
-            cl_bool image_support = 0;
-            status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
-            if( status != CL_SUCCESS || !image_support )
-                continue;
-
-            if( context )
-                ocl->clReleaseContext( context );
-            context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
-            if( status != CL_SUCCESS || !context )
-                continue;
-
-            cl_uint imagecount = 0;
-            status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount );
-            if( status != CL_SUCCESS || !imagecount )
-                continue;
-
-            x264_free( imageType );
-            imageType = x264_malloc( sizeof(cl_image_format) * imagecount );
-            if( !imageType )
-                continue;
-
-            status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL );
-            if( status != CL_SUCCESS )
-                continue;
-
-            int b_has_r = 0;
-            int b_has_rgba = 0;
-            for( cl_uint j = 0; j < imagecount; j++ )
-            {
-                if( imageType[j].image_channel_order == CL_R &&
-                    imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 )
-                    b_has_r = 1;
-                else if( imageType[j].image_channel_order == CL_RGBA &&
-                         imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 )
-                    b_has_rgba = 1;
-            }
-            if( !b_has_r || !b_has_rgba )
-            {
-                char dev_name[64];
-                status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL );
-                if( status == CL_SUCCESS )
-                {
-                    /* emit warning if we are discarding the user's explicit choice */
-                    int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG;
-                    x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name );
-                }
-                continue;
-            }
-
-            /* user selection of GPU device, skip N first matches */
-            if( h->param.i_opencl_device )
-            {
-                h->param.i_opencl_device--;
-                continue;
-            }
-
-            h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status );
-            if( status != CL_SUCCESS || !h->opencl.queue )
-                continue;
-
-            h->opencl.context = context;
-            context = NULL;
-
-            ret = 0;
-            break;
-        }
-
-        if( !ret )
-            break;
-    }
-
-    if( !h->param.psz_clbin_file )
-        h->param.psz_clbin_file = "x264_lookahead.clbin";
-
-    if( ret )
-        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" );
-    else
-        ret = x264_opencl_lookahead_alloc( h );
-
-fail:
-    if( context )
-        ocl->clReleaseContext( context );
-    x264_free( imageType );
-    x264_free( devices );
-    x264_free( platforms );
-    return ret;
-}
-
-static void x264_opencl_lookahead_free( x264_t *h )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-
-#define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 )
-    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel );
-    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel );
-    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel );
-    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel );
-    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel );
-    RELEASE( h->opencl.memset_kernel, clReleaseKernel );
-    RELEASE( h->opencl.intra_kernel, clReleaseKernel );
-    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel );
-    RELEASE( h->opencl.hme_kernel, clReleaseKernel );
-    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel );
-    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel );
-    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel );
-
-    RELEASE( h->opencl.lookahead_program, clReleaseProgram );
-
-    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject );
-    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject );
-    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject );
-    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
-        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject );
-    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject );
-    RELEASE( h->opencl.row_satds[0], clReleaseMemObject );
-    RELEASE( h->opencl.row_satds[1], clReleaseMemObject );
-    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject );
-    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject );
-    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject );
-    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject );
-    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject );
-    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject );
-    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject );
-    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject );
-#undef RELEASE
-}
-
-void x264_opencl_lookahead_delete( x264_t *h )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-
-    if( !ocl )
-        return;
-
-    if( h->opencl.queue )
-        ocl->clFinish( h->opencl.queue );
-
-    x264_opencl_lookahead_free( h );
-
-    if( h->opencl.queue )
-    {
-        ocl->clReleaseCommandQueue( h->opencl.queue );
-        h->opencl.queue = NULL;
-    }
-    if( h->opencl.context )
-    {
-        ocl->clReleaseContext( h->opencl.context );
-        h->opencl.context = NULL;
-    }
-}
-
-void x264_opencl_frame_delete( x264_frame_t *frame )
-{
-    x264_opencl_function_t *ocl = frame->opencl.ocl;
-
-    if( !ocl )
-        return;
-
-#define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 )
-    for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
-        RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
-    RELEASEBUF( frame->opencl.luma_hpel );
-    RELEASEBUF( frame->opencl.inv_qscale_factor );
-    RELEASEBUF( frame->opencl.intra_cost );
-    RELEASEBUF( frame->opencl.lowres_mvs0 );
-    RELEASEBUF( frame->opencl.lowres_mvs1 );
-    RELEASEBUF( frame->opencl.lowres_mv_costs0 );
-    RELEASEBUF( frame->opencl.lowres_mv_costs1 );
-#undef RELEASEBUF
-}
-
-/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so
- * we consult AMD's ADL interface to detect this situation and disable
- * OpenCL on these machines (Linux and Windows) */
-#ifdef _WIN32
-#define ADL_API_CALL
-#define ADL_CALLBACK __stdcall
-#define adl_close FreeLibrary
-#define adl_address GetProcAddress
-#else
-#define ADL_API_CALL
-#define ADL_CALLBACK
-#define adl_close dlclose
-#define adl_address dlsym
-#endif
-
-typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int );
-typedef int   ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int );
-typedef int   ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * );
-typedef int   ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * );
-typedef int   ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void );
-
-#define ADL_OK 0
-#define ADL_PX_SCHEME_DYNAMIC 2
-
-static void* ADL_CALLBACK adl_malloc_wrapper( int iSize )
-{
-    return x264_malloc( iSize );
-}
-
-static int x264_detect_switchable_graphics( void )
-{
-    void *hDLL;
-    ADL_MAIN_CONTROL_CREATE          ADL_Main_Control_Create;
-    ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
-    ADL_POWERXPRESS_SCHEME_GET       ADL_PowerXpress_Scheme_Get;
-    ADL_MAIN_CONTROL_DESTROY         ADL_Main_Control_Destroy;
-    int ret = 0;
-
-#ifdef _WIN32
-    hDLL = LoadLibraryW( L"atiadlxx.dll" );
-    if( !hDLL )
-        hDLL = LoadLibraryW( L"atiadlxy.dll" );
-#else
-    hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL );
-#endif
-    if( !hDLL )
-        goto fail0;
-
-    ADL_Main_Control_Create          = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create");
-    ADL_Main_Control_Destroy         = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy");
-    ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get");
-    ADL_PowerXpress_Scheme_Get       = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get");
-    if( !ADL_Main_Control_Create || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get ||
-        !ADL_PowerXpress_Scheme_Get )
-        goto fail1;
-
-    if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) )
-        goto fail1;
-
-    int numAdapters = 0;
-    if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) )
-        goto fail2;
-
-    for( int i = 0; i < numAdapters; i++ )
-    {
-        int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState;
-        if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) )
-            break;
-
-        if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC )
-        {
-            ret = 1;
-            break;
-        }
-    }
-
-fail2:
-    ADL_Main_Control_Destroy();
-fail1:
-    adl_close( hDLL );
-fail0:
-    return ret;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl.h b/android/src/main/libenc/jni/libx264/common/opencl.h
deleted file mode 100755
index 5a379c5..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl.h
+++ /dev/null
@@ -1,804 +0,0 @@
-/*****************************************************************************
- * opencl.h: OpenCL structures and defines
- *****************************************************************************
- * Copyright (C) 2012-2016 x264 project
- *
- * Authors: Steve Borho <sborho@multicorewareinc.com>
- *          Anton Mitrofanov <BugMaster@narod.ru>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_OPENCL_H
-#define X264_OPENCL_H
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#include "extras/cl.h"
-
-#define OCL_API(ret, attr, name) typedef ret (attr *name##_func)
-
-/* Platform API */
-OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs)
-(   cl_uint          /* num_entries */,
-    cl_platform_id * /* platforms */,
-    cl_uint *        /* num_platforms */);
-
-OCL_API(cl_int, CL_API_CALL, clGetPlatformInfo)
-(   cl_platform_id   /* platform */,
-    cl_platform_info /* param_name */,
-    size_t           /* param_value_size */,
-    void *           /* param_value */,
-    size_t *         /* param_value_size_ret */);
-
-/* Device APIs */
-OCL_API(cl_int, CL_API_CALL, clGetDeviceIDs)
-(   cl_platform_id   /* platform */,
-    cl_device_type   /* device_type */,
-    cl_uint          /* num_entries */,
-    cl_device_id *   /* devices */,
-    cl_uint *        /* num_devices */);
-
-OCL_API(cl_int, CL_API_CALL, clGetDeviceInfo)
-(   cl_device_id    /* device */,
-    cl_device_info  /* param_name */,
-    size_t          /* param_value_size */,
-    void *          /* param_value */,
-    size_t *        /* param_value_size_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clCreateSubDevices)
-(   cl_device_id                         /* in_device */,
-    const cl_device_partition_property * /* properties */,
-    cl_uint                              /* num_devices */,
-    cl_device_id *                       /* out_devices */,
-    cl_uint *                            /* num_devices_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainDevice)
-(   cl_device_id /* device */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseDevice)
-(   cl_device_id /* device */);
-
-/* Context APIs  */
-OCL_API(cl_context, CL_API_CALL, clCreateContext)
-(   const cl_context_properties * /* properties */,
-    cl_uint                 /* num_devices */,
-    const cl_device_id *    /* devices */,
-    void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
-    void *                  /* user_data */,
-    cl_int *                /* errcode_ret */);
-
-OCL_API(cl_context, CL_API_CALL, clCreateContextFromType)
-(   const cl_context_properties * /* properties */,
-    cl_device_type          /* device_type */,
-    void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
-    void *                  /* user_data */,
-    cl_int *                /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainContext)
-(   cl_context /* context */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseContext)
-(   cl_context /* context */);
-
-OCL_API(cl_int, CL_API_CALL, clGetContextInfo)
-(   cl_context         /* context */,
-    cl_context_info    /* param_name */,
-    size_t             /* param_value_size */,
-    void *             /* param_value */,
-    size_t *           /* param_value_size_ret */);
-
-/* Command Queue APIs */
-OCL_API(cl_command_queue, CL_API_CALL, clCreateCommandQueue)
-(   cl_context                     /* context */,
-    cl_device_id                   /* device */,
-    cl_command_queue_properties    /* properties */,
-    cl_int *                       /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainCommandQueue)
-(   cl_command_queue /* command_queue */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseCommandQueue)
-(   cl_command_queue /* command_queue */);
-
-OCL_API(cl_int, CL_API_CALL, clGetCommandQueueInfo)
-(   cl_command_queue      /* command_queue */,
-    cl_command_queue_info /* param_name */,
-    size_t                /* param_value_size */,
-    void *                /* param_value */,
-    size_t *              /* param_value_size_ret */);
-
-/* Memory Object APIs */
-OCL_API(cl_mem, CL_API_CALL, clCreateBuffer)
-(   cl_context   /* context */,
-    cl_mem_flags /* flags */,
-    size_t       /* size */,
-    void *       /* host_ptr */,
-    cl_int *     /* errcode_ret */);
-
-OCL_API(cl_mem, CL_API_CALL, clCreateSubBuffer)
-(   cl_mem                   /* buffer */,
-    cl_mem_flags             /* flags */,
-    cl_buffer_create_type    /* buffer_create_type */,
-    const void *             /* buffer_create_info */,
-    cl_int *                 /* errcode_ret */);
-
-OCL_API(cl_mem, CL_API_CALL, clCreateImage)
-(   cl_context              /* context */,
-    cl_mem_flags            /* flags */,
-    const cl_image_format * /* image_format */,
-    const cl_image_desc *   /* image_desc */,
-    void *                  /* host_ptr */,
-    cl_int *                /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainMemObject)
-(   cl_mem /* memobj */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseMemObject)
-(   cl_mem /* memobj */);
-
-OCL_API(cl_int, CL_API_CALL, clGetSupportedImageFormats)
-(   cl_context           /* context */,
-    cl_mem_flags         /* flags */,
-    cl_mem_object_type   /* image_type */,
-    cl_uint              /* num_entries */,
-    cl_image_format *    /* image_formats */,
-    cl_uint *            /* num_image_formats */);
-
-OCL_API(cl_int, CL_API_CALL, clGetMemObjectInfo)
-(   cl_mem           /* memobj */,
-    cl_mem_info      /* param_name */,
-    size_t           /* param_value_size */,
-    void *           /* param_value */,
-    size_t *         /* param_value_size_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clGetImageInfo)
-(   cl_mem           /* image */,
-    cl_image_info    /* param_name */,
-    size_t           /* param_value_size */,
-    void *           /* param_value */,
-    size_t *         /* param_value_size_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clSetMemObjectDestructorCallback)
-(   cl_mem /* memobj */,
-    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
-    void * /*user_data */ );
-
-/* Sampler APIs */
-OCL_API(cl_sampler, CL_API_CALL, clCreateSampler)
-(   cl_context          /* context */,
-    cl_bool             /* normalized_coords */,
-    cl_addressing_mode  /* addressing_mode */,
-    cl_filter_mode      /* filter_mode */,
-    cl_int *            /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainSampler)
-(   cl_sampler /* sampler */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseSampler)
-(   cl_sampler /* sampler */);
-
-OCL_API(cl_int, CL_API_CALL, clGetSamplerInfo)
-(   cl_sampler         /* sampler */,
-    cl_sampler_info    /* param_name */,
-    size_t             /* param_value_size */,
-    void *             /* param_value */,
-    size_t *           /* param_value_size_ret */);
-
-/* Program Object APIs  */
-OCL_API(cl_program, CL_API_CALL, clCreateProgramWithSource)
-(   cl_context        /* context */,
-    cl_uint           /* count */,
-    const char **     /* strings */,
-    const size_t *    /* lengths */,
-    cl_int *          /* errcode_ret */);
-
-OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBinary)
-(   cl_context                     /* context */,
-    cl_uint                        /* num_devices */,
-    const cl_device_id *           /* device_list */,
-    const size_t *                 /* lengths */,
-    const unsigned char **         /* binaries */,
-    cl_int *                       /* binary_status */,
-    cl_int *                       /* errcode_ret */);
-
-OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBuiltInKernels)
-(   cl_context            /* context */,
-    cl_uint               /* num_devices */,
-    const cl_device_id *  /* device_list */,
-    const char *          /* kernel_names */,
-    cl_int *              /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainProgram)
-(   cl_program /* program */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseProgram)
-(   cl_program /* program */);
-
-OCL_API(cl_int, CL_API_CALL, clBuildProgram)
-(   cl_program           /* program */,
-    cl_uint              /* num_devices */,
-    const cl_device_id * /* device_list */,
-    const char *         /* options */,
-    void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
-    void *               /* user_data */);
-
-OCL_API(cl_int, CL_API_CALL, clCompileProgram)
-(   cl_program           /* program */,
-    cl_uint              /* num_devices */,
-    const cl_device_id * /* device_list */,
-    const char *         /* options */,
-    cl_uint              /* num_input_headers */,
-    const cl_program *   /* input_headers */,
-    const char **        /* header_include_names */,
-    void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
-    void *               /* user_data */);
-
-OCL_API(cl_program, CL_API_CALL, clLinkProgram)
-(   cl_context           /* context */,
-    cl_uint              /* num_devices */,
-    const cl_device_id * /* device_list */,
-    const char *         /* options */,
-    cl_uint              /* num_input_programs */,
-    const cl_program *   /* input_programs */,
-    void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
-    void *               /* user_data */,
-    cl_int *             /* errcode_ret */ );
-
-
-OCL_API(cl_int, CL_API_CALL, clUnloadPlatformCompiler)
-(   cl_platform_id /* platform */);
-
-OCL_API(cl_int, CL_API_CALL, clGetProgramInfo)
-(   cl_program         /* program */,
-    cl_program_info    /* param_name */,
-    size_t             /* param_value_size */,
-    void *             /* param_value */,
-    size_t *           /* param_value_size_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clGetProgramBuildInfo)
-(   cl_program            /* program */,
-    cl_device_id          /* device */,
-    cl_program_build_info /* param_name */,
-    size_t                /* param_value_size */,
-    void *                /* param_value */,
-    size_t *              /* param_value_size_ret */);
-
-/* Kernel Object APIs */
-OCL_API(cl_kernel, CL_API_CALL, clCreateKernel)
-(   cl_program      /* program */,
-    const char *    /* kernel_name */,
-    cl_int *        /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clCreateKernelsInProgram)
-(   cl_program     /* program */,
-    cl_uint        /* num_kernels */,
-    cl_kernel *    /* kernels */,
-    cl_uint *      /* num_kernels_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainKernel)
-(   cl_kernel    /* kernel */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseKernel)
-(   cl_kernel   /* kernel */);
-
-OCL_API(cl_int, CL_API_CALL, clSetKernelArg)
-(   cl_kernel    /* kernel */,
-    cl_uint      /* arg_index */,
-    size_t       /* arg_size */,
-    const void * /* arg_value */);
-
-OCL_API(cl_int, CL_API_CALL, clGetKernelInfo)
-(   cl_kernel       /* kernel */,
-    cl_kernel_info  /* param_name */,
-    size_t          /* param_value_size */,
-    void *          /* param_value */,
-    size_t *        /* param_value_size_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clGetKernelArgInfo)
-(   cl_kernel       /* kernel */,
-    cl_uint         /* arg_indx */,
-    cl_kernel_arg_info  /* param_name */,
-    size_t          /* param_value_size */,
-    void *          /* param_value */,
-    size_t *        /* param_value_size_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clGetKernelWorkGroupInfo)
-(   cl_kernel                  /* kernel */,
-    cl_device_id               /* device */,
-    cl_kernel_work_group_info  /* param_name */,
-    size_t                     /* param_value_size */,
-    void *                     /* param_value */,
-    size_t *                   /* param_value_size_ret */);
-
-/* Event Object APIs */
-OCL_API(cl_int, CL_API_CALL, clWaitForEvents)
-(   cl_uint             /* num_events */,
-    const cl_event *    /* event_list */);
-
-OCL_API(cl_int, CL_API_CALL, clGetEventInfo)
-(   cl_event         /* event */,
-    cl_event_info    /* param_name */,
-    size_t           /* param_value_size */,
-    void *           /* param_value */,
-    size_t *         /* param_value_size_ret */);
-
-OCL_API(cl_event, CL_API_CALL, clCreateUserEvent)
-(   cl_context    /* context */,
-    cl_int *      /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clRetainEvent)
-(   cl_event /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clReleaseEvent)
-(   cl_event /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clSetUserEventStatus)
-(   cl_event   /* event */,
-    cl_int     /* execution_status */);
-
-OCL_API(cl_int, CL_API_CALL, clSetEventCallback)
-(   cl_event    /* event */,
-    cl_int      /* command_exec_callback_type */,
-    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
-    void *      /* user_data */);
-
-/* Profiling APIs */
-OCL_API(cl_int, CL_API_CALL, clGetEventProfilingInfo)
-(   cl_event            /* event */,
-    cl_profiling_info   /* param_name */,
-    size_t              /* param_value_size */,
-    void *              /* param_value */,
-    size_t *            /* param_value_size_ret */);
-
-/* Flush and Finish APIs */
-OCL_API(cl_int, CL_API_CALL, clFlush)
-(   cl_command_queue /* command_queue */);
-
-OCL_API(cl_int, CL_API_CALL, clFinish)
-(   cl_command_queue /* command_queue */);
-
-/* Enqueued Commands APIs */
-OCL_API(cl_int, CL_API_CALL, clEnqueueReadBuffer)
-(   cl_command_queue    /* command_queue */,
-    cl_mem              /* buffer */,
-    cl_bool             /* blocking_read */,
-    size_t              /* offset */,
-    size_t              /* size */,
-    void *              /* ptr */,
-    cl_uint             /* num_events_in_wait_list */,
-    const cl_event *    /* event_wait_list */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueReadBufferRect)
-(   cl_command_queue    /* command_queue */,
-    cl_mem              /* buffer */,
-    cl_bool             /* blocking_read */,
-    const size_t *      /* buffer_offset */,
-    const size_t *      /* host_offset */,
-    const size_t *      /* region */,
-    size_t              /* buffer_row_pitch */,
-    size_t              /* buffer_slice_pitch */,
-    size_t              /* host_row_pitch */,
-    size_t              /* host_slice_pitch */,
-    void *              /* ptr */,
-    cl_uint             /* num_events_in_wait_list */,
-    const cl_event *    /* event_wait_list */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBuffer)
-(   cl_command_queue   /* command_queue */,
-    cl_mem             /* buffer */,
-    cl_bool            /* blocking_write */,
-    size_t             /* offset */,
-    size_t             /* size */,
-    const void *       /* ptr */,
-    cl_uint            /* num_events_in_wait_list */,
-    const cl_event *   /* event_wait_list */,
-    cl_event *         /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBufferRect)
-(   cl_command_queue    /* command_queue */,
-    cl_mem              /* buffer */,
-    cl_bool             /* blocking_write */,
-    const size_t *      /* buffer_offset */,
-    const size_t *      /* host_offset */,
-    const size_t *      /* region */,
-    size_t              /* buffer_row_pitch */,
-    size_t              /* buffer_slice_pitch */,
-    size_t              /* host_row_pitch */,
-    size_t              /* host_slice_pitch */,
-    const void *        /* ptr */,
-    cl_uint             /* num_events_in_wait_list */,
-    const cl_event *    /* event_wait_list */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueFillBuffer)
-(   cl_command_queue   /* command_queue */,
-    cl_mem             /* buffer */,
-    const void *       /* pattern */,
-    size_t             /* pattern_size */,
-    size_t             /* offset */,
-    size_t             /* size */,
-    cl_uint            /* num_events_in_wait_list */,
-    const cl_event *   /* event_wait_list */,
-    cl_event *         /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBuffer)
-(   cl_command_queue    /* command_queue */,
-    cl_mem              /* src_buffer */,
-    cl_mem              /* dst_buffer */,
-    size_t              /* src_offset */,
-    size_t              /* dst_offset */,
-    size_t              /* size */,
-    cl_uint             /* num_events_in_wait_list */,
-    const cl_event *    /* event_wait_list */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferRect)
-(   cl_command_queue    /* command_queue */,
-    cl_mem              /* src_buffer */,
-    cl_mem              /* dst_buffer */,
-    const size_t *      /* src_origin */,
-    const size_t *      /* dst_origin */,
-    const size_t *      /* region */,
-    size_t              /* src_row_pitch */,
-    size_t              /* src_slice_pitch */,
-    size_t              /* dst_row_pitch */,
-    size_t              /* dst_slice_pitch */,
-    cl_uint             /* num_events_in_wait_list */,
-    const cl_event *    /* event_wait_list */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueReadImage)
-(   cl_command_queue     /* command_queue */,
-    cl_mem               /* image */,
-    cl_bool              /* blocking_read */,
-    const size_t *       /* origin[3] */,
-    const size_t *       /* region[3] */,
-    size_t               /* row_pitch */,
-    size_t               /* slice_pitch */,
-    void *               /* ptr */,
-    cl_uint              /* num_events_in_wait_list */,
-    const cl_event *     /* event_wait_list */,
-    cl_event *           /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueWriteImage)
-(   cl_command_queue    /* command_queue */,
-    cl_mem              /* image */,
-    cl_bool             /* blocking_write */,
-    const size_t *      /* origin[3] */,
-    const size_t *      /* region[3] */,
-    size_t              /* input_row_pitch */,
-    size_t              /* input_slice_pitch */,
-    const void *        /* ptr */,
-    cl_uint             /* num_events_in_wait_list */,
-    const cl_event *    /* event_wait_list */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueFillImage)
-(   cl_command_queue   /* command_queue */,
-    cl_mem             /* image */,
-    const void *       /* fill_color */,
-    const size_t *     /* origin[3] */,
-    const size_t *     /* region[3] */,
-    cl_uint            /* num_events_in_wait_list */,
-    const cl_event *   /* event_wait_list */,
-    cl_event *         /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImage)
-(   cl_command_queue     /* command_queue */,
-    cl_mem               /* src_image */,
-    cl_mem               /* dst_image */,
-    const size_t *       /* src_origin[3] */,
-    const size_t *       /* dst_origin[3] */,
-    const size_t *       /* region[3] */,
-    cl_uint              /* num_events_in_wait_list */,
-    const cl_event *     /* event_wait_list */,
-    cl_event *           /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImageToBuffer)
-(   cl_command_queue /* command_queue */,
-    cl_mem           /* src_image */,
-    cl_mem           /* dst_buffer */,
-    const size_t *   /* src_origin[3] */,
-    const size_t *   /* region[3] */,
-    size_t           /* dst_offset */,
-    cl_uint          /* num_events_in_wait_list */,
-    const cl_event * /* event_wait_list */,
-    cl_event *       /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferToImage)
-(   cl_command_queue /* command_queue */,
-    cl_mem           /* src_buffer */,
-    cl_mem           /* dst_image */,
-    size_t           /* src_offset */,
-    const size_t *   /* dst_origin[3] */,
-    const size_t *   /* region[3] */,
-    cl_uint          /* num_events_in_wait_list */,
-    const cl_event * /* event_wait_list */,
-    cl_event *       /* event */);
-
-OCL_API(void *, CL_API_CALL, clEnqueueMapBuffer)
-(   cl_command_queue /* command_queue */,
-    cl_mem           /* buffer */,
-    cl_bool          /* blocking_map */,
-    cl_map_flags     /* map_flags */,
-    size_t           /* offset */,
-    size_t           /* size */,
-    cl_uint          /* num_events_in_wait_list */,
-    const cl_event * /* event_wait_list */,
-    cl_event *       /* event */,
-    cl_int *         /* errcode_ret */);
-
-OCL_API(void *, CL_API_CALL, clEnqueueMapImage)
-(   cl_command_queue  /* command_queue */,
-    cl_mem            /* image */,
-    cl_bool           /* blocking_map */,
-    cl_map_flags      /* map_flags */,
-    const size_t *    /* origin[3] */,
-    const size_t *    /* region[3] */,
-    size_t *          /* image_row_pitch */,
-    size_t *          /* image_slice_pitch */,
-    cl_uint           /* num_events_in_wait_list */,
-    const cl_event *  /* event_wait_list */,
-    cl_event *        /* event */,
-    cl_int *          /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueUnmapMemObject)
-(   cl_command_queue /* command_queue */,
-    cl_mem           /* memobj */,
-    void *           /* mapped_ptr */,
-    cl_uint          /* num_events_in_wait_list */,
-    const cl_event *  /* event_wait_list */,
-    cl_event *        /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueMigrateMemObjects)
-(   cl_command_queue       /* command_queue */,
-    cl_uint                /* num_mem_objects */,
-    const cl_mem *         /* mem_objects */,
-    cl_mem_migration_flags /* flags */,
-    cl_uint                /* num_events_in_wait_list */,
-    const cl_event *       /* event_wait_list */,
-    cl_event *             /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueNDRangeKernel)
-(   cl_command_queue /* command_queue */,
-    cl_kernel        /* kernel */,
-    cl_uint          /* work_dim */,
-    const size_t *   /* global_work_offset */,
-    const size_t *   /* global_work_size */,
-    const size_t *   /* local_work_size */,
-    cl_uint          /* num_events_in_wait_list */,
-    const cl_event * /* event_wait_list */,
-    cl_event *       /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueTask)
-(   cl_command_queue  /* command_queue */,
-    cl_kernel         /* kernel */,
-    cl_uint           /* num_events_in_wait_list */,
-    const cl_event *  /* event_wait_list */,
-    cl_event *        /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueNativeKernel)
-(   cl_command_queue  /* command_queue */,
-    void (CL_CALLBACK * /*user_func*/)(void *),
-    void *            /* args */,
-    size_t            /* cb_args */,
-    cl_uint           /* num_mem_objects */,
-    const cl_mem *    /* mem_list */,
-    const void **     /* args_mem_loc */,
-    cl_uint           /* num_events_in_wait_list */,
-    const cl_event *  /* event_wait_list */,
-    cl_event *        /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueMarkerWithWaitList)
-(   cl_command_queue /* command_queue */,
-    cl_uint           /* num_events_in_wait_list */,
-    const cl_event *  /* event_wait_list */,
-    cl_event *        /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueBarrierWithWaitList)
-(   cl_command_queue /* command_queue */,
-    cl_uint           /* num_events_in_wait_list */,
-    const cl_event *  /* event_wait_list */,
-    cl_event *        /* event */);
-
-
-/* Extension function access
-*
-* Returns the extension function address for the given function name,
-* or NULL if a valid function can not be found.  The client must
-* check to make sure the address is not NULL, before using or
-* calling the returned function address.
-*/
-OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddressForPlatform)
-(   cl_platform_id /* platform */,
-    const char *   /* func_name */);
-
-
-// Deprecated OpenCL 1.1 APIs
-OCL_API(cl_mem, CL_API_CALL, clCreateImage2D)
-(   cl_context              /* context */,
-    cl_mem_flags            /* flags */,
-    const cl_image_format * /* image_format */,
-    size_t                  /* image_width */,
-    size_t                  /* image_height */,
-    size_t                  /* image_row_pitch */,
-    void *                  /* host_ptr */,
-    cl_int *                /* errcode_ret */);
-
-OCL_API(cl_mem, CL_API_CALL, clCreateImage3D)
-(   cl_context              /* context */,
-    cl_mem_flags            /* flags */,
-    const cl_image_format * /* image_format */,
-    size_t                  /* image_width */,
-    size_t                  /* image_height */,
-    size_t                  /* image_depth */,
-    size_t                  /* image_row_pitch */,
-    size_t                  /* image_slice_pitch */,
-    void *                  /* host_ptr */,
-    cl_int *                /* errcode_ret */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueMarker)
-(   cl_command_queue    /* command_queue */,
-    cl_event *          /* event */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueWaitForEvents)
-(   cl_command_queue /* command_queue */,
-    cl_uint          /* num_events */,
-    const cl_event * /* event_list */);
-
-OCL_API(cl_int, CL_API_CALL, clEnqueueBarrier)
-(   cl_command_queue /* command_queue */);
-
-OCL_API(cl_int, CL_API_CALL, clUnloadCompiler)
-(   void);
-
-OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddress)
-(   const char * /* func_name */);
-
-#define OCL_DECLARE_FUNC(name) name##_func name
-
-typedef struct
-{
-    void *library;
-
-    OCL_DECLARE_FUNC( clBuildProgram );
-    OCL_DECLARE_FUNC( clCreateBuffer );
-    OCL_DECLARE_FUNC( clCreateCommandQueue );
-    OCL_DECLARE_FUNC( clCreateContext );
-    OCL_DECLARE_FUNC( clCreateImage2D );
-    OCL_DECLARE_FUNC( clCreateKernel );
-    OCL_DECLARE_FUNC( clCreateProgramWithBinary );
-    OCL_DECLARE_FUNC( clCreateProgramWithSource );
-    OCL_DECLARE_FUNC( clEnqueueCopyBuffer );
-    OCL_DECLARE_FUNC( clEnqueueMapBuffer );
-    OCL_DECLARE_FUNC( clEnqueueNDRangeKernel );
-    OCL_DECLARE_FUNC( clEnqueueReadBuffer );
-    OCL_DECLARE_FUNC( clEnqueueWriteBuffer );
-    OCL_DECLARE_FUNC( clFinish );
-    OCL_DECLARE_FUNC( clGetCommandQueueInfo );
-    OCL_DECLARE_FUNC( clGetDeviceIDs );
-    OCL_DECLARE_FUNC( clGetDeviceInfo );
-    OCL_DECLARE_FUNC( clGetKernelWorkGroupInfo );
-    OCL_DECLARE_FUNC( clGetPlatformIDs );
-    OCL_DECLARE_FUNC( clGetProgramBuildInfo );
-    OCL_DECLARE_FUNC( clGetProgramInfo );
-    OCL_DECLARE_FUNC( clGetSupportedImageFormats );
-    OCL_DECLARE_FUNC( clReleaseCommandQueue );
-    OCL_DECLARE_FUNC( clReleaseContext );
-    OCL_DECLARE_FUNC( clReleaseKernel );
-    OCL_DECLARE_FUNC( clReleaseMemObject );
-    OCL_DECLARE_FUNC( clReleaseProgram );
-    OCL_DECLARE_FUNC( clSetKernelArg );
-} x264_opencl_function_t;
-
-/* Number of downscale resolutions to use for motion search */
-#define NUM_IMAGE_SCALES 4
-
-/* Number of PCIe copies that can be queued before requiring a flush */
-#define MAX_FINISH_COPIES 1024
-
-/* Size (in bytes) of the page-locked buffer used for PCIe xfers */
-#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
-
-typedef struct
-{
-    x264_opencl_function_t *ocl;
-
-    cl_context       context;
-    cl_device_id     device;
-    cl_command_queue queue;
-
-    cl_program  lookahead_program;
-    cl_int      last_buf;
-
-    cl_mem      page_locked_buffer;
-    char       *page_locked_ptr;
-    int         pl_occupancy;
-
-    struct
-    {
-        void *src;
-        void *dest;
-        int   bytes;
-    } copies[MAX_FINISH_COPIES];
-    int         num_copies;
-
-    int         b_device_AMD_SI;
-    int         b_fatal_error;
-    int         lookahead_thread_pri;
-    int         opencl_thread_pri;
-
-    /* downscale lowres luma */
-    cl_kernel   downscale_hpel_kernel;
-    cl_kernel   downscale_kernel1;
-    cl_kernel   downscale_kernel2;
-    cl_mem      luma_16x16_image[2];
-
-    /* weightp filtering */
-    cl_kernel   weightp_hpel_kernel;
-    cl_kernel   weightp_scaled_images_kernel;
-    cl_mem      weighted_scaled_images[NUM_IMAGE_SCALES];
-    cl_mem      weighted_luma_hpel;
-
-    /* intra */
-    cl_kernel   memset_kernel;
-    cl_kernel   intra_kernel;
-    cl_kernel   rowsum_intra_kernel;
-    cl_mem      row_satds[2];
-
-    /* hierarchical motion estimation */
-    cl_kernel   hme_kernel;
-    cl_kernel   subpel_refine_kernel;
-    cl_mem      mv_buffers[2];
-    cl_mem      lowres_mv_costs;
-    cl_mem      mvp_buffer;
-
-    /* bidir */
-    cl_kernel   mode_select_kernel;
-    cl_kernel   rowsum_inter_kernel;
-    cl_mem      lowres_costs[2];
-    cl_mem      frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
-} x264_opencl_t;
-
-typedef struct
-{
-    x264_opencl_function_t *ocl;
-
-    cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
-    cl_mem luma_hpel;
-    cl_mem inv_qscale_factor;
-    cl_mem intra_cost;
-    cl_mem lowres_mvs0;
-    cl_mem lowres_mvs1;
-    cl_mem lowres_mv_costs0;
-    cl_mem lowres_mv_costs1;
-} x264_frame_opencl_t;
-
-typedef struct x264_frame x264_frame;
-
-x264_opencl_function_t *x264_opencl_load_library( void );
-void x264_opencl_close_library( x264_opencl_function_t *ocl );
-
-int x264_opencl_lookahead_init( x264_t *h );
-void x264_opencl_lookahead_delete( x264_t *h );
-
-void x264_opencl_frame_delete( x264_frame *frame );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/bidir.cl b/android/src/main/libenc/jni/libx264/common/opencl/bidir.cl
deleted file mode 100755
index 9c21626..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/bidir.cl
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Mode selection routines, select the least SATD cost mode for each lowres
- * macroblock.  When measuring B slices, this includes measuring the cost of
- * three bidir modes.  */
-
-/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
-int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
-                             int2 fencpos,
-                             read_only image2d_t fref0_planes,
-                             int2 qpos0,
-                             read_only image2d_t fref1_planes,
-                             int2 qpos1,
-                             int weight,
-                             local sum2_t *tmpp,
-                             int idx )
-{
-    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
-    sum2_t b0, b1, b2, b3;
-    sum2_t sum = 0;
-
-    // fencpos is full-pel position of original MB
-    // qpos0 is qpel position within reference frame 0
-    // qpos1 is qpel position within reference frame 1
-
-    int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
-    int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
-
-    int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
-    int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
-    int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
-
-    int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
-    int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
-
-    int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
-    int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
-    int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
-
-    uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
-    uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
-
-    uint vA, vB;
-    uint enc, ref0, ref1;
-    uint a0, a1;
-    const int weight2 = 64 - weight;
-
-#define READ_BIDIR_DIFF( OUT, X )\
-    enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
-    vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
-    vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
-    ref0 = rhadd( vA, vB );\
-    vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
-    vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
-    ref1 = rhadd( vA, vB );\
-    OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
-
-#define READ_DIFF_EX( OUT, a, b )\
-    READ_BIDIR_DIFF( a0, a );\
-    READ_BIDIR_DIFF( a1, b );\
-    OUT = a0 + (a1<<BITS_PER_SUM);
-
-#define ROW_8x4_SATD( a, b, c )\
-    fencpos.y += a;\
-    fref0Apos.y += b;\
-    fref0Bpos.y += b;\
-    fref1Apos.y += c;\
-    fref1Bpos.y += c;\
-    READ_DIFF_EX( b0, 0, 4 );\
-    READ_DIFF_EX( b1, 1, 5 );\
-    READ_DIFF_EX( b2, 2, 6 );\
-    READ_DIFF_EX( b3, 3, 7 );\
-    HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
-    HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
-    sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
-
-    ROW_8x4_SATD( 0, 0, 0 );
-    ROW_8x4_SATD( 4, 4, 4 );
-
-#undef READ_BIDIR_DIFF
-#undef READ_DIFF_EX
-#undef ROW_8x4_SATD
-
-    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
-}
-
-/*
- * mode selection - pick the least cost partition type for each 8x8 macroblock.
- * Intra, list0 or list1.  When measuring a B slice, also test three bidir
- * possibilities.
- *
- * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
- * hold many frames worth of motion vectors.  We must offset into the correct
- * location for this frame's vectors:
- *
- *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
- *   GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
- *
- * global launch dimensions for P slice estimate:  [mb_width, mb_height]
- * global launch dimensions for B slice estimate:  [mb_width * 4, mb_height]
- */
-kernel void mode_selection( read_only image2d_t   fenc_lowres,
-                            read_only image2d_t   fref0_planes,
-                            read_only image2d_t   fref1_planes,
-                            const global short2  *fenc_lowres_mvs0,
-                            const global short2  *fenc_lowres_mvs1,
-                            const global short2  *fref1_lowres_mvs0,
-                            const global int16_t *fenc_lowres_mv_costs0,
-                            const global int16_t *fenc_lowres_mv_costs1,
-                            const global uint16_t *fenc_intra_cost,
-                            global uint16_t      *lowres_costs,
-                            global int           *frame_stats,
-                            local int16_t        *cost_local,
-                            local sum2_t         *satd_local,
-                            int                   mb_width,
-                            int                   bipred_weight,
-                            int                   dist_scale_factor,
-                            int                   b,
-                            int                   p0,
-                            int                   p1,
-                            int                   lambda )
-{
-    int mb_x = get_global_id( 0 );
-    int b_bidir = b < p1;
-    if( b_bidir )
-    {
-        /* when mode_selection is run for B frames, it must perform BIDIR SATD
-         * measurements, so it is launched with four times as many threads in
-         * order to spread the work around more of the GPU.  And it can add
-         * padding threads in the X direction. */
-        mb_x >>= 2;
-        if( mb_x >= mb_width )
-            return;
-    }
-    int mb_y = get_global_id( 1 );
-    int mb_height = get_global_size( 1 );
-    int mb_count = mb_width * mb_height;
-    int mb_xy = mb_x + mb_y * mb_width;
-
-    /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
-    if( mb_x < 4 && mb_y == 0 )
-        frame_stats[mb_x] = 0;
-
-    int bcost = COST_MAX;
-    int list_used = 0;
-
-    if( !b_bidir )
-    {
-        int icost = fenc_intra_cost[mb_xy];
-        COPY2_IF_LT( bcost, icost, list_used, 0 );
-    }
-    if( b != p0 )
-    {
-        int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
-        COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
-    }
-    if( b != p1 )
-    {
-        int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
-        COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
-    }
-
-    if( b_bidir )
-    {
-        int2 coord = (int2)(mb_x, mb_y) << 3;
-        int mb_i = get_global_id( 0 ) & 3;
-        int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
-        cost_local += mb_in_group * 4;
-        satd_local += mb_in_group * 16;
-
-#define TRY_BIDIR( mv0, mv1, penalty )\
-{\
-    int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
-    int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
-    cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
-    int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
-    COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
-}
-
-        /* temporal prediction */
-        short2 dmv0, dmv1;
-        short2 mvr = fref1_lowres_mvs0[mb_xy];
-        dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
-        dmv1 = dmv0 - mvr;
-        TRY_BIDIR( dmv0, dmv1, 0 )
-
-        if( as_uint( dmv0 ) || as_uint( dmv1 ) )
-        {
-            /* B-direct prediction */
-            dmv0 = 0; dmv1 = 0;
-            TRY_BIDIR( dmv0, dmv1, 0 );
-        }
-
-        /* L0+L1 prediction */
-        dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
-        dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
-        TRY_BIDIR( dmv0, dmv1, 5 );
-#undef TRY_BIDIR
-    }
-
-    lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
-}
-
-/*
- * parallel sum inter costs
- *
- * global launch dimensions: [256, mb_height]
- */
-kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
-                            const global uint16_t *inv_qscale_factor,
-                            global int           *fenc_row_satds,
-                            global int           *frame_stats,
-                            int                   mb_width,
-                            int                   bframe_bias,
-                            int                   b,
-                            int                   p0,
-                            int                   p1 )
-{
-    int y = get_global_id( 1 );
-    int mb_height = get_global_size( 1 );
-
-    int row_satds = 0;
-    int cost_est = 0;
-    int cost_est_aq = 0;
-    int intra_mbs = 0;
-
-    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
-    {
-        int mb_xy = x + y * mb_width;
-        int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
-        int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
-        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
-
-        if( list == 0 && b_frame_score_mb )
-            intra_mbs++;
-
-        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
-
-        row_satds += cost_aq;
-
-        if( b_frame_score_mb )
-        {
-            cost_est += cost;
-            cost_est_aq += cost_aq;
-        }
-    }
-
-    local int buffer[256];
-    int x = get_global_id( 0 );
-
-    row_satds   = parallel_sum( row_satds, x, buffer );
-    cost_est    = parallel_sum( cost_est, x, buffer );
-    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
-    intra_mbs   = parallel_sum( intra_mbs, x, buffer );
-
-    if( b != p1 )
-        // Use floating point math to avoid 32bit integer overflow conditions
-        cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
-
-    if( get_global_id( 0 ) == 0 )
-    {
-        fenc_row_satds[y] = row_satds;
-        atomic_add( frame_stats + COST_EST, cost_est );
-        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
-        atomic_add( frame_stats + INTRA_MBS, intra_mbs );
-    }
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/downscale.cl b/android/src/main/libenc/jni/libx264/common/opencl/downscale.cl
deleted file mode 100755
index f7ceeb8..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/downscale.cl
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
- *
- * --
- *
- * fenc_img is an output image (area of memory referenced through a texture
- * cache). A read of any pixel location (x,y) returns four pixel values:
- *
- * val.s0 = P(x,y)
- * val.s1 = P(x+1,y)
- * val.s2 = P(x+2,y)
- * val.s3 = P(x+3,y)
- *
- * This is a 4x replication of the lowres pixels, a trade-off between memory
- * size and read latency.
- *
- * --
- *
- * hpel_planes is an output image that contains the four HPEL planes used for
- * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
- * the four planar values C | V | H | F
- *
- * launch dimensions:  [lowres-width, lowres-height]
- */
-kernel void downscale_hpel( const global pixel *fenc,
-                            write_only image2d_t fenc_img,
-                            write_only image2d_t hpel_planes,
-                            int stride )
-{
-    int x = get_global_id( 0 );
-    int y = get_global_id( 1 );
-    uint4 values;
-
-    fenc += y * stride * 2;
-    const global pixel *src1 = fenc + stride;
-    const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
-    int2 pos = (int2)(x, y);
-    pixel right, left;
-
-    right = rhadd( fenc[x*2], src1[x*2] );
-    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
-    values.s0 = rhadd( right, left );           // F
-
-    right = rhadd( fenc[2*x+1], src1[2*x+1] );
-    left  = rhadd( fenc[2*x+2], src1[2*x+2] );
-    values.s1 = rhadd( right, left );           // H
-
-    right = rhadd( src1[2*x], src2[2*x] );
-    left  = rhadd( src1[2*x+1], src2[2*x+1] );
-    values.s2 = rhadd( right, left );           // V
-
-    right = rhadd( src1[2*x+1], src2[2*x+1] );
-    left  = rhadd( src1[2*x+2], src2[2*x+2] );
-    values.s3 = rhadd( right, left );           // C
-
-    uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
-    write_imageui( hpel_planes, pos, val );
-
-    x = select( x, x+1, x+1 < get_global_size( 0 ) );
-    right = rhadd( fenc[x*2], src1[x*2] );
-    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
-    values.s1 = rhadd( right, left );
-
-    x = select( x, x+1, x+1 < get_global_size( 0 ) );
-    right = rhadd( fenc[x*2], src1[x*2] );
-    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
-    values.s2 = rhadd( right, left );
-
-    x = select( x, x+1, x+1 < get_global_size( 0 ) );
-    right = rhadd( fenc[x*2], src1[x*2] );
-    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
-    values.s3 = rhadd( right, left );
-
-    write_imageui( fenc_img, pos, values );
-}
-
-/*
- * downscale lowres hierarchical motion search image, copy from one image to
- * another decimated image.  This kernel is called iteratively to generate all
- * of the downscales.
- *
- * launch dimensions:  [lower_res width, lower_res height]
- */
-kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
-{
-    int x = get_global_id( 0 );
-    int y = get_global_id( 1 );
-    int2 pos = (int2)(x, y);
-    int gs = get_global_size( 0 );
-    uint4 top, bot, values;
-    top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
-    bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
-    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
-
-    /* these select statements appear redundant, and they should be, but tests break when
-     * they are not here.  I believe this was caused by a driver bug
-     */
-    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
-    top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
-    bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
-    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
-    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
-    write_imageui( lower_res, pos, (uint4)(values) );
-}
-
-/*
- * Second copy of downscale kernel, no differences. This is a (no perf loss)
- * workaround for a scheduling bug in current Tahiti drivers.  This bug has
- * theoretically been fixed in the July 2012 driver release from AMD.
- */
-kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
-{
-    int x = get_global_id( 0 );
-    int y = get_global_id( 1 );
-    int2 pos = (int2)(x, y);
-    int gs = get_global_size( 0 );
-    uint4 top, bot, values;
-    top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
-    bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
-    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
-
-    // see comment in above function copy
-    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
-    top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
-    bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
-    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
-    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
-    write_imageui( lower_res, pos, (uint4)(values) );
-}
-
-/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
-kernel void memset_int16( global int16_t *buf, int16_t value )
-{
-    buf[get_global_id( 0 )] = value;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/intra.cl b/android/src/main/libenc/jni/libx264/common/opencl/intra.cl
deleted file mode 100755
index d55978b..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/intra.cl
+++ /dev/null
@@ -1,1072 +0,0 @@
-/* Lookahead lowres intra analysis
- *
- * Each intra analysis function has been implemented twice, once for scalar GPUs
- * (NV) and once for vectorized GPUs (AMD pre-Southern Islands).  x264 detects
- * the GPU type and sets the -DVECTORIZE compile flag accordingly.
- *
- * All the intra analysis functions were based on their C versions in pixel.c
- * and produce the exact same results.
- */
-
-/* force all clamp arguments and return value to int, prevent ambiguous types */
-#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
-
-#if VECTORIZE
-int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
-{
-    int8 a_v, d_v;
-    int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
-    int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
-
-    d_v = convert_int8( vload8( 0, data ) );
-    a_v.s01234567 = (d_v - pr0).s04152637;
-    HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
-
-    data += data_stride;
-    d_v = convert_int8( vload8( 0, data ) );
-    a_v.s01234567 = (d_v - pr1).s04152637;
-    HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
-
-    data += data_stride;
-    d_v = convert_int8( vload8( 0, data ) );
-    a_v.s01234567 = (d_v - pr2).s04152637;
-    HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
-
-    data += data_stride;
-    d_v = convert_int8( vload8( 0, data ) );
-    a_v.s01234567 = (d_v - pr3).s04152637;
-    HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
-
-    uint8 sum_v;
-
-    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
-    sum_v = abs( a_v );
-
-    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
-    sum_v += abs( a_v );
-
-    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
-    sum_v += abs( a_v );
-
-    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
-    sum_v += abs( a_v );
-
-    uint4 sum2 = sum_v.hi + sum_v.lo;
-    uint2 sum3 = sum2.hi + sum2.lo;
-    return ( sum3.hi + sum3.lo ) >> 1;
-}
-#else
-SATD_C_8x4_Q( satd_8x4_lp, const local, private )
-#endif
-
-/****************************************************************************
- * 8x8 prediction for intra luma block
- ****************************************************************************/
-
-#define F1            rhadd
-#define F2( a, b, c ) ( a+2*b+c+2 )>>2
-
-#if VECTORIZE
-int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
-{
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
-    pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
-    pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
-    pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
-    pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
-    pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
-    pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-
-    pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
-    pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
-    pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
-    pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
-    pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
-    pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-
-    pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
-    pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
-    pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
-    pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
-    pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-    pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
-
-    pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
-    pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
-    pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
-    pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-    pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
-    pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    // Lower half of pred[]
-    pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
-    pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
-    pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-    pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
-    pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
-    pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
-
-    pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
-    pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-    pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
-    pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
-    pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
-    pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
-
-    pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
-    pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-    pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
-    pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
-    pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
-    pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
-    pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
-
-    pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
-    pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
-    pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
-    pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
-    pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
-    pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
-    pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
-    pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
-
-    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
-{
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr3.s0 = F2( left[1], left[2], left[3] );
-    pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
-    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
-    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
-    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
-    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
-    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
-    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
-    pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
-    pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
-    pr0.s7 = F2( top[5], top[6], top[7] );
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    // Lower half of pred[]
-    pr3.s0 = F2( left[5], left[6], left[7] );
-    pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
-    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
-    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
-    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
-    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
-    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
-    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
-    pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
-    pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
-    pr0.s7 = F2( top[1], top[2], top[3] );
-    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
-{
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr2.s0 = F2( left[1], left[0], left_top );
-    pr3.s0 = F2( left[2], left[1], left[0] );
-    pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
-    pr0.s0 = pr2.s1 = F1( left_top, top[0] );
-    pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
-    pr0.s1 = pr2.s2 = F1( top[0], top[1] );
-    pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
-    pr0.s2 = pr2.s3 = F1( top[1], top[2] );
-    pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
-    pr0.s3 = pr2.s4 = F1( top[2], top[3] );
-    pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
-    pr0.s4 = pr2.s5 = F1( top[3], top[4] );
-    pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
-    pr0.s5 = pr2.s6 = F1( top[4], top[5] );
-    pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
-    pr0.s6 = pr2.s7 = F1( top[5], top[6] );
-    pr1.s7 = F2( top[5], top[6], top[7] );
-    pr0.s7 = F1( top[6], top[7] );
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    // Lower half of pred[]
-    pr2.s0 = F2( left[5], left[4], left[3] );
-    pr3.s0 = F2( left[6], left[5], left[4] );
-    pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
-    pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
-    pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
-    pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
-    pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
-    pr0.s2 = pr2.s3 = F1( left_top, top[0] );
-    pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
-    pr0.s3 = pr2.s4 = F1( top[0], top[1] );
-    pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
-    pr0.s4 = pr2.s5 = F1( top[1], top[2] );
-    pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
-    pr0.s5 = pr2.s6 = F1( top[2], top[3] );
-    pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
-    pr0.s6 = pr2.s7 = F1( top[3], top[4] );
-    pr1.s7 = F2( top[3], top[4], top[5] );
-    pr0.s7 = F1( top[4], top[5] );
-    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
-#undef PRED
-}
-
-int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
-{
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
-    pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
-    pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
-    pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
-
-    pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
-    pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
-    pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
-    pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
-
-    pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
-    pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
-    pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
-    pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
-
-    pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
-    pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
-    pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
-    pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    // Lower half of pred[]
-    pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
-    pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
-    pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
-
-    pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-    pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
-    pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
-
-    pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
-    pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-    pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
-
-    pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
-    pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
-    pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-    pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
-{
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr0.s0 = F1( top[0], top[1] );
-    pr1.s0 = F2( top[0], top[1], top[2] );
-    pr2.s0 = pr0.s1 = F1( top[1], top[2] );
-    pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
-    pr2.s1 = pr0.s2 = F1( top[2], top[3] );
-    pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
-    pr2.s2 = pr0.s3 = F1( top[3], top[4] );
-    pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
-    pr2.s3 = pr0.s4 = F1( top[4], top[5] );
-    pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
-    pr2.s4 = pr0.s5 = F1( top[5], top[6] );
-    pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
-    pr2.s5 = pr0.s6 = F1( top[6], top[7] );
-    pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
-    pr2.s6 = pr0.s7 = F1( top[7], top[8] );
-    pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
-    pr2.s7 = F1( top[8], top[9] );
-    pr3.s7 = F2( top[8], top[9], top[10] );
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    // Lower half of pred[]
-    pr0.s0 = F1( top[2], top[3] );
-    pr1.s0 = F2( top[2], top[3], top[4] );
-    pr2.s0 = pr0.s1 = F1( top[3], top[4] );
-    pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
-    pr2.s1 = pr0.s2 = F1( top[4], top[5] );
-    pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
-    pr2.s2 = pr0.s3 = F1( top[5], top[6] );
-    pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
-    pr2.s3 = pr0.s4 = F1( top[6], top[7] );
-    pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
-    pr2.s4 = pr0.s5 = F1( top[7], top[8] );
-    pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
-    pr2.s5 = pr0.s6 = F1( top[8], top[9] );
-    pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
-    pr2.s6 = pr0.s7 = F1( top[9], top[10] );
-    pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
-    pr2.s7 = F1( top[10], top[11] );
-    pr3.s7 = F2( top[10], top[11], top[12] );
-    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
-{
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
-    pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
-    pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-
-    pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
-    pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-    pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
-
-    pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
-    pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-    pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
-    pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
-
-    pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
-    pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
-    pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
-    pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    // Lower half of pred[]
-    pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
-    pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
-    pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
-    pr0.s6 = left[7]; pr0.s7 = left[7];
-
-    pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
-    pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
-    pr1.s4 = left[7]; pr1.s5 = left[7];
-    pr1.s6 = left[7]; pr1.s7 = left[7];
-
-    pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
-    pr2.s2 = left[7]; pr2.s3 = left[7];
-    pr2.s4 = left[7]; pr2.s5 = left[7];
-    pr2.s6 = left[7]; pr2.s7 = left[7];
-
-    pr3 = (int8)left[7];
-
-    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8c_h( const local pixel *src, int src_stride )
-{
-    const local pixel *src_l = src;
-    int8 pr0, pr1, pr2, pr3;
-
-    // Upper half of pred[]
-    pr0 = (int8)src[-1]; src += src_stride;
-    pr1 = (int8)src[-1]; src += src_stride;
-    pr2 = (int8)src[-1]; src += src_stride;
-    pr3 = (int8)src[-1]; src += src_stride;
-    int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
-
-    //Lower half of pred[]
-    pr0 = (int8)src[-1]; src += src_stride;
-    pr1 = (int8)src[-1]; src += src_stride;
-    pr2 = (int8)src[-1]; src += src_stride;
-    pr3 = (int8)src[-1];
-    return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8c_v( const local pixel *src, int src_stride )
-{
-    int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
-    return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
-           satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
-}
-
-int x264_predict_8x8c_p( const local pixel *src, int src_stride )
-{
-    int H = 0, V = 0;
-    for( int i = 0; i < 4; i++ )
-    {
-        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
-        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
-    }
-
-    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
-    int b = (17 * H + 16) >> 5;
-    int c = (17 * V + 16) >> 5;
-    int i00 = a - 3 * b - 3 * c + 16;
-
-    // Upper half of pred[]
-    int pix = i00;
-    int8 pr0, pr1, pr2, pr3;
-    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-
-    pix = i00;
-    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-
-    pix = i00;
-    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-
-    pix = i00;
-    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
-
-    //Lower half of pred[]
-    pix = i00;
-    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-
-    pix = i00;
-    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-
-    pix = i00;
-    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-
-    pix = i00;
-    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
-    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
-    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
-}
-
-int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
-{
-    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-    for( int i = 0; i < 4; i++ )
-    {
-        s0 += src[i - src_stride];
-        s1 += src[i + 4 - src_stride];
-        s2 += src[-1 + i * src_stride];
-        s3 += src[-1 + (i+4)*src_stride];
-    }
-
-    // Upper half of pred[]
-    int8 dc0;
-    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
-    dc0.hi = (int4)( (s1 + 2) >> 2 );
-    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
-
-    // Lower half of pred[]
-    dc0.lo = (int4)( (s3 + 2) >> 2 );
-    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
-    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
-}
-
-#else  /* not vectorized: private is cheap registers are scarce */
-
-int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
-{
-    private pixel pred[32];
-
-    // Upper half of pred[]
-    for( int y = 0; y < 4; y++ )
-    {
-        for( int x = 0; x < 8; x++ )
-        {
-            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
-            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
-        }
-    }
-    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
-    //Lower half of pred[]
-    for( int y = 4; y < 8; y++ )
-    {
-        for( int x = 0; x < 8; x++ )
-        {
-            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
-            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
-        }
-    }
-    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-}
-
-int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
-{
-    private pixel pred[32];
-#define PRED( x, y ) pred[(x) + (y)*8]
-    // Upper half of pred[]
-    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
-    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
-    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
-    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
-    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
-    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
-    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
-    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
-    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
-    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
-    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
-    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
-
-    // Lower half of pred[]
-    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
-    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
-    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
-    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
-    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
-    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
-    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
-    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
-    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
-    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
-    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-#undef PRED
-}
-
-int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
-{
-    private pixel pred[32];
-#define PRED( x, y ) pred[(x) + (y)*8]
-    // Upper half of pred[]
-    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
-    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
-    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
-    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
-    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
-    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
-    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
-    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
-    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
-    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
-    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
-    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
-    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
-    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
-    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
-    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
-    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
-    PRED( 7, 0 ) = F1( top[6], top[7] );
-    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
-
-    //Lower half of pred[]
-    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
-    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
-    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
-    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
-    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
-    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
-    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
-    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
-    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
-    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
-    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
-    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
-    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
-    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
-    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
-    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
-    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
-    PRED( 7, 0 ) = F1( top[4], top[5] );
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-#undef PRED
-}
-
-inline uint32_t pack16to32( uint32_t a, uint32_t b )
-{
-    return a + (b << 16);
-}
-
-inline uint32_t pack8to16( uint32_t a, uint32_t b )
-{
-    return a + (b << 8);
-}
-
-int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
-{
-    private pixel pred[32];
-    int satd;
-    int p1 =  pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
-    int p2 =  pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
-    int p3 =  pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
-    int p4 =  pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
-    int p5 =  pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
-    int p6 =  pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
-    int p7 =  pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
-    int p8 =  pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
-    int p9 =  pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
-    int p10 =  pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
-    int p11 =  pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
-    // Upper half of pred[]
-    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
-    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
-    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
-    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
-    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
-    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
-    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
-    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
-    satd = satd_8x4_lp( src, src_stride, pred, 8 );
-    // Lower half of pred[]
-    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
-    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
-    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
-    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
-    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
-    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
-    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
-    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-}
-
-int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
-{
-    private pixel pred[32];
-    int satd;
-#define PRED( x, y ) pred[(x) + (y)*8]
-    // Upper half of pred[]
-    PRED( 0, 0 ) = F1( top[0], top[1] );
-    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
-    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
-    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
-    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
-    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
-    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
-    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
-    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
-    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
-    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
-    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
-    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
-    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
-    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
-    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
-    PRED( 7, 2 ) = F1( top[8], top[9] );
-    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
-    satd = satd_8x4_lp( src, src_stride, pred, 8 );
-    // Lower half of pred[]
-    PRED( 0, 0 ) = F1( top[2], top[3] );
-    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
-    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
-    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
-    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
-    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
-    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
-    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
-    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
-    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
-    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
-    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
-    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
-    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
-    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
-    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
-    PRED( 7, 2 ) = F1( top[10], top[11] );
-    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-#undef PRED
-}
-
-int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
-{
-    private pixel pred[32];
-    int satd;
-    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
-    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
-    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
-    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
-    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
-    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
-    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
-    int p8 = pack8to16( left[7], left[7] );
-    // Upper half of pred[]
-    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
-    satd = satd_8x4_lp( src, src_stride, pred, 8 );
-    // Lower half of pred[]
-    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
-    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-}
-
-int x264_predict_8x8c_h( const local pixel *src, int src_stride )
-{
-    private pixel pred[32];
-    const local pixel *src_l = src;
-
-    // Upper half of pred[]
-    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
-    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
-    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
-    vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
-    int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
-
-    // Lower half of pred[]
-    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
-    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
-    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
-    vstore8( (uchar8)(src[-1]), 3, pred );
-    return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
-}
-
-int x264_predict_8x8c_v( const local pixel *src, int src_stride )
-{
-    private pixel pred[32];
-    uchar16 v16;
-    v16.lo = vload8( 0, &src[-src_stride] );
-    v16.hi = vload8( 0, &src[-src_stride] );
-
-    vstore16( v16, 0, pred );
-    vstore16( v16, 1, pred );
-
-    return satd_8x4_lp( src, src_stride, pred, 8 ) +
-           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
-}
-
-int x264_predict_8x8c_p( const local pixel *src, int src_stride )
-{
-    int H = 0, V = 0;
-    private pixel pred[32];
-    int satd;
-
-    for( int i = 0; i < 4; i++ )
-    {
-        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
-        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
-    }
-
-    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
-    int b = (17 * H + 16) >> 5;
-    int c = (17 * V + 16) >> 5;
-    int i00 = a - 3 * b - 3 * c + 16;
-
-    // Upper half of pred[]
-    for( int y = 0; y < 4; y++ )
-    {
-        int pix = i00;
-        for( int x = 0; x < 8; x++ )
-        {
-            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
-            pix += b;
-        }
-        i00 += c;
-    }
-    satd = satd_8x4_lp( src, src_stride, pred, 8 );
-    // Lower half of pred[]
-    for( int y = 0; y < 4; y++ )
-    {
-        int pix = i00;
-        for( int x = 0; x < 8; x++ )
-        {
-            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
-            pix += b;
-        }
-        i00 += c;
-    }
-    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-    return satd;
-}
-
-int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
-{
-    private pixel pred[32];
-    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-    for( int i = 0; i < 4; i++ )
-    {
-        s0 += src[i - src_stride];
-        s1 += src[i + 4 - src_stride];
-        s2 += src[-1 + i * src_stride];
-        s3 += src[-1 + (i+4)*src_stride];
-    }
-
-    // Upper half of pred[]
-    uchar8 dc0;
-    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
-    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
-    vstore8( dc0, 0, pred );
-    vstore8( dc0, 1, pred );
-    vstore8( dc0, 2, pred );
-    vstore8( dc0, 3, pred );
-    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
-
-    // Lower half of pred[]
-    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
-    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
-    vstore8( dc0, 0, pred );
-    vstore8( dc0, 1, pred );
-    vstore8( dc0, 2, pred );
-    vstore8( dc0, 3, pred );
-    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
-}
-#endif
-
-/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
- *
- * Loads 33 macroblocks plus the pixels directly above them into local memory,
- * padding where necessary with edge pixels.  It then cooperatively calculates
- * smoothed top and left pixels for use in some of the analysis.
- *
- * Then groups of 32 threads each calculate a single intra mode for each 8x8
- * block.  Since consecutive threads are calculating the same intra mode there
- * is no code-path divergence.  8 intra costs are calculated simultaneously.  If
- * the "slow" argument is not zero, the final two (least likely) intra modes are
- * tested in a second pass.  The slow mode is only enabled for presets slow,
- * slower, and placebo.
- *
- * This allows all of the pixels functions to read pixels from local memory, and
- * avoids re-fetching edge pixels from global memory.  And it allows us to
- * calculate all of the intra mode costs simultaneously without branch divergence.
- *
- * Local dimension:    [ 32, 8 ]
- * Global dimensions:  [ paddedWidth, height ] */
-kernel void mb_intra_cost_satd_8x8( read_only image2d_t  fenc,
-                                    global uint16_t     *fenc_intra_cost,
-                                    global int          *frame_stats,
-                                    int                  lambda,
-                                    int                  mb_width,
-                                    int                  slow )
-{
-#define CACHE_STRIDE 265
-#define BLOCK_OFFSET 266
-    local pixel cache[2385];
-    local int cost_buf[32];
-    local pixel top[32 * 16];
-    local pixel left[32 * 8];
-    local pixel left_top[32];
-
-    int lx = get_local_id( 0 );
-    int ly = get_local_id( 1 );
-    int gx = get_global_id( 0 );
-    int gy = get_global_id( 1 );
-    int gidx = get_group_id( 0 );
-    int gidy = get_group_id( 1 );
-    int linear_id = ly * get_local_size( 0 ) + lx;
-    int satd = COST_MAX;
-    int basex = gidx << 8;
-    int basey = (gidy << 3) - 1;
-
-    /* Load 33 8x8 macroblocks and the pixels above them into local cache */
-    for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ )
-    {
-        int x = linear_id << 2;
-        uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
-        cache[y * CACHE_STRIDE + 1 + x] = data.s0;
-        cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
-        cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
-        cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
-    }
-    /* load pixels on left edge */
-    if( linear_id < 9 )
-        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
-
-    barrier( CLK_LOCAL_MEM_FENCE );
-
-    // Cooperatively build the top edge for the macroblock using lowpass filter
-    int j = ly;
-    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
-                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
-                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
-    j += 8;
-    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
-                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
-                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
-    // Cooperatively build the left edge for the macroblock using lowpass filter
-    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
-                        2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
-                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
-    // One left_top per macroblock
-    if( 0 == ly )
-    {
-        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
-                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
-        cost_buf[lx] = COST_MAX;
-    }
-    barrier( CLK_LOCAL_MEM_FENCE );
-
-    // each warp/wavefront generates a different prediction type; no divergence
-    switch( ly )
-    {
-        case 0:
-            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
-            break;
-        case 1:
-            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
-            break;
-        case 2:
-            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
-            break;
-        case 3:
-            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
-            break;
-        case 4:
-            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
-            break;
-        case 5:
-            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
-            break;
-        case 6:
-            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
-            break;
-        case 7:
-            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
-            break;
-        default:
-            break;
-    }
-    atom_min( &cost_buf[lx], satd );
-    if( slow )
-    {
-        // Do the remaining two (least likely) prediction modes
-        switch( ly )
-        {
-            case 0: // DDL
-                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
-                atom_min( &cost_buf[lx], satd );
-                break;
-            case 1: // VL
-                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
-                atom_min( &cost_buf[lx], satd );
-                break;
-            default:
-                break;
-        }
-    }
-    barrier( CLK_LOCAL_MEM_FENCE );
-
-    if( (0 == ly) && (gx < mb_width) )
-        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
-
-    // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
-    if( gx < 2 && gy == 0 )
-        frame_stats[gx] = 0;
-#undef CACHE_STRIDE
-#undef BLOCK_OFFSET
-}
-
-/*
- * parallel sum intra costs
- *
- * global launch dimensions: [256, mb_height]
- */
-kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
-                            const global uint16_t *inv_qscale_factor,
-                            global int           *fenc_row_satds,
-                            global int           *frame_stats,
-                            int                   mb_width )
-{
-    int y = get_global_id( 1 );
-    int mb_height = get_global_size( 1 );
-
-    int row_satds = 0;
-    int cost_est = 0;
-    int cost_est_aq = 0;
-
-    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
-    {
-        int mb_xy = x + y * mb_width;
-        int cost = fenc_intra_cost[mb_xy];
-        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
-        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
-
-        row_satds += cost_aq;
-        if( b_frame_score_mb )
-        {
-            cost_est += cost;
-            cost_est_aq += cost_aq;
-        }
-    }
-
-    local int buffer[256];
-    int x = get_global_id( 0 );
-
-    row_satds   = parallel_sum( row_satds, x, buffer );
-    cost_est    = parallel_sum( cost_est, x, buffer );
-    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
-
-    if( get_global_id( 0 ) == 0 )
-    {
-        fenc_row_satds[y] = row_satds;
-        atomic_add( frame_stats + COST_EST,    cost_est );
-        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
-    }
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/motionsearch.cl b/android/src/main/libenc/jni/libx264/common/opencl/motionsearch.cl
deleted file mode 100755
index 77a07ce..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/motionsearch.cl
+++ /dev/null
@@ -1,249 +0,0 @@
-/* Hierarchical (iterative) OpenCL lowres motion search */
-
-inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
-{
-    /* edge macroblocks might not have a direct descendant, use nearest */
-    x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
-    y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
-    return (mb_width>>1) * y + x;
-}
-
-/* Four threads calculate an 8x8 SAD.  Each does two rows */
-int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
-{
-    frefpos.y += idx << 1;
-    fencpos.y += idx << 1;
-    int cost = 0;
-    if( frefpos.x < 0 )
-    {
-        /* slow path when MV goes past left edge.  The GPU clamps reads from
-         * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
-         * want are [0, 0, 1, 2]
-         */
-        for( int y = 0; y < 2; y++ )
-        {
-            for( int x = 0; x < 8; x++ )
-            {
-                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
-                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
-                cost += abs_diff( enc, ref );
-            }
-        }
-    }
-    else
-    {
-        uint4 enc, ref, costs = 0;
-        enc = read_imageui( fenc, sampler, fencpos );
-        ref = read_imageui( fref, sampler, frefpos );
-        costs += abs_diff( enc, ref );
-        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
-        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
-        costs += abs_diff( enc, ref );
-        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
-        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
-        costs += abs_diff( enc, ref );
-        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
-        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
-        costs += abs_diff( enc, ref );
-        cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
-    }
-    costs[idx] = cost;
-    return costs[0] + costs[1] + costs[2] + costs[3];
-}
-
-/* One thread performs 8x8 SAD */
-int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
-{
-    if( frefpos.x < 0 )
-    {
-        /* slow path when MV goes past left edge */
-        int cost = 0;
-        for( int y = 0; y < 8; y++ )
-        {
-            for( int x = 0; x < 8; x++ )
-            {
-                uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
-                uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
-                cost += abs_diff( enc, ref );
-            }
-        }
-        return cost;
-    }
-    else
-    {
-        uint4 enc, ref, cost = 0;
-        for( int y = 0; y < 8; y++ )
-        {
-            for( int x = 0; x < 8; x += 4 )
-            {
-                enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
-                ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
-                cost += abs_diff( enc, ref );
-            }
-        }
-        return cost.s0 + cost.s1 + cost.s2 + cost.s3;
-    }
-}
-/*
- * hierarchical motion estimation
- *
- * Each kernel launch is a single iteration
- *
- * MB per work group is determined by lclx / 4 * lcly
- *
- * global launch dimensions:  [mb_width * 4, mb_height]
- */
-kernel void hierarchical_motion( read_only image2d_t  fenc,
-                                 read_only image2d_t  fref,
-                                 const global short2 *in_mvs,
-                                 global short2       *out_mvs,
-                                 global int16_t      *out_mv_costs,
-                                 global short2       *mvp_buffer,
-                                 local int16_t       *cost_local,
-                                 local short2        *mvc_local,
-                                 int                  mb_width,
-                                 int                  lambda,
-                                 int                  me_range,
-                                 int                  scale,
-                                 int                  b_shift_index,
-                                 int                  b_first_iteration,
-                                 int                  b_reverse_references )
-{
-    int mb_x = get_global_id( 0 ) >> 2;
-    if( mb_x >= mb_width )
-        return;
-    int mb_height = get_global_size( 1 );
-    int mb_i = get_global_id( 0 ) & 3;
-    int mb_y = get_global_id( 1 );
-    int mb_xy = mb_y * mb_width + mb_x;
-    const int mb_size = 8;
-    int2 coord = (int2)(mb_x, mb_y) * mb_size;
-
-    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
-    cost_local += 4 * mb_in_group;
-
-    int i_mvc = 0;
-    mvc_local += 4 * mb_in_group;
-    mvc_local[mb_i] = 0;
-    int2 mvp =0;
-
-    if( !b_first_iteration )
-    {
-#define MVC( DX, DY )\
-    {\
-        int px = mb_x + DX;\
-        int py = mb_y + DY;\
-        mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
-                                           in_mvs[mb_width * py + px];\
-        mvc_local[i_mvc] >>= (short) scale;\
-        i_mvc++;\
-    }
-        /* Find MVP from median of MVCs */
-        if( b_reverse_references )
-        {
-            /* odd iterations: derive MVP from down and right */
-            if( mb_x < mb_width - 1 )
-                MVC( 1, 0 );
-            if( mb_y < mb_height - 1 )
-            {
-                MVC( 0, 1 );
-                if( mb_x > b_shift_index )
-                    MVC( -1, 1 );
-                if( mb_x < mb_width - 1 )
-                    MVC( 1, 1 );
-            }
-        }
-        else
-        {
-            /* even iterations: derive MVP from up and left */
-            if( mb_x > 0 )
-                MVC( -1, 0 );
-            if( mb_y > 0 )
-            {
-                MVC( 0, -1 );
-                if( mb_x < mb_width - 1 )
-                    MVC( 1, -1 );
-                if( mb_x > b_shift_index )
-                    MVC( -1, -1 );
-            }
-        }
-#undef MVC
-        mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
-    }
-    /* current mvp matches the previous mvp and we have not changed scale.  We know
-     * we're going to arrive at the same MV again, so just copy the previous
-     * result to our output. */
-    if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
-    {
-        out_mvs[mb_xy] = in_mvs[mb_xy];
-        return;
-    }
-    mvp_buffer[mb_xy] = convert_short2_sat(mvp);
-    int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
-    int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
-
-    int2 bestmv = clamp(mvp, mv_min, mv_max);
-    int2 refcrd = coord + bestmv;
-
-    /* measure cost at bestmv */
-    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
-                lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
-
-    do
-    {
-        /* measure costs at offsets from bestmv */
-        refcrd = coord + bestmv + dia_offs[mb_i];
-        int2 trymv = bestmv + dia_offs[mb_i];
-        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
-                   lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
-
-        cost_local[mb_i] = (cost<<2) | mb_i;
-        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
-
-        if( (cost >> 2) >= bcost )
-            break;
-
-        bestmv += dia_offs[cost&3];
-        bcost = cost>>2;
-
-        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
-            break;
-    }
-    while( --me_range > 0 );
-
-    int2 trymv = 0, diff = 0;
-
-#define COST_MV_NO_PAD( L )\
-    trymv = clamp( trymv, mv_min, mv_max );\
-    diff = convert_int2_sat(abs_diff( mvp, trymv ));\
-    if( diff.x > 1 || diff.y > 1 ) {\
-        int2 refcrd = coord + trymv;\
-        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
-                   L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
-        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
-
-    COST_MV_NO_PAD( 0 );
-
-    if( !b_first_iteration )
-    {
-        /* try cost at previous iteration's MV, if MVP was too far away */
-        int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
-        prevmv >>= scale;
-        trymv = prevmv;
-        COST_MV_NO_PAD( lambda );
-    }
-
-    for( int i = 0; i < i_mvc; i++ )
-    {
-        /* try cost at each candidate MV, if MVP was too far away */
-        trymv = convert_int2_sat( mvc_local[i] );
-        COST_MV_NO_PAD( lambda );
-    }
-
-    if( mb_i == 0 )
-    {
-        bestmv <<= scale;
-        out_mvs[mb_xy] = convert_short2_sat(bestmv);
-        out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
-    }
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/subpel.cl b/android/src/main/libenc/jni/libx264/common/opencl/subpel.cl
deleted file mode 100755
index 8c7216c..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/subpel.cl
+++ /dev/null
@@ -1,242 +0,0 @@
-/* OpenCL lowres subpel Refine */
-
-/* Each thread performs 8x8 SAD.  4 threads per MB, so the 4 DIA HPEL offsets are
- * calculated simultaneously */
-int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
-{
-    int2 frefpos = qpos >> 2;
-    int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
-    uint mask_shift = 8 * hpel_idx;
-
-    uint4 cost4 = 0;
-
-    for( int y = 0; y < 8; y++ )
-    {
-        uint4 enc, val4;
-        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
-        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
-        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
-        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
-        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
-        cost4 += abs_diff( enc, val4 );
-
-        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
-        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
-        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
-        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
-        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
-        cost4 += abs_diff( enc, val4 );
-    }
-
-    return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
-}
-
-/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
-int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
-{
-    int2 frefApos = qpos >> 2;
-    int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
-
-    int2 qposB = qpos + ((qpos & 1) << 1);
-    int2 frefBpos = qposB >> 2;
-    int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
-
-    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
-
-    int cost = 0;
-
-    for( int y = 0; y < 8; y++ )
-    {
-        for( int x = 0; x < 8; x++ )
-        {
-            uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
-            uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
-            uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
-            cost += abs_diff( enc, rhadd( vA, vB ) );
-        }
-    }
-
-    return cost;
-}
-
-/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
- *
- * Each thread collects 1/4 of the rows of diffs and processes one quarter of
- * the transforms
- */
-int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
-                            int2 fencpos,
-                            read_only image2d_t fref_planes,
-                            int2 qpos,
-                            local sum2_t *tmpp,
-                            int idx )
-{
-    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
-    sum2_t b0, b1, b2, b3;
-
-    // fencpos is full-pel position of original MB
-    // qpos is qpel position within reference frame
-    int2 frefApos = qpos >> 2;
-    int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
-
-    int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
-    int2 frefBpos = qposB >> 2;
-    int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
-
-    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
-
-    uint vA, vB;
-    uint a0, a1;
-    uint enc;
-    sum2_t sum = 0;
-
-#define READ_DIFF( OUT, X )\
-    enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
-    vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
-    vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
-    OUT = enc - rhadd( vA, vB );
-
-#define READ_DIFF_EX( OUT, a, b )\
-    {\
-        READ_DIFF( a0, a );\
-        READ_DIFF( a1, b );\
-        OUT = a0 + (a1<<BITS_PER_SUM);\
-    }
-#define ROW_8x4_SATD( a, b )\
-    {\
-        fencpos.y += a;\
-        frefApos.y += b;\
-        frefBpos.y += b;\
-        READ_DIFF_EX( b0, 0, 4 );\
-        READ_DIFF_EX( b1, 1, 5 );\
-        READ_DIFF_EX( b2, 2, 6 );\
-        READ_DIFF_EX( b3, 3, 7 );\
-        HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
-        HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
-        sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
-    }
-    ROW_8x4_SATD( 0, 0 );
-    ROW_8x4_SATD( 4, 4 );
-
-#undef READ_DIFF
-#undef READ_DIFF_EX
-#undef ROW_8x4_SATD
-    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
-}
-
-constant int2 hpoffs[4] =
-{
-    {0, -2}, {-2, 0}, {2, 0}, {0, 2}
-};
-
-/* sub pixel refinement of motion vectors, output MVs and costs are moved from
- * temporary buffers into final per-frame buffer
- *
- * global launch dimensions:  [mb_width * 4, mb_height]
- *
- * With X being the source 16x16 pixels, F is the lowres pixel used by the
- * motion search.  We will now utilize the H V and C pixels (stored in separate
- * planes) to search at half-pel increments.
- *
- * X X X X X X
- *  F H F H F
- * X X X X X X
- *  V C V C V
- * X X X X X X
- *  F H F H F
- * X X X X X X
- *
- * The YX HPEL bits of the motion vector selects the plane we search in.  The
- * four planes are packed in the fref_planes 2D image buffer.  Each sample
- * returns:  s0 = F, s1 = H, s2 = V, s3 = C */
-kernel void subpel_refine( read_only image2d_t   fenc,
-                           read_only image2d_t   fref_planes,
-                           const global short2  *in_mvs,
-                           const global int16_t *in_sad_mv_costs,
-                           local int16_t        *cost_local,
-                           local sum2_t         *satd_local,
-                           local short2         *mvc_local,
-                           global short2        *fenc_lowres_mv,
-                           global int16_t       *fenc_lowres_mv_costs,
-                           int                   mb_width,
-                           int                   lambda,
-                           int                   b,
-                           int                   ref,
-                           int                   b_islist1 )
-{
-    int mb_x = get_global_id( 0 ) >> 2;
-    if( mb_x >= mb_width )
-        return;
-    int mb_height = get_global_size( 1 );
-
-    int mb_i = get_global_id( 0 ) & 3;
-    int mb_y = get_global_id( 1 );
-    int mb_xy = mb_y * mb_width + mb_x;
-
-    /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
-     * hold many frames worth of motion vectors.  We must offset into the correct
-     * location for this frame's vectors.  The kernel will be passed the correct
-     * directional buffer for the direction of the search: list1 or list0
-     *
-     *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
-     *   GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
-    fenc_lowres_mv +=       (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
-    fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
-
-    /* Adjust pointers into local memory buffers for this thread's data */
-    int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
-    cost_local += mb_in_group * 4;
-    satd_local += mb_in_group * 16;
-    mvc_local += mb_in_group * 4;
-
-    int i_mvc = 0;
-
-    mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;
-
-#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
-    if( mb_x > 0 )
-        MVC( -1, 0 );
-    if( mb_y > 0 )
-    {
-        MVC( 0, -1 );
-        if( mb_x < mb_width - 1 )
-            MVC( 1, -1 );
-        if( mb_x > 0 )
-            MVC( -1, -1 );
-    }
-#undef MVC
-    int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
-
-    int bcost =  in_sad_mv_costs[mb_xy];
-    int2 coord = (int2)(mb_x, mb_y) << 3;
-    int2 bmv = convert_int2_sat( in_mvs[mb_xy] );
-
-    /* Make mvp and bmv QPEL MV */
-    mvp <<= 2; bmv <<= 2;
-
-#define HPEL_QPEL( ARR, FUNC )\
-    {\
-        int2 trymv = bmv + ARR[mb_i];\
-        int2 qpos = (coord << 2) + trymv;\
-        int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
-        cost_local[mb_i] = (cost<<2) + mb_i;\
-        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
-        if( (cost>>2) < bcost )\
-        {\
-            bmv += ARR[cost&3];\
-            bcost = cost>>2;\
-        }\
-    }
-
-    HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
-    HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
-    fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );
-
-    /* remeasure cost of bmv using SATD */
-    int2 qpos = (coord << 2) + bmv;
-    cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
-    bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
-    bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );
-
-    fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/weightp.cl b/android/src/main/libenc/jni/libx264/common/opencl/weightp.cl
deleted file mode 100755
index 1524cce..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/weightp.cl
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Weightp filter a downscaled image into a temporary output buffer.
- * This kernel is launched once for each scale.
- *
- * Launch dimensions: width x height (in pixels)
- */
-kernel void weightp_scaled_images( read_only image2d_t in_plane,
-                                   write_only image2d_t out_plane,
-                                   uint offset,
-                                   uint scale,
-                                   uint denom )
-{
-    int gx = get_global_id( 0 );
-    int gy = get_global_id( 1 );
-    uint4 input_val;
-    uint4 output_val;
-
-    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
-    output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
-    write_imageui( out_plane, (int2)(gx, gy), output_val );
-}
-
-/* Weightp filter for the half-pel interpolated image
- *
- * Launch dimensions: width x height (in pixels)
- */
-kernel void weightp_hpel( read_only image2d_t in_plane,
-                          write_only image2d_t out_plane,
-                          uint offset,
-                          uint scale,
-                          uint denom )
-{
-    int gx = get_global_id( 0 );
-    int gy = get_global_id( 1 );
-    uint input_val;
-    uint output_val;
-
-    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
-    //Unpack
-    uint4 temp;
-    temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
-    temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
-
-    temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
-
-    //Pack
-    output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
-    write_imageui( out_plane, (int2)(gx, gy), output_val );
-}
diff --git a/android/src/main/libenc/jni/libx264/common/opencl/x264-cl.h b/android/src/main/libenc/jni/libx264/common/opencl/x264-cl.h
deleted file mode 100755
index 892904d..0000000
--- a/android/src/main/libenc/jni/libx264/common/opencl/x264-cl.h
+++ /dev/null
@@ -1,132 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
-
-constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-/* 7.18.1.1  Exact-width integer types */
-typedef signed char int8_t;
-typedef unsigned char   uint8_t;
-typedef short  int16_t;
-typedef unsigned short  uint16_t;
-typedef int  int32_t;
-typedef unsigned   uint32_t;
-
-typedef uint8_t  pixel;
-typedef uint16_t sum_t;
-typedef uint32_t sum2_t;
-
-#define LOWRES_COST_MASK ((1<<14)-1)
-#define LOWRES_COST_SHIFT 14
-#define COST_MAX (1<<28)
-
-#define PIXEL_MAX 255
-#define BITS_PER_SUM (8 * sizeof(sum_t))
-
-/* Constants for offsets into frame statistics buffer */
-#define COST_EST    0
-#define COST_EST_AQ 1
-#define INTRA_MBS   2
-
-#define COPY2_IF_LT( x, y, a, b )\
-    if((y)<(x))\
-    {\
-        (x) = (y);\
-        (a) = (b);\
-    }
-
-constant int2 dia_offs[4] =
-{
-    {0, -1}, {-1, 0}, {1, 0}, {0, 1},
-};
-
-inline pixel x264_clip_pixel( int x )
-{
-    return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX );
-}
-
-inline int2 x264_median_mv( short2 a, short2 b, short2 c )
-{
-    short2 t1 = min(a, b);
-    short2 t2 = min(max(a, b), c);
-    return convert_int2(max(t1, t2));
-}
-
-inline sum2_t abs2( sum2_t a )
-{
-    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
-    return (a + s) ^ s;
-}
-
-#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
-    sum2_t t0 = s0 + s1;\
-    sum2_t t1 = s0 - s1;\
-    sum2_t t2 = s2 + s3;\
-    sum2_t t3 = s2 - s3;\
-    d0 = t0 + t2;\
-    d2 = t0 - t2;\
-    d1 = t1 + t3;\
-    d3 = t1 - t3;\
-}
-
-#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
-    int2 t0 = s0 + s1;\
-    int2 t1 = s0 - s1;\
-    int2 t2 = s2 + s3;\
-    int2 t3 = s2 - s3;\
-    d0 = t0 + t2;\
-    d2 = t0 - t2;\
-    d1 = t1 + t3;\
-    d3 = t1 - t3;\
-}
-
-#define SATD_C_8x4_Q( name, q1, q2 )\
-    int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
-    {\
-        sum2_t tmp[4][4];\
-        sum2_t a0, a1, a2, a3;\
-        sum2_t sum = 0;\
-        for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
-        {\
-            a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
-            a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
-            a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
-            a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
-            HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
-        }\
-        for( int i = 0; i < 4; i++ )\
-        {\
-            HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
-            sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
-        }\
-        return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
-    }
-
-/*
- * Utility function to perform a parallel sum reduction of an array of integers
- */
-int parallel_sum( int value, int x, volatile local int *array )
-{
-    array[x] = value;
-    barrier( CLK_LOCAL_MEM_FENCE );
-
-    int dim = get_local_size( 0 );
-
-    while( dim > 1 )
-    {
-        dim >>= 1;
-
-        if( x < dim )
-            array[x] += array[x + dim];
-
-        if( dim > 32 )
-            barrier( CLK_LOCAL_MEM_FENCE );
-    }
-
-    return array[0];
-}
-
-int mv_cost( uint2 mvd )
-{
-    float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f;
-    float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) );
-    return (int) (cost.x + cost.y);
-}
diff --git a/android/src/main/libenc/jni/libx264/common/osdep.c b/android/src/main/libenc/jni/libx264/common/osdep.c
deleted file mode 100755
index 81301f5..0000000
--- a/android/src/main/libenc/jni/libx264/common/osdep.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*****************************************************************************
- * osdep.c: platform-specific code
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#ifdef _WIN32
-#include <windows.h>
-#include <io.h>
-#endif
-
-#if SYS_WINDOWS
-#include <sys/types.h>
-#include <sys/timeb.h>
-#else
-#include <sys/time.h>
-#endif
-#include <time.h>
-
-#if PTW32_STATIC_LIB
-/* this is a global in pthread-win32 to indicate if it has been initialized or not */
-extern int ptw32_processInitialized;
-#endif
-
-int64_t x264_mdate( void )
-{
-#if SYS_WINDOWS
-    struct timeb tb;
-    ftime( &tb );
-    return ((int64_t)tb.time * 1000 + (int64_t)tb.millitm) * 1000;
-#else
-    struct timeval tv_date;
-    gettimeofday( &tv_date, NULL );
-    return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec;
-#endif
-}
-
-#if HAVE_WIN32THREAD || PTW32_STATIC_LIB
-/* state of the threading library being initialized */
-static volatile LONG x264_threading_is_init = 0;
-
-static void x264_threading_destroy( void )
-{
-#if PTW32_STATIC_LIB
-    pthread_win32_thread_detach_np();
-    pthread_win32_process_detach_np();
-#else
-    x264_win32_threading_destroy();
-#endif
-}
-
-int x264_threading_init( void )
-{
-    /* if already init, then do nothing */
-    if( InterlockedCompareExchange( &x264_threading_is_init, 1, 0 ) )
-        return 0;
-#if PTW32_STATIC_LIB
-    /* if static pthread-win32 is already initialized, then do nothing */
-    if( ptw32_processInitialized )
-        return 0;
-    if( !pthread_win32_process_attach_np() )
-        return -1;
-#else
-    if( x264_win32_threading_init() )
-        return -1;
-#endif
-    /* register cleanup to run at process termination */
-    atexit( x264_threading_destroy );
-
-    return 0;
-}
-#endif
-
-#ifdef _WIN32
-/* Functions for dealing with Unicode on Windows. */
-FILE *x264_fopen( const char *filename, const char *mode )
-{
-    wchar_t filename_utf16[MAX_PATH];
-    wchar_t mode_utf16[16];
-    if( utf8_to_utf16( filename, filename_utf16 ) && utf8_to_utf16( mode, mode_utf16 ) )
-        return _wfopen( filename_utf16, mode_utf16 );
-    return NULL;
-}
-
-int x264_rename( const char *oldname, const char *newname )
-{
-    wchar_t oldname_utf16[MAX_PATH];
-    wchar_t newname_utf16[MAX_PATH];
-    if( utf8_to_utf16( oldname, oldname_utf16 ) && utf8_to_utf16( newname, newname_utf16 ) )
-    {
-        /* POSIX says that rename() removes the destination, but Win32 doesn't. */
-        _wunlink( newname_utf16 );
-        return _wrename( oldname_utf16, newname_utf16 );
-    }
-    return -1;
-}
-
-int x264_stat( const char *path, x264_struct_stat *buf )
-{
-    wchar_t path_utf16[MAX_PATH];
-    if( utf8_to_utf16( path, path_utf16 ) )
-        return _wstati64( path_utf16, buf );
-    return -1;
-}
-
-#if !HAVE_WINRT
-int x264_vfprintf( FILE *stream, const char *format, va_list arg )
-{
-    HANDLE console = NULL;
-    DWORD mode;
-
-    if( stream == stdout )
-        console = GetStdHandle( STD_OUTPUT_HANDLE );
-    else if( stream == stderr )
-        console = GetStdHandle( STD_ERROR_HANDLE );
-
-    /* Only attempt to convert to UTF-16 when writing to a non-redirected console screen buffer. */
-    if( GetConsoleMode( console, &mode ) )
-    {
-        char buf[4096];
-        wchar_t buf_utf16[4096];
-        va_list arg2;
-
-        va_copy( arg2, arg );
-        int length = vsnprintf( buf, sizeof(buf), format, arg2 );
-        va_end( arg2 );
-
-        if( length > 0 && length < sizeof(buf) )
-        {
-            /* WriteConsoleW is the most reliable way to output Unicode to a console. */
-            int length_utf16 = MultiByteToWideChar( CP_UTF8, 0, buf, length, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t) );
-            DWORD written;
-            WriteConsoleW( console, buf_utf16, length_utf16, &written, NULL );
-            return length;
-        }
-    }
-    return vfprintf( stream, format, arg );
-}
-
-int x264_is_pipe( const char *path )
-{
-    wchar_t path_utf16[MAX_PATH];
-    if( utf8_to_utf16( path, path_utf16 ) )
-        return WaitNamedPipeW( path_utf16, 0 );
-    return 0;
-}
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER < 1900
-/* MSVC pre-VS2015 has broken snprintf/vsnprintf implementations which are incompatible with C99. */
-int x264_snprintf( char *s, size_t n, const char *fmt, ... )
-{
-    va_list arg;
-    va_start( arg, fmt );
-    int length = x264_vsnprintf( s, n, fmt, arg );
-    va_end( arg );
-    return length;
-}
-
-int x264_vsnprintf( char *s, size_t n, const char *fmt, va_list arg )
-{
-    int length = -1;
-
-    if( n )
-    {
-        va_list arg2;
-        va_copy( arg2, arg );
-        length = _vsnprintf( s, n, fmt, arg2 );
-        va_end( arg2 );
-
-        /* _(v)snprintf adds a null-terminator only if the length is less than the buffer size. */
-        if( length < 0 || length >= n )
-            s[n-1] = '\0';
-    }
-
-    /* _(v)snprintf returns a negative number if the length is greater than the buffer size. */
-    if( length < 0 )
-        return _vscprintf( fmt, arg );
-
-    return length;
-}
-#endif
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/osdep.h b/android/src/main/libenc/jni/libx264/common/osdep.h
deleted file mode 100755
index e3152d0..0000000
--- a/android/src/main/libenc/jni/libx264/common/osdep.h
+++ /dev/null
@@ -1,411 +0,0 @@
-/*****************************************************************************
- * osdep.h: platform-specific code
- *****************************************************************************
- * Copyright (C) 2007-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_OSDEP_H
-#define X264_OSDEP_H
-
-#define _LARGEFILE_SOURCE 1
-#define _FILE_OFFSET_BITS 64
-#include <stdio.h>
-#include <sys/stat.h>
-#include <inttypes.h>
-#include <stdarg.h>
-
-#include "config.h"
-
-#ifdef __INTEL_COMPILER
-#include <mathimf.h>
-#else
-#include <math.h>
-#endif
-
-#if !HAVE_LOG2F
-#define log2f(x) (logf(x)/0.693147180559945f)
-#define log2(x) (log(x)/0.693147180559945)
-#endif
-
-#ifdef _MSC_VER
-#define inline __inline
-#define strcasecmp _stricmp
-#define strncasecmp _strnicmp
-#define strtok_r strtok_s
-#define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
-#if _MSC_VER < 1900
-int x264_snprintf( char *s, size_t n, const char *fmt, ... );
-int x264_vsnprintf( char *s, size_t n, const char *fmt, va_list arg );
-#define snprintf  x264_snprintf
-#define vsnprintf x264_vsnprintf
-#endif
-#else
-#include <strings.h>
-#endif
-
-#if !defined(va_copy) && defined(__INTEL_COMPILER)
-#define va_copy(dst, src) ((dst) = (src))
-#endif
-
-#if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS)
-#define isfinite finite
-#endif
-
-#ifdef _WIN32
-#ifndef strtok_r
-#define strtok_r(str,delim,save) strtok(str,delim)
-#endif
-
-#define utf8_to_utf16( utf8, utf16 )\
-    MultiByteToWideChar( CP_UTF8, MB_ERR_INVALID_CHARS, utf8, -1, utf16, sizeof(utf16)/sizeof(wchar_t) )
-FILE *x264_fopen( const char *filename, const char *mode );
-int x264_rename( const char *oldname, const char *newname );
-#define x264_struct_stat struct _stati64
-#define x264_fstat _fstati64
-int x264_stat( const char *path, x264_struct_stat *buf );
-#else
-#define x264_fopen       fopen
-#define x264_rename      rename
-#define x264_struct_stat struct stat
-#define x264_fstat       fstat
-#define x264_stat        stat
-#endif
-
-#if defined(_WIN32) && !HAVE_WINRT
-int x264_vfprintf( FILE *stream, const char *format, va_list arg );
-int x264_is_pipe( const char *path );
-#else
-#define x264_vfprintf vfprintf
-#define x264_is_pipe(x) 0
-#endif
-
-#ifdef _MSC_VER
-#define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
-#else
-#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
-#endif
-#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
-#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )
-#define ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )
-
-// ARM compiliers don't reliably align stack variables
-// - EABI requires only 8 byte stack alignment to be maintained
-// - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function
-// - armcc can't either, but is nice enough to actually tell you so
-// - Apple gcc only maintains 4 byte alignment
-// - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...
-
-#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
-    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \
-    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)
-
-#if ARCH_ARM && SYS_MACOSX
-#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
-#else
-#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
-    ALIGNED_8( type name sub1 __VA_ARGS__ )
-#endif
-
-#if ARCH_ARM
-#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
-#else
-#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
-    ALIGNED_16( type name sub1 __VA_ARGS__ )
-#endif
-
-#define EXPAND(x) x
-
-#if STACK_ALIGNMENT >= 32
-#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
-    ALIGNED_32( type name sub1 __VA_ARGS__ )
-#else
-#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
-#endif
-
-#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
-
-/* For AVX2 */
-#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
-#define ALIGNED_N ALIGNED_32
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
-#else
-#define NATIVE_ALIGN 16
-#define ALIGNED_N ALIGNED_16
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
-#endif
-
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
-#define UNUSED __attribute__((unused))
-#define ALWAYS_INLINE __attribute__((always_inline)) inline
-#define NOINLINE __attribute__((noinline))
-#define MAY_ALIAS __attribute__((may_alias))
-#define x264_constant_p(x) __builtin_constant_p(x)
-#define x264_nonconstant_p(x) (!__builtin_constant_p(x))
-#else
-#ifdef _MSC_VER
-#define ALWAYS_INLINE __forceinline
-#define NOINLINE __declspec(noinline)
-#else
-#define ALWAYS_INLINE inline
-#define NOINLINE
-#endif
-#define UNUSED
-#define MAY_ALIAS
-#define x264_constant_p(x) 0
-#define x264_nonconstant_p(x) 0
-#endif
-
-/* threads */
-#if HAVE_BEOSTHREAD
-#include <kernel/OS.h>
-#define x264_pthread_t               thread_id
-static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(void *), void *d )
-{
-     *t = spawn_thread( f, "", 10, d );
-     if( *t < B_NO_ERROR )
-         return -1;
-     resume_thread( *t );
-     return 0;
-}
-#define x264_pthread_join(t,s)       { long tmp; \
-                                       wait_for_thread(t,(s)?(long*)(s):&tmp); }
-
-#elif HAVE_POSIXTHREAD
-#include <pthread.h>
-#define x264_pthread_t               pthread_t
-#define x264_pthread_create          pthread_create
-#define x264_pthread_join            pthread_join
-#define x264_pthread_mutex_t         pthread_mutex_t
-#define x264_pthread_mutex_init      pthread_mutex_init
-#define x264_pthread_mutex_destroy   pthread_mutex_destroy
-#define x264_pthread_mutex_lock      pthread_mutex_lock
-#define x264_pthread_mutex_unlock    pthread_mutex_unlock
-#define x264_pthread_cond_t          pthread_cond_t
-#define x264_pthread_cond_init       pthread_cond_init
-#define x264_pthread_cond_destroy    pthread_cond_destroy
-#define x264_pthread_cond_broadcast  pthread_cond_broadcast
-#define x264_pthread_cond_wait       pthread_cond_wait
-#define x264_pthread_attr_t          pthread_attr_t
-#define x264_pthread_attr_init       pthread_attr_init
-#define x264_pthread_attr_destroy    pthread_attr_destroy
-#define x264_pthread_num_processors_np pthread_num_processors_np
-#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
-
-#elif HAVE_WIN32THREAD
-#include "win32thread.h"
-
-#else
-#define x264_pthread_t               int
-#define x264_pthread_create(t,u,f,d) 0
-#define x264_pthread_join(t,s)
-#endif //HAVE_*THREAD
-
-#if !HAVE_POSIXTHREAD && !HAVE_WIN32THREAD
-#define x264_pthread_mutex_t         int
-#define x264_pthread_mutex_init(m,f) 0
-#define x264_pthread_mutex_destroy(m)
-#define x264_pthread_mutex_lock(m)
-#define x264_pthread_mutex_unlock(m)
-#define x264_pthread_cond_t          int
-#define x264_pthread_cond_init(c,f)  0
-#define x264_pthread_cond_destroy(c)
-#define x264_pthread_cond_broadcast(c)
-#define x264_pthread_cond_wait(c,m)
-#define x264_pthread_attr_t          int
-#define x264_pthread_attr_init(a)    0
-#define x264_pthread_attr_destroy(a)
-#define X264_PTHREAD_MUTEX_INITIALIZER 0
-#endif
-
-#if HAVE_WIN32THREAD || PTW32_STATIC_LIB
-int x264_threading_init( void );
-#else
-#define x264_threading_init() 0
-#endif
-
-static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex )
-{
-#if HAVE_THREAD
-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && (ARCH_X86 || ARCH_X86_64)
-    return __sync_fetch_and_add( val, add );
-#else
-    x264_pthread_mutex_lock( mutex );
-    int res = *val;
-    *val += add;
-    x264_pthread_mutex_unlock( mutex );
-    return res;
-#endif
-#else
-    int res = *val;
-    *val += add;
-    return res;
-#endif
-}
-
-#define WORD_SIZE sizeof(void*)
-
-#define asm __asm__
-
-#if WORDS_BIGENDIAN
-#define endian_fix(x) (x)
-#define endian_fix64(x) (x)
-#define endian_fix32(x) (x)
-#define endian_fix16(x) (x)
-#else
-#if HAVE_X86_INLINE_ASM && HAVE_MMX
-static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
-{
-    asm("bswap %0":"+r"(x));
-    return x;
-}
-#elif defined(__GNUC__) && HAVE_ARMV6
-static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
-{
-    asm("rev %0, %0":"+r"(x));
-    return x;
-}
-#else
-static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
-{
-    return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24);
-}
-#endif
-#if HAVE_X86_INLINE_ASM && ARCH_X86_64
-static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
-{
-    asm("bswap %0":"+r"(x));
-    return x;
-}
-#else
-static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
-{
-    return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32);
-}
-#endif
-static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
-{
-    return WORD_SIZE == 8 ? endian_fix64(x) : endian_fix32(x);
-}
-static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x )
-{
-    return (x<<8)|(x>>8);
-}
-#endif
-
-/* For values with 4 bits or less. */
-static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x )
-{
-    static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
-    return lut[x];
-}
-
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3)
-#define x264_clz(x) __builtin_clz(x)
-#define x264_ctz(x) __builtin_ctz(x)
-#else
-static int ALWAYS_INLINE x264_clz( uint32_t x )
-{
-    static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
-    int y, z = (((x >> 16) - 1) >> 27) & 16;
-    x >>= z^16;
-    z += y = ((x - 0x100) >> 28) & 8;
-    x >>= y^8;
-    z += y = ((x - 0x10) >> 29) & 4;
-    x >>= y^4;
-    return z + lut[x];
-}
-
-static int ALWAYS_INLINE x264_ctz( uint32_t x )
-{
-    static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
-    int y, z = (((x & 0xffff) - 1) >> 27) & 16;
-    x >>= z;
-    z += y = (((x & 0xff) - 1) >> 28) & 8;
-    x >>= y;
-    z += y = (((x & 0xf) - 1) >> 29) & 4;
-    x >>= y;
-    return z + lut[x&0xf];
-}
-#endif
-
-#if HAVE_X86_INLINE_ASM && HAVE_MMX
-/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
- * using complex address modes properly unless we use inline asm. */
-static ALWAYS_INLINE void x264_prefetch( void *p )
-{
-    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
-}
-/* We require that prefetch not fault on invalid reads, so we only enable it on
- * known architectures. */
-#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\
-      (ARCH_X86 || ARCH_X86_64 || ARCH_ARM || ARCH_PPC)
-#define x264_prefetch(x) __builtin_prefetch(x)
-#else
-#define x264_prefetch(x)
-#endif
-
-#if HAVE_POSIXTHREAD
-#if SYS_WINDOWS
-#define x264_lower_thread_priority(p)\
-{\
-    x264_pthread_t handle = pthread_self();\
-    struct sched_param sp;\
-    int policy = SCHED_OTHER;\
-    pthread_getschedparam( handle, &policy, &sp );\
-    sp.sched_priority -= p;\
-    pthread_setschedparam( handle, policy, &sp );\
-}
-#elif SYS_HAIKU
-#include <OS.h>
-#define x264_lower_thread_priority(p)\
-    { UNUSED status_t nice_ret = set_thread_priority( find_thread( NULL ), B_LOW_PRIORITY ); }
-#else
-#include <unistd.h>
-#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); }
-#endif /* SYS_WINDOWS */
-#elif HAVE_WIN32THREAD
-#define x264_lower_thread_priority(p) SetThreadPriority( GetCurrentThread(), X264_MAX( -2, -p ) )
-#else
-#define x264_lower_thread_priority(p)
-#endif
-
-static inline int x264_is_regular_file( FILE *filehandle )
-{
-    x264_struct_stat file_stat;
-    if( x264_fstat( fileno( filehandle ), &file_stat ) )
-        return 1;
-    return S_ISREG( file_stat.st_mode );
-}
-
-static inline int x264_is_regular_file_path( const char *filename )
-{
-    x264_struct_stat file_stat;
-    if( x264_stat( filename, &file_stat ) )
-        return !x264_is_pipe( filename );
-    return S_ISREG( file_stat.st_mode );
-}
-
-#endif /* X264_OSDEP_H */
diff --git a/android/src/main/libenc/jni/libx264/common/pixel.c b/android/src/main/libenc/jni/libx264/common/pixel.c
deleted file mode 100755
index 4db4b46..0000000
--- a/android/src/main/libenc/jni/libx264/common/pixel.c
+++ /dev/null
@@ -1,1503 +0,0 @@
-/*****************************************************************************
- * pixel.c: pixel metrics
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#if HAVE_MMX
-#   include "x86/pixel.h"
-#   include "x86/predict.h"
-#endif
-#if ARCH_PPC
-#   include "ppc/pixel.h"
-#endif
-#if ARCH_ARM
-#   include "arm/pixel.h"
-#   include "arm/predict.h"
-#endif
-#if ARCH_AARCH64
-#   include "aarch64/pixel.h"
-#   include "aarch64/predict.h"
-#endif
-#if ARCH_MIPS
-#   include "mips/pixel.h"
-#endif
-
-
-/****************************************************************************
- * pixel_sad_WxH
- ****************************************************************************/
-#define PIXEL_SAD_C( name, lx, ly ) \
-static int name( pixel *pix1, intptr_t i_stride_pix1,  \
-                 pixel *pix2, intptr_t i_stride_pix2 ) \
-{                                                   \
-    int i_sum = 0;                                  \
-    for( int y = 0; y < ly; y++ )                   \
-    {                                               \
-        for( int x = 0; x < lx; x++ )               \
-        {                                           \
-            i_sum += abs( pix1[x] - pix2[x] );      \
-        }                                           \
-        pix1 += i_stride_pix1;                      \
-        pix2 += i_stride_pix2;                      \
-    }                                               \
-    return i_sum;                                   \
-}
-
-
-PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
-PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
-PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
-PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
-PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
-PIXEL_SAD_C( x264_pixel_sad_4x16,   4, 16 )
-PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
-PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
-
-/****************************************************************************
- * pixel_ssd_WxH
- ****************************************************************************/
-#define PIXEL_SSD_C( name, lx, ly ) \
-static int name( pixel *pix1, intptr_t i_stride_pix1,  \
-                 pixel *pix2, intptr_t i_stride_pix2 ) \
-{                                                   \
-    int i_sum = 0;                                  \
-    for( int y = 0; y < ly; y++ )                   \
-    {                                               \
-        for( int x = 0; x < lx; x++ )               \
-        {                                           \
-            int d = pix1[x] - pix2[x];              \
-            i_sum += d*d;                           \
-        }                                           \
-        pix1 += i_stride_pix1;                      \
-        pix2 += i_stride_pix2;                      \
-    }                                               \
-    return i_sum;                                   \
-}
-
-PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
-PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
-PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
-PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
-PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
-PIXEL_SSD_C( x264_pixel_ssd_4x16,   4, 16 )
-PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
-PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )
-
-uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1,
-                             pixel *pix2, intptr_t i_pix2, int i_width, int i_height )
-{
-    uint64_t i_ssd = 0;
-    int y;
-    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
-
-#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
-                                          pix2 + y*i_pix2 + x, i_pix2 );
-    for( y = 0; y < i_height-15; y += 16 )
-    {
-        int x = 0;
-        if( align )
-            for( ; x < i_width-15; x += 16 )
-                SSD(PIXEL_16x16);
-        for( ; x < i_width-7; x += 8 )
-            SSD(PIXEL_8x16);
-    }
-    if( y < i_height-7 )
-        for( int x = 0; x < i_width-7; x += 8 )
-            SSD(PIXEL_8x8);
-#undef SSD
-
-#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
-    if( i_width & 7 )
-    {
-        for( y = 0; y < (i_height & ~7); y++ )
-            for( int x = i_width & ~7; x < i_width; x++ )
-                SSD1;
-    }
-    if( i_height & 7 )
-    {
-        for( y = i_height & ~7; y < i_height; y++ )
-            for( int x = 0; x < i_width; x++ )
-                SSD1;
-    }
-#undef SSD1
-
-    return i_ssd;
-}
-
-static void pixel_ssd_nv12_core( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2,
-                                 int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-{
-    *ssd_u = 0, *ssd_v = 0;
-    for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 )
-        for( int x = 0; x < width; x++ )
-        {
-            int du = pixuv1[2*x]   - pixuv2[2*x];
-            int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
-            *ssd_u += du*du;
-            *ssd_v += dv*dv;
-        }
-}
-
-void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
-                          int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v )
-{
-    pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v );
-    if( i_width&7 )
-    {
-        uint64_t tmp[2];
-        pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height, &tmp[0], &tmp[1] );
-        *ssd_u += tmp[0];
-        *ssd_v += tmp[1];
-    }
-}
-
-/****************************************************************************
- * pixel_var_wxh
- ****************************************************************************/
-#define PIXEL_VAR_C( name, w, h ) \
-static uint64_t name( pixel *pix, intptr_t i_stride ) \
-{                                             \
-    uint32_t sum = 0, sqr = 0;                \
-    for( int y = 0; y < h; y++ )              \
-    {                                         \
-        for( int x = 0; x < w; x++ )          \
-        {                                     \
-            sum += pix[x];                    \
-            sqr += pix[x] * pix[x];           \
-        }                                     \
-        pix += i_stride;                      \
-    }                                         \
-    return sum + ((uint64_t)sqr << 32);       \
-}
-
-PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 )
-PIXEL_VAR_C( x264_pixel_var_8x16,   8, 16 )
-PIXEL_VAR_C( x264_pixel_var_8x8,    8,  8 )
-
-/****************************************************************************
- * pixel_var2_wxh
- ****************************************************************************/
-#define PIXEL_VAR2_C( name, w, h, shift ) \
-static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
-{ \
-    int var = 0, sum = 0, sqr = 0; \
-    for( int y = 0; y < h; y++ ) \
-    { \
-        for( int x = 0; x < w; x++ ) \
-        { \
-            int diff = pix1[x] - pix2[x]; \
-            sum += diff; \
-            sqr += diff * diff; \
-        } \
-        pix1 += i_stride1; \
-        pix2 += i_stride2; \
-    } \
-    var = sqr - ((int64_t)sum * sum >> shift); \
-    *ssd = sqr; \
-    return var; \
-}
-
-PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
-PIXEL_VAR2_C( x264_pixel_var2_8x8,  8,  8, 6 )
-
-#if BIT_DEPTH > 8
-    typedef uint32_t sum_t;
-    typedef uint64_t sum2_t;
-#else
-    typedef uint16_t sum_t;
-    typedef uint32_t sum2_t;
-#endif
-#define BITS_PER_SUM (8 * sizeof(sum_t))
-
-#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
-    sum2_t t0 = s0 + s1;\
-    sum2_t t1 = s0 - s1;\
-    sum2_t t2 = s2 + s3;\
-    sum2_t t3 = s2 - s3;\
-    d0 = t0 + t2;\
-    d2 = t0 - t2;\
-    d1 = t1 + t3;\
-    d3 = t1 - t3;\
-}
-
-// in: a pseudo-simd number of the form x+(y<<16)
-// return: abs(x)+(abs(y)<<16)
-static ALWAYS_INLINE sum2_t abs2( sum2_t a )
-{
-    sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<<BITS_PER_SUM)+1))*((sum_t)-1);
-    return (a+s)^s;
-}
-
-/****************************************************************************
- * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
- ****************************************************************************/
-
-static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
-{
-    sum2_t tmp[4][2];
-    sum2_t a0, a1, a2, a3, b0, b1;
-    sum2_t sum = 0;
-    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
-    {
-        a0 = pix1[0] - pix2[0];
-        a1 = pix1[1] - pix2[1];
-        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
-        a2 = pix1[2] - pix2[2];
-        a3 = pix1[3] - pix2[3];
-        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
-        tmp[i][0] = b0 + b1;
-        tmp[i][1] = b0 - b1;
-    }
-    for( int i = 0; i < 2; i++ )
-    {
-        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
-        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
-        sum += ((sum_t)a0) + (a0>>BITS_PER_SUM);
-    }
-    return sum >> 1;
-}
-
-static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
-{
-    sum2_t tmp[4][4];
-    sum2_t a0, a1, a2, a3;
-    sum2_t sum = 0;
-    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
-    {
-        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
-        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
-        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
-        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
-        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
-    }
-    for( int i = 0; i < 4; i++ )
-    {
-        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
-        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
-    }
-    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
-}
-
-#define PIXEL_SATD_C( w, h, sub )\
-static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\
-{\
-    int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
-            + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
-    if( w==16 )\
-        sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\
-            + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\
-    if( h==16 )\
-        sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\
-            + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\
-    if( w==16 && h==16 )\
-        sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\
-            + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\
-    return sum;\
-}
-PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
-PIXEL_SATD_C( 16, 8,  x264_pixel_satd_8x4 )
-PIXEL_SATD_C( 8,  16, x264_pixel_satd_8x4 )
-PIXEL_SATD_C( 8,  8,  x264_pixel_satd_8x4 )
-PIXEL_SATD_C( 4,  16, x264_pixel_satd_4x4 )
-PIXEL_SATD_C( 4,  8,  x264_pixel_satd_4x4 )
-
-static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
-{
-    sum2_t tmp[8][4];
-    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
-    sum2_t sum = 0;
-    for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 )
-    {
-        a0 = pix1[0] - pix2[0];
-        a1 = pix1[1] - pix2[1];
-        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
-        a2 = pix1[2] - pix2[2];
-        a3 = pix1[3] - pix2[3];
-        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
-        a4 = pix1[4] - pix2[4];
-        a5 = pix1[5] - pix2[5];
-        b2 = (a4+a5) + ((a4-a5)<<BITS_PER_SUM);
-        a6 = pix1[6] - pix2[6];
-        a7 = pix1[7] - pix2[7];
-        b3 = (a6+a7) + ((a6-a7)<<BITS_PER_SUM);
-        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0,b1,b2,b3 );
-    }
-    for( int i = 0; i < 4; i++ )
-    {
-        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
-        HADAMARD4( a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i] );
-        b0  = abs2(a0+a4) + abs2(a0-a4);
-        b0 += abs2(a1+a5) + abs2(a1-a5);
-        b0 += abs2(a2+a6) + abs2(a2-a6);
-        b0 += abs2(a3+a7) + abs2(a3-a7);
-        sum += (sum_t)b0 + (b0>>BITS_PER_SUM);
-    }
-    return sum;
-}
-
-static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
-{
-    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
-    return (sum+2)>>2;
-}
-
-static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
-{
-    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
-            + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
-            + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )
-            + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 );
-    return (sum+2)>>2;
-}
-
-static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride )
-{
-    sum2_t tmp[32];
-    sum2_t a0, a1, a2, a3, dc;
-    sum2_t sum4 = 0, sum8 = 0;
-    for( int i = 0; i < 8; i++, pix+=stride )
-    {
-        sum2_t *t = tmp + (i&3) + (i&4)*4;
-        a0 = (pix[0]+pix[1]) + ((sum2_t)(pix[0]-pix[1])<<BITS_PER_SUM);
-        a1 = (pix[2]+pix[3]) + ((sum2_t)(pix[2]-pix[3])<<BITS_PER_SUM);
-        t[0] = a0 + a1;
-        t[4] = a0 - a1;
-        a2 = (pix[4]+pix[5]) + ((sum2_t)(pix[4]-pix[5])<<BITS_PER_SUM);
-        a3 = (pix[6]+pix[7]) + ((sum2_t)(pix[6]-pix[7])<<BITS_PER_SUM);
-        t[8] = a2 + a3;
-        t[12] = a2 - a3;
-    }
-    for( int i = 0; i < 8; i++ )
-    {
-        HADAMARD4( a0, a1, a2, a3, tmp[i*4+0], tmp[i*4+1], tmp[i*4+2], tmp[i*4+3] );
-        tmp[i*4+0] = a0;
-        tmp[i*4+1] = a1;
-        tmp[i*4+2] = a2;
-        tmp[i*4+3] = a3;
-        sum4 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
-    }
-    for( int i = 0; i < 8; i++ )
-    {
-        HADAMARD4( a0,a1,a2,a3, tmp[i], tmp[8+i], tmp[16+i], tmp[24+i] );
-        sum8 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
-    }
-    dc = (sum_t)(tmp[0] + tmp[8] + tmp[16] + tmp[24]);
-    sum4 = (sum_t)sum4 + (sum4>>BITS_PER_SUM) - dc;
-    sum8 = (sum_t)sum8 + (sum8>>BITS_PER_SUM) - dc;
-    return ((uint64_t)sum8<<32) + sum4;
-}
-
-#define HADAMARD_AC(w,h) \
-static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\
-{\
-    uint64_t sum = pixel_hadamard_ac( pix, stride );\
-    if( w==16 )\
-        sum += pixel_hadamard_ac( pix+8, stride );\
-    if( h==16 )\
-        sum += pixel_hadamard_ac( pix+8*stride, stride );\
-    if( w==16 && h==16 )\
-        sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
-    return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
-}
-HADAMARD_AC( 16, 16 )
-HADAMARD_AC( 16, 8 )
-HADAMARD_AC( 8, 16 )
-HADAMARD_AC( 8, 8 )
-
-
-/****************************************************************************
- * pixel_sad_x4
- ****************************************************************************/
-#define SAD_X( size ) \
-static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
-                                      intptr_t i_stride, int scores[3] )\
-{\
-    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
-    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
-    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
-}\
-static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3,\
-                                      intptr_t i_stride, int scores[4] )\
-{\
-    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
-    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
-    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
-    scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
-}
-
-SAD_X( 16x16 )
-SAD_X( 16x8 )
-SAD_X( 8x16 )
-SAD_X( 8x8 )
-SAD_X( 8x4 )
-SAD_X( 4x8 )
-SAD_X( 4x4 )
-
-/****************************************************************************
- * pixel_satd_x4
- * no faster than single satd, but needed for satd to be a drop-in replacement for sad
- ****************************************************************************/
-
-#define SATD_X( size, cpu ) \
-static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
-                                            intptr_t i_stride, int scores[3] )\
-{\
-    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
-    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
-    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
-}\
-static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\
-                                            intptr_t i_stride, int scores[4] )\
-{\
-    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
-    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
-    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
-    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
-}
-#define SATD_X_DECL6( cpu )\
-SATD_X( 16x16, cpu )\
-SATD_X( 16x8, cpu )\
-SATD_X( 8x16, cpu )\
-SATD_X( 8x8, cpu )\
-SATD_X( 8x4, cpu )\
-SATD_X( 4x8, cpu )
-#define SATD_X_DECL7( cpu )\
-SATD_X_DECL6( cpu )\
-SATD_X( 4x4, cpu )
-
-SATD_X_DECL7()
-#if HAVE_MMX
-SATD_X_DECL7( _mmx2 )
-#if !HIGH_BIT_DEPTH
-SATD_X_DECL6( _sse2 )
-SATD_X_DECL7( _ssse3 )
-SATD_X_DECL6( _ssse3_atom )
-SATD_X_DECL7( _sse4 )
-SATD_X_DECL7( _avx )
-SATD_X_DECL7( _xop )
-#endif // !HIGH_BIT_DEPTH
-#endif
-
-#if !HIGH_BIT_DEPTH
-#if HAVE_ARMV6 || ARCH_AARCH64
-SATD_X_DECL7( _neon )
-#endif
-#endif // !HIGH_BIT_DEPTH
-
-#define INTRA_MBCMP_8x8( mbcmp, cpu, cpu2 )\
-void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\
-{\
-    ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\
-    x264_predict_8x8_v##cpu2( pix, edge );\
-    res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_8x8_h##cpu2( pix, edge );\
-    res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_8x8_dc##cpu2( pix, edge );\
-    res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
-}
-
-INTRA_MBCMP_8x8( sad,, _c )
-INTRA_MBCMP_8x8(sa8d,, _c )
-#if HIGH_BIT_DEPTH && HAVE_MMX
-#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse
-INTRA_MBCMP_8x8( sad, _mmx2,  _c )
-INTRA_MBCMP_8x8(sa8d, _sse2,  _sse2 )
-#endif
-#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64)
-INTRA_MBCMP_8x8( sad, _neon, _neon )
-INTRA_MBCMP_8x8(sa8d, _neon, _neon )
-#endif
-
-#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
-void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
-{\
-    x264_predict_##size##chroma##_##pred1##cpu2( fdec );\
-    res[0] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_##size##chroma##_##pred2##cpu2( fdec );\
-    res[1] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_##size##chroma##_##pred3##cpu2( fdec );\
-    res[2] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
-}
-
-INTRA_MBCMP( sad,  4x4,   v, h, dc,  ,, _c )
-INTRA_MBCMP(satd,  4x4,   v, h, dc,  ,, _c )
-INTRA_MBCMP( sad,  8x8,  dc, h,  v, c,, _c )
-INTRA_MBCMP(satd,  8x8,  dc, h,  v, c,, _c )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c,, _c )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c,, _c )
-INTRA_MBCMP( sad, 16x16,  v, h, dc,  ,, _c )
-INTRA_MBCMP(satd, 16x16,  v, h, dc,  ,, _c )
-
-#if HAVE_MMX
-#if HIGH_BIT_DEPTH
-#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
-#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
-#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
-#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
-#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
-#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
-INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _mmx2, _c )
-INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _mmx2, _mmx2 )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
-INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _mmx2, _mmx2 )
-INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _sse2, _sse2 )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _sse2, _sse2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse2, _sse2 )
-INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _sse2, _sse2 )
-INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _ssse3, _sse2 )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _ssse3, _sse2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _ssse3, _sse2 )
-INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _ssse3, _sse2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse4, _sse2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _avx, _sse2 )
-#else
-#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _sse2, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse2, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _ssse3, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse4, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _avx, _mmx2 )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _xop, _mmx2 )
-#endif
-#endif
-#if !HIGH_BIT_DEPTH && HAVE_ARMV6
-INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _armv6 )
-INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _armv6 )
-INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
-INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _c )
-INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
-INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
-#endif
-#if !HIGH_BIT_DEPTH && ARCH_AARCH64
-INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _neon )
-INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _neon )
-INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
-INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
-INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _neon )
-INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _neon )
-INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
-INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
-#endif
-
-// No C implementation of intra_satd_x9. See checkasm for its behavior,
-// or see x264_mb_analyse_intra for the entirely different algorithm we
-// use when lacking an asm implementation of it.
-
-
-
-/****************************************************************************
- * structural similarity metric
- ****************************************************************************/
-static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1,
-                             const pixel *pix2, intptr_t stride2,
-                             int sums[2][4] )
-{
-    for( int z = 0; z < 2; z++ )
-    {
-        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
-        for( int y = 0; y < 4; y++ )
-            for( int x = 0; x < 4; x++ )
-            {
-                int a = pix1[x+y*stride1];
-                int b = pix2[x+y*stride2];
-                s1  += a;
-                s2  += b;
-                ss  += a*a;
-                ss  += b*b;
-                s12 += a*b;
-            }
-        sums[z][0] = s1;
-        sums[z][1] = s2;
-        sums[z][2] = ss;
-        sums[z][3] = s12;
-        pix1 += 4;
-        pix2 += 4;
-    }
-}
-
-static float ssim_end1( int s1, int s2, int ss, int s12 )
-{
-/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
- * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
- * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
-#if BIT_DEPTH > 9
-#define type float
-    static const float ssim_c1 = .01*.01*PIXEL_MAX*PIXEL_MAX*64;
-    static const float ssim_c2 = .03*.03*PIXEL_MAX*PIXEL_MAX*64*63;
-#else
-#define type int
-    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
-    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
-#endif
-    type fs1 = s1;
-    type fs2 = s2;
-    type fss = ss;
-    type fs12 = s12;
-    type vars = fss*64 - fs1*fs1 - fs2*fs2;
-    type covar = fs12*64 - fs1*fs2;
-    return (float)(2*fs1*fs2 + ssim_c1) * (float)(2*covar + ssim_c2)
-         / ((float)(fs1*fs1 + fs2*fs2 + ssim_c1) * (float)(vars + ssim_c2));
-#undef type
-}
-
-static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
-{
-    float ssim = 0.0;
-    for( int i = 0; i < width; i++ )
-        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
-                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
-                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
-                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
-    return ssim;
-}
-
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
-                           pixel *pix1, intptr_t stride1,
-                           pixel *pix2, intptr_t stride2,
-                           int width, int height, void *buf, int *cnt )
-{
-    int z = 0;
-    float ssim = 0.0;
-    int (*sum0)[4] = buf;
-    int (*sum1)[4] = sum0 + (width >> 2) + 3;
-    width >>= 2;
-    height >>= 2;
-    for( int y = 1; y < height; y++ )
-    {
-        for( ; z <= y; z++ )
-        {
-            XCHG( void*, sum0, sum1 );
-            for( int x = 0; x < width; x+=2 )
-                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
-        }
-        for( int x = 0; x < width-1; x += 4 )
-            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
-    }
-    *cnt = (height-1) * (width-1);
-    return ssim;
-}
-
-static int pixel_vsad( pixel *src, intptr_t stride, int height )
-{
-    int score = 0;
-    for( int i = 1; i < height; i++, src += stride )
-        for( int j = 0; j < 16; j++ )
-            score += abs(src[j] - src[j+stride]);
-    return score;
-}
-
-int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
-{
-    int score_field, score_frame;
-    int stride = h->fenc->i_stride[0];
-    int mb_stride = h->mb.i_mb_stride;
-    pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride);
-    int mb_xy = mb_x + mb_y*mb_stride;
-
-    /* We don't want to analyze pixels outside the frame, as it gives inaccurate results. */
-    int mbpair_height = X264_MIN( h->param.i_height - mb_y * 16, 32 );
-    score_frame  = h->pixf.vsad( fenc,          stride, mbpair_height );
-    score_field  = h->pixf.vsad( fenc,        stride*2, mbpair_height >> 1 );
-    score_field += h->pixf.vsad( fenc+stride, stride*2, mbpair_height >> 1 );
-
-    if( mb_x > 0 )
-        score_field += 512 - h->mb.field[mb_xy        -1]*1024;
-    if( mb_y > 0 )
-        score_field += 512 - h->mb.field[mb_xy-mb_stride]*1024;
-
-    return (score_field < score_frame);
-}
-
-static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height )
-{
-    int sum = 0;
-    for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
-        for( int x = 0; x < 8; x++ )
-            sum += pix1[x] - pix2[x];
-    return abs( sum );
-}
-
-/****************************************************************************
- * successive elimination
- ****************************************************************************/
-static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
-                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-{
-    int nmv = 0;
-    for( int i = 0; i < width; i++, sums++ )
-    {
-        int ads = abs( enc_dc[0] - sums[0] )
-                + abs( enc_dc[1] - sums[8] )
-                + abs( enc_dc[2] - sums[delta] )
-                + abs( enc_dc[3] - sums[delta+8] )
-                + cost_mvx[i];
-        if( ads < thresh )
-            mvs[nmv++] = i;
-    }
-    return nmv;
-}
-
-static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
-                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-{
-    int nmv = 0;
-    for( int i = 0; i < width; i++, sums++ )
-    {
-        int ads = abs( enc_dc[0] - sums[0] )
-                + abs( enc_dc[1] - sums[delta] )
-                + cost_mvx[i];
-        if( ads < thresh )
-            mvs[nmv++] = i;
-    }
-    return nmv;
-}
-
-static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
-                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-{
-    int nmv = 0;
-    for( int i = 0; i<width; i++, sums++ )
-    {
-        int ads = abs( enc_dc[0] - sums[0] )
-                + cost_mvx[i];
-        if( ads < thresh )
-            mvs[nmv++] = i;
-    }
-    return nmv;
-}
-
-
-/****************************************************************************
- * x264_pixel_init:
- ****************************************************************************/
-void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
-{
-    memset( pixf, 0, sizeof(*pixf) );
-
-#define INIT2_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
-    pixf->name1[PIXEL_16x8]  = x264_pixel_##name2##_16x8##cpu;
-#define INIT4_NAME( name1, name2, cpu ) \
-    INIT2_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_8x16]  = x264_pixel_##name2##_8x16##cpu;\
-    pixf->name1[PIXEL_8x8]   = x264_pixel_##name2##_8x8##cpu;
-#define INIT5_NAME( name1, name2, cpu ) \
-    INIT4_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_8x4]   = x264_pixel_##name2##_8x4##cpu;
-#define INIT6_NAME( name1, name2, cpu ) \
-    INIT5_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_4x8]   = x264_pixel_##name2##_4x8##cpu;
-#define INIT7_NAME( name1, name2, cpu ) \
-    INIT6_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
-#define INIT8_NAME( name1, name2, cpu ) \
-    INIT7_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_4x16]  = x264_pixel_##name2##_4x16##cpu;
-#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
-#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
-#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
-#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
-#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
-#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )
-
-#define INIT_ADS( cpu ) \
-    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
-    pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
-    pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
-
-    INIT8( sad, );
-    INIT8_NAME( sad_aligned, sad, );
-    INIT7( sad_x3, );
-    INIT7( sad_x4, );
-    INIT8( ssd, );
-    INIT8( satd, );
-    INIT7( satd_x3, );
-    INIT7( satd_x4, );
-    INIT4( hadamard_ac, );
-    INIT_ADS( );
-
-    pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
-    pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
-    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
-    pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16;
-    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;
-    pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16;
-    pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8;
-
-    pixf->ssd_nv12_core = pixel_ssd_nv12_core;
-    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
-    pixf->ssim_end4 = ssim_end4;
-    pixf->vsad = pixel_vsad;
-    pixf->asd8 = pixel_asd8;
-
-    pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
-    pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4;
-    pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8;
-    pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8;
-    pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c;
-    pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c;
-    pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c;
-    pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c;
-    pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
-    pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;
-
-#if HIGH_BIT_DEPTH
-#if HAVE_MMX
-    if( cpu&X264_CPU_MMX2 )
-    {
-        INIT7( sad, _mmx2 );
-        INIT7_NAME( sad_aligned, sad, _mmx2 );
-        INIT7( sad_x3, _mmx2 );
-        INIT7( sad_x4, _mmx2 );
-        INIT8( satd, _mmx2 );
-        INIT7( satd_x3, _mmx2 );
-        INIT7( satd_x4, _mmx2 );
-        INIT4( hadamard_ac, _mmx2 );
-        INIT8( ssd, _mmx2 );
-        INIT_ADS( _mmx2 );
-
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
-#if ARCH_X86
-        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_mmx2;
-        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
-#endif
-
-        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
-        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
-        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmx2;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmx2;
-        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmx2;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_mmx2;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmx2;
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
-    }
-    if( cpu&X264_CPU_SSE2 )
-    {
-        INIT4_NAME( sad_aligned, sad, _sse2_aligned );
-        INIT5( ssd, _sse2 );
-        INIT6( satd, _sse2 );
-        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
-
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
-        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
-#if ARCH_X86_64
-        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
-#endif
-        pixf->intra_sad_x3_4x4  = x264_intra_sad_x3_4x4_sse2;
-        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
-        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
-        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_sse2;
-        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_sse2;
-        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
-        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
-    }
-    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
-    {
-        INIT5( sad, _sse2 );
-        INIT2( sad_x3, _sse2 );
-        INIT2( sad_x4, _sse2 );
-        INIT_ADS( _sse2 );
-
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            INIT4( hadamard_ac, _sse2 );
-        }
-        pixf->vsad = x264_pixel_vsad_sse2;
-        pixf->asd8 = x264_pixel_asd8_sse2;
-        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_sse2;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_sse2;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_sse2;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_sse2;
-    }
-    if( cpu&X264_CPU_SSE2_IS_FAST )
-    {
-        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
-        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
-        pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_sse2;
-        pixf->sad_x3[PIXEL_8x4]  = x264_pixel_sad_x3_8x4_sse2;
-        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
-        pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_sse2;
-        pixf->sad_x4[PIXEL_8x4]  = x264_pixel_sad_x4_8x4_sse2;
-    }
-    if( cpu&X264_CPU_SSSE3 )
-    {
-        INIT4_NAME( sad_aligned, sad, _ssse3_aligned );
-        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3;
-        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3;
-        INIT7( sad, _ssse3 );
-        INIT7( sad_x3, _ssse3 );
-        INIT7( sad_x4, _ssse3 );
-        INIT_ADS( _ssse3 );
-        INIT6( satd, _ssse3 );
-        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
-
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            INIT4( hadamard_ac, _ssse3 );
-        }
-        pixf->vsad = x264_pixel_vsad_ssse3;
-        pixf->asd8 = x264_pixel_asd8_ssse3;
-        pixf->intra_sad_x3_4x4  = x264_intra_sad_x3_4x4_ssse3;
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
-#endif
-        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_ssse3;
-        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_ssse3;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_ssse3;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
-    }
-    if( cpu&X264_CPU_SSE4 )
-    {
-        INIT6( satd, _sse4 );
-        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4;
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            INIT4( hadamard_ac, _sse4 );
-        }
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
-#endif
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
-    }
-    if( cpu&X264_CPU_AVX )
-    {
-        INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */
-        INIT_ADS( _avx );
-        INIT6( satd, _avx );
-        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            INIT4( hadamard_ac, _avx );
-        }
-        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_avx;
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
-        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
-        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
-        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
-#endif
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
-    }
-    if( cpu&X264_CPU_XOP )
-    {
-        INIT5( sad_x3, _xop );
-        INIT5( sad_x4, _xop );
-        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
-        pixf->vsad = x264_pixel_vsad_xop;
-        pixf->asd8 = x264_pixel_asd8_xop;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
-#endif
-    }
-    if( cpu&X264_CPU_AVX2 )
-    {
-        INIT2( ssd, _avx2 );
-        INIT2( sad, _avx2 );
-        INIT2_NAME( sad_aligned, sad, _avx2 );
-        INIT2( sad_x3, _avx2 );
-        INIT2( sad_x4, _avx2 );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
-        pixf->vsad = x264_pixel_vsad_avx2;
-        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
-        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
-    }
-#endif // HAVE_MMX
-#else // !HIGH_BIT_DEPTH
-#if HAVE_MMX
-    if( cpu&X264_CPU_MMX )
-    {
-        INIT8( ssd, _mmx );
-    }
-
-    if( cpu&X264_CPU_MMX2 )
-    {
-        INIT8( sad, _mmx2 );
-        INIT8_NAME( sad_aligned, sad, _mmx2 );
-        INIT7( sad_x3, _mmx2 );
-        INIT7( sad_x4, _mmx2 );
-        INIT8( satd, _mmx2 );
-        INIT7( satd_x3, _mmx2 );
-        INIT7( satd_x4, _mmx2 );
-        INIT4( hadamard_ac, _mmx2 );
-        INIT_ADS( _mmx2 );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
-#if ARCH_X86
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
-        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
-        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
-        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
-        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
-        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
-        pixf->vsad = x264_pixel_vsad_mmx2;
-
-        if( cpu&X264_CPU_CACHELINE_32 )
-        {
-            INIT5( sad, _cache32_mmx2 );
-            INIT4( sad_x3, _cache32_mmx2 );
-            INIT4( sad_x4, _cache32_mmx2 );
-        }
-        else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
-        {
-            INIT5( sad, _cache64_mmx2 );
-            INIT4( sad_x3, _cache64_mmx2 );
-            INIT4( sad_x4, _cache64_mmx2 );
-        }
-#else
-        if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
-        {
-            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
-            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmx2;
-            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmx2;
-            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2;
-            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmx2;
-            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2;
-            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmx2;
-        }
-#endif
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmx2;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_mmx2;
-        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmx2;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmx2;
-        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmx2;
-        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
-        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
-    }
-
-    if( cpu&X264_CPU_SSE2 )
-    {
-        INIT5( ssd, _sse2slow );
-        INIT2_NAME( sad_aligned, sad, _sse2_aligned );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
-        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_sse2;
-        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
-        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
-        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
-#if ARCH_X86_64
-        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
-#endif
-        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_sse2;
-        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_sse2;
-        pixf->vsad = x264_pixel_vsad_sse2;
-        pixf->asd8 = x264_pixel_asd8_sse2;
-    }
-
-    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
-    {
-        INIT2( sad, _sse2 );
-        INIT2( sad_x3, _sse2 );
-        INIT2( sad_x4, _sse2 );
-        INIT6( satd, _sse2 );
-        pixf->satd[PIXEL_4x16]   = x264_pixel_satd_4x16_sse2;
-        INIT6( satd_x3, _sse2 );
-        INIT6( satd_x4, _sse2 );
-        INIT4( hadamard_ac, _sse2 );
-        INIT_ADS( _sse2 );
-        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
-        pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
-        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_sse2;
-        if( cpu&X264_CPU_CACHELINE_64 )
-        {
-            INIT2( ssd, _sse2); /* faster for width 16 on p4 */
-#if ARCH_X86
-            INIT2( sad, _cache64_sse2 );
-            INIT2( sad_x3, _cache64_sse2 );
-            INIT2( sad_x4, _cache64_sse2 );
-#endif
-           if( cpu&X264_CPU_SSE2_IS_FAST )
-           {
-               pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
-               pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
-           }
-        }
-    }
-
-    if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
-    {
-        pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
-        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
-        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
-        pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2;
-        pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2;
-        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
-        pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2;
-        pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2;
-    }
-
-    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
-    {
-        INIT2( sad, _sse3 );
-        INIT2( sad_x3, _sse3 );
-        INIT2( sad_x4, _sse3 );
-    }
-
-    if( cpu&X264_CPU_SSSE3 )
-    {
-        INIT4( hadamard_ac, _ssse3 );
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_ssse3;
-            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3;
-            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_ssse3;
-#if ARCH_X86_64
-            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
-#endif
-        }
-        INIT_ADS( _ssse3 );
-        if( cpu&X264_CPU_SLOW_ATOM )
-        {
-            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
-            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3_atom;
-            INIT6( satd, _ssse3_atom );
-            pixf->satd[PIXEL_4x16]  = x264_pixel_satd_4x16_ssse3_atom;
-            INIT6( satd_x3, _ssse3_atom );
-            INIT6( satd_x4, _ssse3_atom );
-            INIT4( hadamard_ac, _ssse3_atom );
-#if ARCH_X86_64
-            pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
-#endif
-        }
-        else
-        {
-            INIT8( ssd, _ssse3 );
-            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
-            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
-            INIT8( satd, _ssse3 );
-            INIT7( satd_x3, _ssse3 );
-            INIT7( satd_x4, _ssse3 );
-#if ARCH_X86_64
-            pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
-#endif
-        }
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
-        if( !(cpu&X264_CPU_SLOW_PSHUFB) )
-            pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
-        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
-        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
-        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
-        pixf->asd8 = x264_pixel_asd8_ssse3;
-        if( cpu&X264_CPU_CACHELINE_64 )
-        {
-            INIT2( sad, _cache64_ssse3 );
-            INIT2( sad_x3, _cache64_ssse3 );
-            INIT2( sad_x4, _cache64_ssse3 );
-        }
-        else
-        {
-            INIT2( sad_x3, _ssse3 );
-            INIT5( sad_x4, _ssse3 );
-        }
-        if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
-        {
-            INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
-        }
-    }
-
-    if( cpu&X264_CPU_SSE4 )
-    {
-        INIT8( satd, _sse4 );
-        INIT7( satd_x3, _sse4 );
-        INIT7( satd_x4, _sse4 );
-        INIT4( hadamard_ac, _sse4 );
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_sse4;
-            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4;
-            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_sse4;
-#if ARCH_X86_64
-            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_sse4;
-#endif
-        }
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
-#endif
-    }
-
-    if( cpu&X264_CPU_AVX )
-    {
-        INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs doesn't benefit from an aligned version */
-        INIT2( sad_x3, _avx );
-        INIT2( sad_x4, _avx );
-        INIT8( satd, _avx );
-        INIT7( satd_x3, _avx );
-        INIT7( satd_x4, _avx );
-        INIT_ADS( _avx );
-        INIT4( hadamard_ac, _avx );
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_avx;
-            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
-            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_avx;
-#if ARCH_X86_64
-            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_avx;
-#endif
-        }
-        INIT5( ssd, _avx );
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
-        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
-        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
-        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
-#endif
-    }
-
-    if( cpu&X264_CPU_XOP )
-    {
-        INIT7( satd, _xop );
-        INIT7( satd_x3, _xop );
-        INIT7( satd_x4, _xop );
-        INIT4( hadamard_ac, _xop );
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
-        }
-        INIT5( ssd, _xop );
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_xop;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
-        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
-        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
-        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
-#endif
-    }
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        INIT2( ssd, _avx2 );
-        INIT2( sad_x3, _avx2 );
-        INIT2( sad_x4, _avx2 );
-        INIT4( satd, _avx2 );
-        INIT2( hadamard_ac, _avx2 );
-        INIT_ADS( _avx2 );
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx2;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
-        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_avx2;
-        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_avx2;
-        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2;
-        pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_avx2;
-        pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2;
-        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
-#endif
-    }
-#endif //HAVE_MMX
-
-#if HAVE_ARMV6
-    if( cpu&X264_CPU_ARMV6 )
-    {
-        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
-        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
-        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
-        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
-    }
-    if( cpu&X264_CPU_NEON )
-    {
-        INIT5( sad, _neon );
-        INIT5( sad_aligned, _neon );
-        INIT7( sad_x3, _neon );
-        INIT7( sad_x4, _neon );
-        INIT7( ssd, _neon );
-        INIT7( satd, _neon );
-        INIT7( satd_x3, _neon );
-        INIT7( satd_x4, _neon );
-        INIT4( hadamard_ac, _neon );
-        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
-        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
-        pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
-        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
-        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
-        pixf->vsad = x264_pixel_vsad_neon;
-        pixf->asd8 = x264_pixel_asd8_neon;
-
-        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
-        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
-        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
-        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
-        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
-
-        pixf->ssd_nv12_core     = x264_pixel_ssd_nv12_core_neon;
-        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
-        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
-
-        if( cpu&X264_CPU_FAST_NEON_MRC )
-        {
-            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
-            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
-            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
-            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
-        }
-        else    // really just scheduled for dual issue / A8
-        {
-            INIT5( sad_aligned, _neon_dual );
-        }
-    }
-#endif
-
-#if ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
-        INIT8( sad, _neon );
-        // AArch64 has no distinct instructions for aligned load/store
-        INIT8_NAME( sad_aligned, sad, _neon );
-        INIT7( sad_x3, _neon );
-        INIT7( sad_x4, _neon );
-        INIT8( ssd, _neon );
-        INIT8( satd, _neon );
-        INIT7( satd_x3, _neon );
-        INIT7( satd_x4, _neon );
-        INIT4( hadamard_ac, _neon );
-
-        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
-
-        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
-        pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
-        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
-        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
-        pixf->vsad = x264_pixel_vsad_neon;
-        pixf->asd8 = x264_pixel_asd8_neon;
-
-        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
-        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
-        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
-        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
-        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
-        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
-        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
-        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
-
-        pixf->ssd_nv12_core     = x264_pixel_ssd_nv12_core_neon;
-        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
-        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
-    }
-#endif // ARCH_AARCH64
-
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        INIT8( sad, _msa );
-        INIT8_NAME( sad_aligned, sad, _msa );
-        INIT8( ssd, _msa );
-        INIT7( sad_x3, _msa );
-        INIT7( sad_x4, _msa );
-        INIT8( satd, _msa );
-        INIT4( hadamard_ac, _msa );
-
-        pixf->intra_sad_x3_4x4   = x264_intra_sad_x3_4x4_msa;
-        pixf->intra_sad_x3_8x8   = x264_intra_sad_x3_8x8_msa;
-        pixf->intra_sad_x3_8x8c  = x264_intra_sad_x3_8x8c_msa;
-        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_msa;
-        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_msa;
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa;
-        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_msa;
-        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_msa;
-
-        pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa;
-
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_msa;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_msa;
-        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_msa;
-        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_msa;
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
-        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
-    }
-#endif // HAVE_MSA
-
-#endif // HIGH_BIT_DEPTH
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-    {
-        x264_pixel_altivec_init( pixf );
-    }
-#endif
-
-    pixf->ads[PIXEL_8x16] =
-    pixf->ads[PIXEL_8x4] =
-    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
-    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
-}
-
diff --git a/android/src/main/libenc/jni/libx264/common/pixel.h b/android/src/main/libenc/jni/libx264/common/pixel.h
deleted file mode 100755
index 35be44c..0000000
--- a/android/src/main/libenc/jni/libx264/common/pixel.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*****************************************************************************
- * pixel.c: pixel metrics
- *****************************************************************************
- * Copyright (C) 2004-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
-            Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PIXEL_H
-#define X264_PIXEL_H
-
-// SSD assumes all args aligned
-// other cmp functions assume first arg aligned
-typedef int  (*x264_pixel_cmp_t) ( pixel *, intptr_t, pixel *, intptr_t );
-typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, intptr_t, int[3] );
-typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int[4] );
-
-enum
-{
-    PIXEL_16x16 = 0,
-    PIXEL_16x8  = 1,
-    PIXEL_8x16  = 2,
-    PIXEL_8x8   = 3,
-    PIXEL_8x4   = 4,
-    PIXEL_4x8   = 5,
-    PIXEL_4x4   = 6,
-
-    /* Subsampled chroma only */
-    PIXEL_4x16  = 7,  /* 4:2:2 */
-    PIXEL_4x2   = 8,
-    PIXEL_2x8   = 9,  /* 4:2:2 */
-    PIXEL_2x4   = 10,
-    PIXEL_2x2   = 11,
-};
-
-static const struct { uint8_t w, h; } x264_pixel_size[12] =
-{
-    { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 },
-    {  4, 16 }, {  4, 2 }, { 2,  8 }, { 2, 4 }, { 2, 2 },
-};
-
-static const uint8_t x264_size2pixel[5][5] =
-{
-    { 0, },
-    { 0, PIXEL_4x4, PIXEL_8x4, 0, 0 },
-    { 0, PIXEL_4x8, PIXEL_8x8, 0, PIXEL_16x8 },
-    { 0, },
-    { 0, 0,        PIXEL_8x16, 0, PIXEL_16x16 }
-};
-
-static const uint8_t x264_luma2chroma_pixel[4][7] =
-{
-    { 0 },
-    { PIXEL_8x8,   PIXEL_8x4,  PIXEL_4x8,  PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */
-    { PIXEL_8x16,  PIXEL_8x8,  PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */
-    { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */
-};
-
-typedef struct
-{
-    x264_pixel_cmp_t  sad[8];
-    x264_pixel_cmp_t  ssd[8];
-    x264_pixel_cmp_t satd[8];
-    x264_pixel_cmp_t ssim[7];
-    x264_pixel_cmp_t sa8d[4];
-    x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */
-    x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */
-    x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */
-    x264_pixel_cmp_x3_t fpelcmp_x3[7];
-    x264_pixel_cmp_x4_t fpelcmp_x4[7];
-    x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
-    int (*vsad)( pixel *, intptr_t, int );
-    int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-    uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-
-    uint64_t (*var[4])( pixel *pix, intptr_t stride );
-    int (*var2[4])( pixel *pix1, intptr_t stride1,
-                    pixel *pix2, intptr_t stride2, int *ssd );
-    uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride );
-
-    void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1,
-                           pixel *pixuv2, intptr_t stride2, int width, int height,
-                           uint64_t *ssd_u, uint64_t *ssd_v );
-    void (*ssim_4x4x2_core)( const pixel *pix1, intptr_t stride1,
-                             const pixel *pix2, intptr_t stride2, int sums[2][4] );
-    float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
-
-    /* multiple parallel calls to cmp. */
-    x264_pixel_cmp_x3_t sad_x3[7];
-    x264_pixel_cmp_x4_t sad_x4[7];
-    x264_pixel_cmp_x3_t satd_x3[7];
-    x264_pixel_cmp_x4_t satd_x4[7];
-
-    /* abs-diff-sum for successive elimination.
-     * may round width up to a multiple of 16. */
-    int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
-                   uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
-
-    /* calculate satd or sad of V, H, and DC modes. */
-    void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_sad_x3_16x16)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_mbcmp_x3_4x4)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_satd_x3_4x4)   ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_sad_x3_4x4)    ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_sad_x3_chroma)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_satd_x3_8x16c)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_sad_x3_8x16c)   ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_mbcmp_x3_8x8c)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_satd_x3_8x8c)   ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_sad_x3_8x8c)    ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_mbcmp_x3_8x8)  ( pixel *fenc, pixel edge[36], int res[3] );
-    void (*intra_sa8d_x3_8x8)   ( pixel *fenc, pixel edge[36], int res[3] );
-    void (*intra_sad_x3_8x8)    ( pixel *fenc, pixel edge[36], int res[3] );
-    /* find minimum satd or sad of all modes, and set fdec.
-     * may be NULL, in which case just use pred+satd instead. */
-    int (*intra_mbcmp_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
-    int (*intra_satd_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
-    int (*intra_sad_x9_4x4)  ( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
-    int (*intra_mbcmp_x9_8x8)( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
-    int (*intra_sa8d_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
-    int (*intra_sad_x9_8x8)  ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
-} x264_pixel_function_t;
-
-void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
-void x264_pixel_ssd_nv12   ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
-                             int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v );
-uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
-                             int i_width, int i_height );
-float x264_pixel_ssim_wxh  ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
-                             int i_width, int i_height, void *buf, int *cnt );
-int x264_field_vsad( x264_t *h, int mb_x, int mb_y );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/dct.c b/android/src/main/libenc/jni/libx264/common/ppc/dct.c
deleted file mode 100755
index 901659e..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/dct.c
+++ /dev/null
@@ -1,490 +0,0 @@
-/*****************************************************************************
- * dct.c: ppc transform and zigzag
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
- *          Eric Petit <eric.petit@lapsus.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "ppccommon.h"
-
-#if !HIGH_BIT_DEPTH
-#define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \
-    b1 = vec_add( a0, a3 );              \
-    b3 = vec_add( a1, a2 );              \
-    b0 = vec_add( b1, b3 );              \
-    b2 = vec_sub( b1, b3 );              \
-    a0 = vec_sub( a0, a3 );              \
-    a1 = vec_sub( a1, a2 );              \
-    b1 = vec_add( a0, a0 );              \
-    b1 = vec_add( b1, a1 );              \
-    b3 = vec_sub( a0, a1 );              \
-    b3 = vec_sub( b3, a1 )
-
-void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
-{
-    PREP_DIFF_8BYTEALIGNED;
-    vec_s16_t dct0v, dct1v, dct2v, dct3v;
-    vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v;
-
-    vec_u8_t permHighv;
-
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );
-    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
-    VEC_TRANSPOSE_4( tmp0v, tmp1v, tmp2v, tmp3v,
-                     dct0v, dct1v, dct2v, dct3v );
-    permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
-    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
-
-    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,  dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
-}
-
-void x264_sub8x8_dct_altivec( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
-{
-    PREP_DIFF_8BYTEALIGNED;
-    vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v;
-    vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v;
-
-    vec_u8_t permHighv, permLowv;
-
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
-    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
-    VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
-    VEC_TRANSPOSE_8( tmp0v, tmp1v, tmp2v, tmp3v,
-                     tmp4v, tmp5v, tmp6v, tmp7v,
-                     dct0v, dct1v, dct2v, dct3v,
-                     dct4v, dct5v, dct6v, dct7v );
-
-    permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
-    permLowv  = (vec_u8_t) CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
-
-    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
-    VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
-
-    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,   *dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16,  *dct);
-    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32,  *dct);
-    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48,  *dct);
-    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64,  *dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permLowv),  80,  *dct);
-    vec_st(vec_perm(tmp4v, tmp5v, permLowv),  96,  *dct);
-    vec_st(vec_perm(tmp6v, tmp7v, permLowv),  112, *dct);
-}
-
-void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_sub8x8_dct_altivec( &dct[ 0], &pix1[0], &pix2[0] );
-    x264_sub8x8_dct_altivec( &dct[ 4], &pix1[8], &pix2[8] );
-    x264_sub8x8_dct_altivec( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
-    x264_sub8x8_dct_altivec( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
-}
-
-/***************************************************************************
- * 8x8 transform:
- ***************************************************************************/
-
-/* DCT8_1D unrolled by 8 in Altivec */
-#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
-{ \
-    /* int s07 = SRC(0) + SRC(7);         */ \
-    vec_s16_t s07v = vec_add( dct0v, dct7v); \
-    /* int s16 = SRC(1) + SRC(6);         */ \
-    vec_s16_t s16v = vec_add( dct1v, dct6v); \
-    /* int s25 = SRC(2) + SRC(5);         */ \
-    vec_s16_t s25v = vec_add( dct2v, dct5v); \
-    /* int s34 = SRC(3) + SRC(4);         */ \
-    vec_s16_t s34v = vec_add( dct3v, dct4v); \
-\
-    /* int a0 = s07 + s34;                */ \
-    vec_s16_t a0v = vec_add(s07v, s34v);     \
-    /* int a1 = s16 + s25;                */ \
-    vec_s16_t a1v = vec_add(s16v, s25v);     \
-    /* int a2 = s07 - s34;                */ \
-    vec_s16_t a2v = vec_sub(s07v, s34v);     \
-    /* int a3 = s16 - s25;                */ \
-    vec_s16_t a3v = vec_sub(s16v, s25v);     \
-\
-    /* int d07 = SRC(0) - SRC(7);         */ \
-    vec_s16_t d07v = vec_sub( dct0v, dct7v); \
-    /* int d16 = SRC(1) - SRC(6);         */ \
-    vec_s16_t d16v = vec_sub( dct1v, dct6v); \
-    /* int d25 = SRC(2) - SRC(5);         */ \
-    vec_s16_t d25v = vec_sub( dct2v, dct5v); \
-    /* int d34 = SRC(3) - SRC(4);         */ \
-    vec_s16_t d34v = vec_sub( dct3v, dct4v); \
-\
-    /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
-    vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\
-    /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
-    vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\
-    /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
-    vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\
-    /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
-    vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\
-\
-    /* DST(0) =  a0 + a1;                    */ \
-    dct0v = vec_add( a0v, a1v );                \
-    /* DST(1) =  a4 + (a7>>2);               */ \
-    dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \
-    /* DST(2) =  a2 + (a3>>1);               */ \
-    dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \
-    /* DST(3) =  a5 + (a6>>2);               */ \
-    dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \
-    /* DST(4) =  a0 - a1;                    */ \
-    dct4v = vec_sub( a0v, a1v );                \
-    /* DST(5) =  a6 - (a5>>2);               */ \
-    dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \
-    /* DST(6) = (a2>>1) - a3 ;               */ \
-    dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \
-    /* DST(7) = (a4>>2) - a7 ;               */ \
-    dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \
-}
-
-
-void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
-{
-    vec_u16_t onev = vec_splat_u16(1);
-    vec_u16_t twov = vec_add( onev, onev );
-
-    PREP_DIFF_8BYTEALIGNED;
-
-    vec_s16_t dct0v, dct1v, dct2v, dct3v,
-              dct4v, dct5v, dct6v, dct7v;
-
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
-
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
-    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
-
-    DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
-                     dct4v, dct5v, dct6v, dct7v );
-
-    vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
-        dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;
-
-    VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
-                    dct4v, dct5v, dct6v, dct7v,
-                    dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
-                    dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
-
-    DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
-                     dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
-
-    vec_st( dct_tr0v,  0,  dct );
-    vec_st( dct_tr1v, 16,  dct );
-    vec_st( dct_tr2v, 32,  dct );
-    vec_st( dct_tr3v, 48,  dct );
-
-    vec_st( dct_tr4v, 64,  dct );
-    vec_st( dct_tr5v, 80,  dct );
-    vec_st( dct_tr6v, 96,  dct );
-    vec_st( dct_tr7v, 112, dct );
-}
-
-void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
-{
-    x264_sub8x8_dct8_altivec( dct[0], &pix1[0],               &pix2[0] );
-    x264_sub8x8_dct8_altivec( dct[1], &pix1[8],               &pix2[8] );
-    x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
-    x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
-}
-
-
-/****************************************************************************
- * IDCT transform:
- ****************************************************************************/
-
-#define IDCT_1D_ALTIVEC(s0, s1, s2, s3,  d0, d1, d2, d3) \
-{                                                        \
-    /*        a0  = SRC(0) + SRC(2); */                  \
-    vec_s16_t a0v = vec_add(s0, s2);                     \
-    /*        a1  = SRC(0) - SRC(2); */                  \
-    vec_s16_t a1v = vec_sub(s0, s2);                     \
-    /*        a2  =           (SRC(1)>>1) - SRC(3); */   \
-    vec_s16_t a2v = vec_sub(vec_sra(s1, onev), s3);      \
-    /*        a3  =           (SRC(3)>>1) + SRC(1); */   \
-    vec_s16_t a3v = vec_add(vec_sra(s3, onev), s1);      \
-    /* DST(0,    a0 + a3); */                            \
-    d0 = vec_add(a0v, a3v);                              \
-    /* DST(1,    a1 + a2); */                            \
-    d1 = vec_add(a1v, a2v);                              \
-    /* DST(2,    a1 - a2); */                            \
-    d2 = vec_sub(a1v, a2v);                              \
-    /* DST(3,    a0 - a3); */                            \
-    d3 = vec_sub(a0v, a3v);                              \
-}
-
-#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)             \
-    vdst_orig = vec_ld(0, dst);                      \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
-    vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst); \
-    va = vec_add(va, vdst_ss);                       \
-    va_u8 = vec_s16_to_u8(va);                       \
-    va_u32 = vec_splat((vec_u32_t)va_u8, 0);         \
-    vec_ste(va_u32, element, (uint32_t*)dst);
-
-#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv, perm_ldv)          \
-{                                                               \
-    /* unaligned load */                                        \
-    vec_u8_t lv = vec_ld(0, dest);                              \
-    vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
-    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                  \
-    vec_u16_t dst16 = vec_u8_to_u16_h(dstv);                    \
-    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);   \
-    vec_u8_t idstsum8 = vec_s16_to_u8(idstsum);                 \
-    /* unaligned store */                                       \
-    vec_u32_t bodyv = vec_splat((vec_u32_t)idstsum8, 0);        \
-    int element = ((unsigned long)dest & 0xf) >> 2;             \
-    vec_ste(bodyv, element, (uint32_t *)dest);                  \
-}
-
-void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
-{
-    vec_u16_t onev = vec_splat_u16(1);
-
-    dct[0] += 32; // rounding for the >>6 at the end
-
-    vec_s16_t s0, s1, s2, s3;
-
-    s0 = vec_ld( 0x00, dct );
-    s1 = vec_sld( s0, s0, 8 );
-    s2 = vec_ld( 0x10, dct );
-    s3 = vec_sld( s2, s2, 8 );
-
-    vec_s16_t d0, d1, d2, d3;
-    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );
-
-    vec_s16_t tr0, tr1, tr2, tr3;
-
-    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );
-
-    vec_s16_t idct0, idct1, idct2, idct3;
-    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );
-
-    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
-    vec_u16_t sixv = vec_splat_u16(6);
-    LOAD_ZERO;
-
-    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
-    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
-    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
-    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
-}
-
-void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] )
-{
-    x264_add4x4_idct_altivec( &p_dst[0],               dct[0] );
-    x264_add4x4_idct_altivec( &p_dst[4],               dct[1] );
-    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2] );
-    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+4], dct[3] );
-}
-
-void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] )
-{
-    x264_add8x8_idct_altivec( &p_dst[0],               &dct[0] );
-    x264_add8x8_idct_altivec( &p_dst[8],               &dct[4] );
-    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
-    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
-}
-
-#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7)\
-{\
-    /*        a0  = SRC(0) + SRC(4); */ \
-    vec_s16_t a0v = vec_add(s0, s4);    \
-    /*        a2  = SRC(0) - SRC(4); */ \
-    vec_s16_t a2v = vec_sub(s0, s4);    \
-    /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
-    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6);    \
-    /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
-    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2);    \
-    /*        b0  =         a0 + a6; */ \
-    vec_s16_t b0v = vec_add(a0v, a6v);  \
-    /*        b2  =         a2 + a4; */ \
-    vec_s16_t b2v = vec_add(a2v, a4v);  \
-    /*        b4  =         a2 - a4; */ \
-    vec_s16_t b4v = vec_sub(a2v, a4v);  \
-    /*        b6  =         a0 - a6; */ \
-    vec_s16_t b6v = vec_sub(a0v, a6v);  \
-    /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
-    /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
-    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) );\
-    /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
-    /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
-    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
-    /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
-    /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */  \
-    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
-    /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */  \
-    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
-    /*        b1 =                  (a7>>2)  +  a1; */  \
-    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v);  \
-    /*        b3 =          a3 +        (a5>>2); */     \
-    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov));   \
-    /*        b5 =                  (a3>>2)  -   a5; */ \
-    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v);  \
-    /*        b7 =           a7 -        (a1>>2); */    \
-    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov));  \
-    /* DST(0,    b0 + b7); */ \
-    d0 = vec_add(b0v, b7v); \
-    /* DST(1,    b2 + b5); */ \
-    d1 = vec_add(b2v, b5v); \
-    /* DST(2,    b4 + b3); */ \
-    d2 = vec_add(b4v, b3v); \
-    /* DST(3,    b6 + b1); */ \
-    d3 = vec_add(b6v, b1v); \
-    /* DST(4,    b6 - b1); */ \
-    d4 = vec_sub(b6v, b1v); \
-    /* DST(5,    b4 - b3); */ \
-    d5 = vec_sub(b4v, b3v); \
-    /* DST(6,    b2 - b5); */ \
-    d6 = vec_sub(b2v, b5v); \
-    /* DST(7,    b0 - b7); */ \
-    d7 = vec_sub(b0v, b7v); \
-}
-
-#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel)\
-{\
-    /* unaligned load */                                       \
-    vec_u8_t hv = vec_ld( 0, dest );                           \
-    vec_u8_t lv = vec_ld( 7, dest );                           \
-    vec_u8_t dstv   = vec_perm( hv, lv, (vec_u8_t)perm_ldv );  \
-    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16_t dst16 = vec_u8_to_u16_h(dstv);                   \
-    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
-    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
-    /* unaligned store */                                      \
-    vec_u8_t bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
-    lv    = vec_sel( lv, bodyv, edgelv );                      \
-    vec_st( lv, 7, dest );                                     \
-    hv    = vec_ld( 0, dest );                                 \
-    vec_u8_t edgehv = vec_perm( zero_u8v, sel, perm_stv );     \
-    hv    = vec_sel( hv, bodyv, edgehv );                      \
-    vec_st( hv, 0, dest );                                     \
-}
-
-void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
-{
-    vec_u16_t onev = vec_splat_u16(1);
-    vec_u16_t twov = vec_splat_u16(2);
-
-    dct[0] += 32; // rounding for the >>6 at the end
-
-    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-    s0 = vec_ld(0x00, dct);
-    s1 = vec_ld(0x10, dct);
-    s2 = vec_ld(0x20, dct);
-    s3 = vec_ld(0x30, dct);
-    s4 = vec_ld(0x40, dct);
-    s5 = vec_ld(0x50, dct);
-    s6 = vec_ld(0x60, dct);
-    s7 = vec_ld(0x70, dct);
-
-    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
-    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7);
-
-    vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;
-
-    VEC_TRANSPOSE_8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
-                    tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7);
-
-    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
-    IDCT8_1D_ALTIVEC(tr0,     tr1,   tr2,   tr3,   tr4,   tr5,   tr6,   tr7,
-                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
-
-    vec_u8_t perm_ldv = vec_lvsl(0, dst);
-    vec_u8_t perm_stv = vec_lvsr(8, dst);
-    vec_u16_t sixv = vec_splat_u16(6);
-    const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
-    LOAD_ZERO;
-
-    ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
-    ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
-}
-
-void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] )
-{
-    x264_add8x8_idct8_altivec( &dst[0],               dct[0] );
-    x264_add8x8_idct8_altivec( &dst[8],               dct[1] );
-    x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] );
-    x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] );
-}
-
-void x264_zigzag_scan_4x4_frame_altivec( int16_t level[16], int16_t dct[16] )
-{
-    vec_s16_t dct0v, dct1v;
-    vec_s16_t tmp0v, tmp1v;
-
-    dct0v = vec_ld(0x00, dct);
-    dct1v = vec_ld(0x10, dct);
-
-    const vec_u8_t sel0 = (vec_u8_t) CV(0,1,8,9,2,3,4,5,10,11,16,17,24,25,18,19);
-    const vec_u8_t sel1 = (vec_u8_t) CV(12,13,6,7,14,15,20,21,26,27,28,29,22,23,30,31);
-
-    tmp0v = vec_perm( dct0v, dct1v, sel0 );
-    tmp1v = vec_perm( dct0v, dct1v, sel1 );
-
-    vec_st( tmp0v, 0x00, level );
-    vec_st( tmp1v, 0x10, level );
-}
-
-void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[16] )
-{
-    vec_s16_t dct0v, dct1v;
-    vec_s16_t tmp0v, tmp1v;
-
-    dct0v = vec_ld(0x00, dct);
-    dct1v = vec_ld(0x10, dct);
-
-    const vec_u8_t sel0 = (vec_u8_t) CV(0,1,2,3,8,9,4,5,6,7,10,11,12,13,14,15);
-
-    tmp0v = vec_perm( dct0v, dct1v, sel0 );
-    tmp1v = dct1v;
-
-    vec_st( tmp0v, 0x00, level );
-    vec_st( tmp1v, 0x10, level );
-}
-#endif // !HIGH_BIT_DEPTH
-
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/dct.h b/android/src/main/libenc/jni/libx264/common/ppc/dct.h
deleted file mode 100755
index 332f3cc..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/dct.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*****************************************************************************
- * dct.h: ppc transform and zigzag
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Eric Petit <eric.petit@lapsus.org>
- *          Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PPC_DCT_H
-#define X264_PPC_DCT_H
-
-void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_altivec( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[16] );
-void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] );
-void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] );
-
-void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] );
-void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] );
-
-void x264_zigzag_scan_4x4_frame_altivec( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[16] );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/deblock.c b/android/src/main/libenc/jni/libx264/common/ppc/deblock.c
deleted file mode 100755
index bf5c833..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/deblock.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*****************************************************************************
- * deblock.c: ppc deblocking
- *****************************************************************************
- * Copyright (C) 2007-2016 x264 project
- *
- * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "ppccommon.h"
-
-#if !HIGH_BIT_DEPTH
-#define transpose4x16(r0, r1, r2, r3)        \
-{                                            \
-    register vec_u8_t r4;                    \
-    register vec_u8_t r5;                    \
-    register vec_u8_t r6;                    \
-    register vec_u8_t r7;                    \
-                                             \
-    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
-    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
-    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
-    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
-                                             \
-    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
-    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
-    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
-    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
-}
-
-static inline void write16x4( uint8_t *dst, int dst_stride,
-                              register vec_u8_t r0, register vec_u8_t r1,
-                              register vec_u8_t r2, register vec_u8_t r3 )
-{
-    ALIGNED_16(unsigned char result[64]);
-    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
-    int int_dst_stride = dst_stride >> 2;
-
-    vec_st(r0, 0, result);
-    vec_st(r1, 16, result);
-    vec_st(r2, 32, result);
-    vec_st(r3, 48, result);
-    /* FIXME: there has to be a better way!!!! */
-    *dst_int = *src_int;
-    *(dst_int+   int_dst_stride) = *(src_int + 1);
-    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
-    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
-    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
-    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
-    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
-    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
-    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
-    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
-    *(dst_int+10*int_dst_stride) = *(src_int + 10);
-    *(dst_int+11*int_dst_stride) = *(src_int + 11);
-    *(dst_int+12*int_dst_stride) = *(src_int + 12);
-    *(dst_int+13*int_dst_stride) = *(src_int + 13);
-    *(dst_int+14*int_dst_stride) = *(src_int + 14);
-    *(dst_int+15*int_dst_stride) = *(src_int + 15);
-}
-
-/** \brief performs a 6x16 transpose of data in src, and stores it to dst */
-#define read_and_transpose16x6(src, src_stride, r8, r9, r10, r11, r12, r13)\
-{\
-    register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\
-    VEC_LOAD(src,                  r0, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +    src_stride,  r1, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +  2*src_stride,  r2, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +  3*src_stride,  r3, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +  4*src_stride,  r4, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +  5*src_stride,  r5, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +  6*src_stride,  r6, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src +  7*src_stride,  r7, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t, pix );    \
-                                                               \
-    VEC_LOAD(src + 8*src_stride,   r8, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 9*src_stride,   r9, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t, pix );    \
-    VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t, pix );    \
-                                                               \
-    /*Merge first pairs*/                                      \
-    r0 = vec_mergeh(r0, r8);    /*0, 8*/                       \
-    r1 = vec_mergeh(r1, r9);    /*1, 9*/                       \
-    r2 = vec_mergeh(r2, r10);   /*2,10*/                       \
-    r3 = vec_mergeh(r3, r11);   /*3,11*/                       \
-    r4 = vec_mergeh(r4, r12);   /*4,12*/                       \
-    r5 = vec_mergeh(r5, r13);   /*5,13*/                       \
-    r6 = vec_mergeh(r6, r14);   /*6,14*/                       \
-    r7 = vec_mergeh(r7, r15);   /*7,15*/                       \
-                                                               \
-    /*Merge second pairs*/                                     \
-    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/            \
-    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/            \
-    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/            \
-    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/            \
-    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/            \
-    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/            \
-    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/            \
-    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/            \
-                                                               \
-    /*Third merge*/                                            \
-    r0 = vec_mergeh(r8, r12);   /*0,2,4,6,8,10,12,14 set 0*/   \
-    r1 = vec_mergel(r8, r12);   /*0,2,4,6,8,10,12,14 set 1*/   \
-    r2 = vec_mergeh(r9, r13);   /*0,2,4,6,8,10,12,14 set 2*/   \
-    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/   \
-    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/   \
-    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/   \
-    /* Don't need to compute 3 and 7*/                         \
-                                                               \
-    /*Final merge*/                                            \
-    r8  = vec_mergeh(r0, r4);   /*all set 0*/                  \
-    r9  = vec_mergel(r0, r4);   /*all set 1*/                  \
-    r10 = vec_mergeh(r1, r5);   /*all set 2*/                  \
-    r11 = vec_mergel(r1, r5);   /*all set 3*/                  \
-    r12 = vec_mergeh(r2, r6);   /*all set 4*/                  \
-    r13 = vec_mergel(r2, r6);   /*all set 5*/                  \
-    /* Don't need to compute 14 and 15*/                       \
-                                                               \
-}
-
-// out: o = |x-y| < a
-static inline vec_u8_t diff_lt_altivec( register vec_u8_t x, register vec_u8_t y, register vec_u8_t a )
-{
-    register vec_u8_t diff = vec_subs(x, y);
-    register vec_u8_t diffneg = vec_subs(y, x);
-    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
-    o = (vec_u8_t)vec_cmplt(o, a);
-    return o;
-}
-
-static inline vec_u8_t h264_deblock_mask( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t q0,
-                                          register vec_u8_t q1, register vec_u8_t alpha, register vec_u8_t beta )
-{
-    register vec_u8_t mask;
-    register vec_u8_t tempmask;
-
-    mask = diff_lt_altivec(p0, q0, alpha);
-    tempmask = diff_lt_altivec(p1, p0, beta);
-    mask = vec_and(mask, tempmask);
-    tempmask = diff_lt_altivec(q1, q0, beta);
-    mask = vec_and(mask, tempmask);
-
-    return mask;
-}
-
-// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
-static inline vec_u8_t h264_deblock_q1( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t p2,
-                                        register vec_u8_t q0, register vec_u8_t tc0 )
-{
-
-    register vec_u8_t average = vec_avg(p0, q0);
-    register vec_u8_t temp;
-    register vec_u8_t uncliped;
-    register vec_u8_t ones;
-    register vec_u8_t max;
-    register vec_u8_t min;
-    register vec_u8_t newp1;
-
-    temp = vec_xor(average, p2);
-    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
-    ones = vec_splat_u8(1);
-    temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
-    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
-    max = vec_adds(p1, tc0);
-    min = vec_subs(p1, tc0);
-    newp1 = vec_max(min, uncliped);
-    newp1 = vec_min(max, newp1);
-    return newp1;
-}
-
-#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked)                                           \
-{                                                                                               \
-    const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                             \
-                                                                                                \
-    register vec_u8_t pq0bit = vec_xor(p0,q0);                                                  \
-    register vec_u8_t q1minus;                                                                  \
-    register vec_u8_t p0minus;                                                                  \
-    register vec_u8_t stage1;                                                                   \
-    register vec_u8_t stage2;                                                                   \
-    register vec_u8_t vec160;                                                                   \
-    register vec_u8_t delta;                                                                    \
-    register vec_u8_t deltaneg;                                                                 \
-                                                                                                \
-    q1minus = vec_nor(q1, q1);                /* 255 - q1 */                                    \
-    stage1 = vec_avg(p1, q1minus);            /* (p1 - q1 + 256)>>1 */                          \
-    stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */    \
-    p0minus = vec_nor(p0, p0);                /* 255 - p0 */                                    \
-    stage1 = vec_avg(q0, p0minus);            /* (q0 - p0 + 256)>>1 */                          \
-    pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                  \
-    stage2 = vec_avg(stage2, pq0bit);         /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */\
-    stage2 = vec_adds(stage2, stage1);        /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
-    vec160 = vec_ld(0, &A0v);                                                                   \
-    deltaneg = vec_subs(vec160, stage2);      /* -d */                                          \
-    delta = vec_subs(stage2, vec160);         /*  d */                                          \
-    deltaneg = vec_min(tc0masked, deltaneg);                                                    \
-    delta = vec_min(tc0masked, delta);                                                          \
-    p0 = vec_subs(p0, deltaneg);                                                                \
-    q0 = vec_subs(q0, delta);                                                                   \
-    p0 = vec_adds(p0, delta);                                                                   \
-    q0 = vec_adds(q0, deltaneg);                                                                \
-}
-
-#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0)              \
-{                                                                                            \
-    ALIGNED_16(unsigned char temp[16]);                                                      \
-    register vec_u8_t alphavec;                                                              \
-    register vec_u8_t betavec;                                                               \
-    register vec_u8_t mask;                                                                  \
-    register vec_u8_t p1mask;                                                                \
-    register vec_u8_t q1mask;                                                                \
-    register vec_s8_t tc0vec;                                                                \
-    register vec_u8_t finaltc0;                                                              \
-    register vec_u8_t tc0masked;                                                             \
-    register vec_u8_t newp1;                                                                 \
-    register vec_u8_t newq1;                                                                 \
-                                                                                             \
-    temp[0] = alpha;                                                                         \
-    temp[1] = beta;                                                                          \
-    alphavec = vec_ld(0, temp);                                                              \
-    betavec = vec_splat(alphavec, 0x1);                                                      \
-    alphavec = vec_splat(alphavec, 0x0);                                                     \
-    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */            \
-                                                                                             \
-    M32( temp ) = M32( tc0 );                                                                \
-    tc0vec = vec_ld(0, (signed char*)temp);                                                  \
-    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
-    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
-    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
-    finaltc0 = vec_and((vec_u8_t)tc0vec, mask);                 /* tc = tc0 */               \
-                                                                                             \
-    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
-    p1mask = vec_and(p1mask, mask);                             /* if( |p2 - p0| < beta) */  \
-    tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec);                                           \
-    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
-    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
-    /*end if*/                                                                               \
-                                                                                             \
-    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
-    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
-    tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec);                                           \
-    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
-    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
-    /*end if*/                                                                               \
-                                                                                             \
-    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                            \
-    p1 = newp1;                                                                              \
-    q1 = newq1;                                                                              \
-}
-
-void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-    if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0 )
-    {
-        register vec_u8_t p2 = vec_ld(-3*stride, pix);
-        register vec_u8_t p1 = vec_ld(-2*stride, pix);
-        register vec_u8_t p0 = vec_ld(-1*stride, pix);
-        register vec_u8_t q0 = vec_ld(0, pix);
-        register vec_u8_t q1 = vec_ld(stride, pix);
-        register vec_u8_t q2 = vec_ld(2*stride, pix);
-        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
-        vec_st(p1, -2*stride, pix);
-        vec_st(p0, -1*stride, pix);
-        vec_st(q0, 0, pix);
-        vec_st(q1, stride, pix);
-    }
-}
-
-void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-{
-
-    register vec_u8_t line0, line1, line2, line3, line4, line5;
-    if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0 )
-        return;
-    PREP_LOAD;
-    vec_u8_t _pix_ = vec_lvsl(0, pix-3);
-    read_and_transpose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
-    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
-    transpose4x16(line1, line2, line3, line4);
-    write16x4(pix-2, stride, line1, line2, line3, line4);
-}
-#endif // !HIGH_BIT_DEPTH
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/mc.c b/android/src/main/libenc/jni/libx264/common/ppc/mc.c
deleted file mode 100755
index e169166..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/mc.c
+++ /dev/null
@@ -1,1202 +0,0 @@
-/*****************************************************************************
- * mc.c: ppc motion compensation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Eric Petit <eric.petit@lapsus.org>
- *          Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "mc.h"
-#include "ppccommon.h"
-
-#if !HIGH_BIT_DEPTH
-typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
-                         uint8_t *dst, intptr_t i_dst, int i_height );
-
-static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
-{
-    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
-           pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
-           pix[ 3*i_pix_next];
-}
-
-static inline int x264_tapfilter1( uint8_t *pix )
-{
-    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
-           pix[ 3];
-}
-
-static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst,  intptr_t i_dst,
-                                               uint8_t *src1, intptr_t i_src1,
-                                               uint8_t *src2, int i_height )
-{
-    for( int y = 0; y < i_height; y++ )
-    {
-        for( int x = 0; x < 4; x++ )
-            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
-        dst  += i_dst;
-        src1 += i_src1;
-        src2 += i_src1;
-    }
-}
-
-static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst,  intptr_t i_dst,
-                                               uint8_t *src1, intptr_t i_src1,
-                                               uint8_t *src2, int i_height )
-{
-    vec_u8_t src1v, src2v;
-    PREP_LOAD;
-    PREP_STORE8;
-    PREP_LOAD_SRC( src1 );
-    PREP_LOAD_SRC( src2 );
-
-    for( int y = 0; y < i_height; y++ )
-    {
-        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
-        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
-        src1v = vec_avg( src1v, src2v );
-        VEC_STORE8( src1v, dst );
-
-        dst  += i_dst;
-        src1 += i_src1;
-        src2 += i_src1;
-    }
-}
-
-static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst,  intptr_t i_dst,
-                                                uint8_t *src1, intptr_t i_src1,
-                                                uint8_t *src2, int i_height )
-{
-    vec_u8_t src1v, src2v;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src1 );
-    PREP_LOAD_SRC( src2 );
-
-    for( int y = 0; y < i_height; y++ )
-    {
-        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
-        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
-        src1v = vec_avg( src1v, src2v );
-        vec_st(src1v, 0, dst);
-
-        dst  += i_dst;
-        src1 += i_src1;
-        src2 += i_src1;
-    }
-}
-
-static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst,  intptr_t i_dst,
-                                                uint8_t *src1, intptr_t i_src1,
-                                                uint8_t *src2, int i_height )
-{
-    x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
-    x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
-}
-
-/* mc_copy: plain c */
-
-#define MC_COPY( name, a )                                \
-static void name( uint8_t *dst, intptr_t i_dst,           \
-                  uint8_t *src, intptr_t i_src, int i_height ) \
-{                                                         \
-    int y;                                                \
-    for( y = 0; y < i_height; y++ )                       \
-    {                                                     \
-        memcpy( dst, src, a );                            \
-        src += i_src;                                     \
-        dst += i_dst;                                     \
-    }                                                     \
-}
-MC_COPY( x264_mc_copy_w4_altivec,  4  )
-MC_COPY( x264_mc_copy_w8_altivec,  8  )
-
-static void x264_mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst,
-                                      uint8_t *src, intptr_t i_src, int i_height )
-{
-    vec_u8_t cpyV;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-
-    for( int y = 0; y < i_height; y++ )
-    {
-        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
-        vec_st(cpyV, 0, dst);
-
-        src += i_src;
-        dst += i_dst;
-    }
-}
-
-
-static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst,
-                                              uint8_t *src, intptr_t i_src, int i_height )
-{
-    for( int y = 0; y < i_height; ++y )
-    {
-        vec_u8_t cpyV = vec_ld( 0, src );
-        vec_st(cpyV, 0, dst);
-
-        src += i_src;
-        dst += i_dst;
-    }
-}
-
-
-static void mc_luma_altivec( uint8_t *dst,    intptr_t i_dst_stride,
-                             uint8_t *src[4], intptr_t i_src_stride,
-                             int mvx, int mvy,
-                             int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-
-        switch( i_width )
-        {
-            case 4:
-                x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
-                break;
-            case 8:
-                x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
-                break;
-            case 16:
-            default:
-                x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
-        }
-        if( weight->weightfn )
-            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
-    }
-    else if( weight->weightfn )
-        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
-    else
-    {
-        switch( i_width )
-        {
-            case 4:
-                x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
-                break;
-            case 8:
-                x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
-                break;
-            case 16:
-                x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
-                break;
-        }
-    }
-}
-
-
-
-static uint8_t *get_ref_altivec( uint8_t *dst,   intptr_t *i_dst_stride,
-                                 uint8_t *src[4], intptr_t i_src_stride,
-                                 int mvx, int mvy,
-                                 int i_width, int i_height, const x264_weight_t *weight )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        switch( i_width )
-        {
-            case 4:
-                x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
-                break;
-            case 8:
-                x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
-                break;
-            case 12:
-            case 16:
-            default:
-                x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
-                break;
-            case 20:
-                x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
-                break;
-        }
-        if( weight->weightfn )
-            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
-        return dst;
-    }
-    else if( weight->weightfn )
-    {
-        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
-        return dst;
-    }
-    else
-    {
-        *i_dst_stride = i_src_stride;
-        return src1;
-    }
-}
-
-static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
-                           uint8_t *src, intptr_t i_src_stride,
-                           int mvx, int mvy, int i_height )
-{
-    uint8_t *srcp;
-    int d8x = mvx&0x07;
-    int d8y = mvy&0x07;
-
-    int cA = (8-d8x)*(8-d8y);
-    int cB = d8x    *(8-d8y);
-    int cC = (8-d8x)*d8y;
-    int cD = d8x    *d8y;
-
-    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
-    srcp = &src[i_src_stride];
-
-    for( int y = 0; y < i_height; y++ )
-    {
-        dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6;
-        dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6;
-        dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6;
-        dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6;
-
-        src  += i_src_stride;
-        srcp += i_src_stride;
-        dstu += i_dst_stride;
-        dstv += i_dst_stride;
-    }
- }
-
-#ifdef WORDS_BIGENDIAN
-#define VSLD(a,b,n) vec_sld(a,b,n)
-#else
-#define VSLD(a,b,n) vec_sld(b,a,16-n)
-#endif
-
-static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
-                                   uint8_t *src, intptr_t i_src_stride,
-                                   int mvx, int mvy, int i_height )
-{
-    uint8_t *srcp;
-    int d8x = mvx & 0x07;
-    int d8y = mvy & 0x07;
-
-    ALIGNED_16( uint16_t coeff[4] );
-    coeff[0] = (8-d8x)*(8-d8y);
-    coeff[1] = d8x    *(8-d8y);
-    coeff[2] = (8-d8x)*d8y;
-    coeff[3] = d8x    *d8y;
-
-    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
-    srcp = &src[i_src_stride];
-
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
-    vec_u8_t    src2v_8, dstuv, dstvv;
-    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
-    vec_u16_t   shiftv, k32v;
-
-#ifdef WORDS_BIGENDIAN
-    static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
-    static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
-#else
-    static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12);
-    static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14);
-#endif
-
-    coeff0v = vec_ld( 0, coeff );
-    coeff3v = vec_splat( coeff0v, 3 );
-    coeff2v = vec_splat( coeff0v, 2 );
-    coeff1v = vec_splat( coeff0v, 1 );
-    coeff0v = vec_splat( coeff0v, 0 );
-    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
-    shiftv  = vec_splat_u16( 6 );
-
-    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
-    src2v_16 = vec_u8_to_u16( src2v_8 );
-    src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
-
-    for( int y = 0; y < i_height; y += 2 )
-    {
-        src0v_16 = src2v_16;
-        src1v_16 = src3v_16;
-        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
-        src2v_16 = vec_u8_to_u16( src2v_8 );
-        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
-
-        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
-        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
-        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
-        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
-
-        dstv16 = vec_sr( dstv16, shiftv );
-
-        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
-        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
-        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
-        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );
-
-        srcp += i_src_stride;
-        dstu += i_dst_stride;
-        dstv += i_dst_stride;
-
-        src0v_16 = src2v_16;
-        src1v_16 = src3v_16;
-        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
-        src2v_16 = vec_u8_to_u16( src2v_8 );
-        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
-
-        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
-        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
-        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
-        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
-
-        dstv16 = vec_sr( dstv16, shiftv );
-
-        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
-        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
-        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
-        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );
-
-        srcp += i_src_stride;
-        dstu += i_dst_stride;
-        dstv += i_dst_stride;
-    }
-}
-
-static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
-                                   uint8_t *src, intptr_t i_src_stride,
-                                   int mvx, int mvy, int i_height )
-{
-    uint8_t *srcp;
-    int d8x = mvx & 0x07;
-    int d8y = mvy & 0x07;
-
-    ALIGNED_16( uint16_t coeff[4] );
-    coeff[0] = (8-d8x)*(8-d8y);
-    coeff[1] = d8x    *(8-d8y);
-    coeff[2] = (8-d8x)*d8y;
-    coeff[3] = d8x    *d8y;
-
-    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
-    srcp = &src[i_src_stride];
-
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-    PREP_STORE8;
-    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
-    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
-    vec_u8_t    dstuv, dstvv;
-    vec_u16_t   src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
-    vec_u16_t   src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
-    vec_u16_t   shiftv, k32v;
-
-    coeff0v = vec_ld( 0, coeff );
-    coeff3v = vec_splat( coeff0v, 3 );
-    coeff2v = vec_splat( coeff0v, 2 );
-    coeff1v = vec_splat( coeff0v, 1 );
-    coeff0v = vec_splat( coeff0v, 0 );
-    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
-    shiftv  = vec_splat_u16( 6 );
-
-#ifdef WORDS_BIGENDIAN
-    static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
-    static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
-#else
-    static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1);
-    static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
-#endif
-
-    VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
-    VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
-    src3v_8 = VSLD( src2v_8, src3v_8, 2 );
-
-    for( int y = 0; y < i_height; y += 2 )
-    {
-        src0v_8 = src2v_8;
-        src1v_8 = src3v_8;
-        VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
-        VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
-
-        src3v_8 = VSLD( src2v_8, src3v_8, 2 );
-
-        src0v_16h = vec_u8_to_u16_h( src0v_8 );
-        src0v_16l = vec_u8_to_u16_l( src0v_8 );
-        src1v_16h = vec_u8_to_u16_h( src1v_8 );
-        src1v_16l = vec_u8_to_u16_l( src1v_8 );
-        src2v_16h = vec_u8_to_u16_h( src2v_8 );
-        src2v_16l = vec_u8_to_u16_l( src2v_8 );
-        src3v_16h = vec_u8_to_u16_h( src3v_8 );
-        src3v_16l = vec_u8_to_u16_l( src3v_8 );
-
-        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
-        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
-        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
-        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
-        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
-        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
-        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
-        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
-
-        dstv_16h = vec_sr( dstv_16h, shiftv );
-        dstv_16l = vec_sr( dstv_16l, shiftv );
-
-        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
-        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
-
-        VEC_STORE8( dstuv, dstu );
-        VEC_STORE8( dstvv, dstv );
-
-        srcp += i_src_stride;
-        dstu += i_dst_stride;
-        dstv += i_dst_stride;
-
-        src0v_8 = src2v_8;
-        src1v_8 = src3v_8;
-        VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
-        VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
-
-        src3v_8 = VSLD( src2v_8, src3v_8, 2 );
-
-        src0v_16h = vec_u8_to_u16_h( src0v_8 );
-        src0v_16l = vec_u8_to_u16_l( src0v_8 );
-        src1v_16h = vec_u8_to_u16_h( src1v_8 );
-        src1v_16l = vec_u8_to_u16_l( src1v_8 );
-        src2v_16h = vec_u8_to_u16_h( src2v_8 );
-        src2v_16l = vec_u8_to_u16_l( src2v_8 );
-        src3v_16h = vec_u8_to_u16_h( src3v_8 );
-        src3v_16l = vec_u8_to_u16_l( src3v_8 );
-
-        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
-        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
-        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
-        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
-        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
-        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
-        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
-        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
-
-        dstv_16h = vec_sr( dstv_16h, shiftv );
-        dstv_16l = vec_sr( dstv_16l, shiftv );
-
-        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
-        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
-
-        VEC_STORE8( dstuv, dstu );
-        VEC_STORE8( dstvv, dstv );
-
-        srcp += i_src_stride;
-        dstu += i_dst_stride;
-        dstv += i_dst_stride;
-    }
-}
-
-static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
-                               uint8_t *src, intptr_t i_src_stride,
-                               int mvx, int mvy, int i_width, int i_height )
-{
-    if( i_width == 8 )
-        mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride,
-                               mvx, mvy, i_height );
-    else if( i_width == 4 )
-        mc_chroma_altivec_4xh( dstu, dstv, i_dst_stride, src, i_src_stride,
-                               mvx, mvy, i_height );
-    else
-        mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
-                       mvx, mvy, i_height );
-}
-
-#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
-{                                                     \
-    t1v = vec_add( t1v, t6v );                        \
-    t2v = vec_add( t2v, t5v );                        \
-    t3v = vec_add( t3v, t4v );                        \
-                                                      \
-    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
-    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
-    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
-    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
-    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
-    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
-}
-
-#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
-{                                                     \
-    t1v = vec_add( t1v, t6v );                        \
-    t2v = vec_add( t2v, t5v );                        \
-    t3v = vec_add( t3v, t4v );                        \
-                                                      \
-    t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
-    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
-    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
-    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
-    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
-    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
-}
-
-#define HPEL_FILTER_HORIZONTAL()                             \
-{                                                            \
-    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
-    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
-                                                             \
-    src2v = VSLD( src1v, src6v,  1 );                        \
-    src3v = VSLD( src1v, src6v,  2 );                        \
-    src4v = VSLD( src1v, src6v,  3 );                        \
-    src5v = VSLD( src1v, src6v,  4 );                        \
-    src6v = VSLD( src1v, src6v,  5 );                        \
-                                                             \
-    temp1v = vec_u8_to_s16_h( src1v );                       \
-    temp2v = vec_u8_to_s16_h( src2v );                       \
-    temp3v = vec_u8_to_s16_h( src3v );                       \
-    temp4v = vec_u8_to_s16_h( src4v );                       \
-    temp5v = vec_u8_to_s16_h( src5v );                       \
-    temp6v = vec_u8_to_s16_h( src6v );                       \
-                                                             \
-    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
-                   temp4v, temp5v, temp6v );                 \
-                                                             \
-    dest1v = vec_add( temp1v, sixteenv );                    \
-    dest1v = vec_sra( dest1v, fivev );                       \
-                                                             \
-    temp1v = vec_u8_to_s16_l( src1v );                       \
-    temp2v = vec_u8_to_s16_l( src2v );                       \
-    temp3v = vec_u8_to_s16_l( src3v );                       \
-    temp4v = vec_u8_to_s16_l( src4v );                       \
-    temp5v = vec_u8_to_s16_l( src5v );                       \
-    temp6v = vec_u8_to_s16_l( src6v );                       \
-                                                             \
-    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
-                   temp4v, temp5v, temp6v );                 \
-                                                             \
-    dest2v = vec_add( temp1v, sixteenv );                    \
-    dest2v = vec_sra( dest2v, fivev );                       \
-                                                             \
-    destv = vec_packsu( dest1v, dest2v );                    \
-                                                             \
-    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth );         \
-}
-
-#define HPEL_FILTER_VERTICAL()                                    \
-{                                                                 \
-    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
-    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
-    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
-    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
-    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
-    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
-                                                                  \
-    temp1v = vec_u8_to_s16_h( src1v );                            \
-    temp2v = vec_u8_to_s16_h( src2v );                            \
-    temp3v = vec_u8_to_s16_h( src3v );                            \
-    temp4v = vec_u8_to_s16_h( src4v );                            \
-    temp5v = vec_u8_to_s16_h( src5v );                            \
-    temp6v = vec_u8_to_s16_h( src6v );                            \
-                                                                  \
-    HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
-                   temp4v, temp5v, temp6v );                      \
-                                                                  \
-    dest1v = vec_add( temp1v, sixteenv );                         \
-    dest1v = vec_sra( dest1v, fivev );                            \
-                                                                  \
-    temp4v = vec_u8_to_s16_l( src1v );                            \
-    temp5v = vec_u8_to_s16_l( src2v );                            \
-    temp6v = vec_u8_to_s16_l( src3v );                            \
-    temp7v = vec_u8_to_s16_l( src4v );                            \
-    temp8v = vec_u8_to_s16_l( src5v );                            \
-    temp9v = vec_u8_to_s16_l( src6v );                            \
-                                                                  \
-    HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
-                   temp7v, temp8v, temp9v );                      \
-                                                                  \
-    dest2v = vec_add( temp4v, sixteenv );                         \
-    dest2v = vec_sra( dest2v, fivev );                            \
-                                                                  \
-    destv = vec_packsu( dest1v, dest2v );                         \
-                                                                  \
-    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth );              \
-}
-
-#define HPEL_FILTER_CENTRAL()                           \
-{                                                       \
-    temp1v = VSLD( tempav, tempbv, 12 );                \
-    temp2v = VSLD( tempav, tempbv, 14 );                \
-    temp3v = tempbv;                                    \
-    temp4v = VSLD( tempbv, tempcv,  2 );                \
-    temp5v = VSLD( tempbv, tempcv,  4 );                \
-    temp6v = VSLD( tempbv, tempcv,  6 );                \
-                                                        \
-    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
-                   temp4v, temp5v, temp6v );            \
-                                                        \
-    dest1v = vec_add( temp1v, thirtytwov );             \
-    dest1v = vec_sra( dest1v, sixv );                   \
-                                                        \
-    temp1v = VSLD( tempbv, tempcv, 12 );                \
-    temp2v = VSLD( tempbv, tempcv, 14 );                \
-    temp3v = tempcv;                                    \
-    temp4v = VSLD( tempcv, tempdv,  2 );                \
-    temp5v = VSLD( tempcv, tempdv,  4 );                \
-    temp6v = VSLD( tempcv, tempdv,  6 );                \
-                                                        \
-    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
-                   temp4v, temp5v, temp6v );            \
-                                                        \
-    dest2v = vec_add( temp1v, thirtytwov );             \
-    dest2v = vec_sra( dest2v, sixv );                   \
-                                                        \
-    destv = vec_packsu( dest1v, dest2v );               \
-                                                        \
-    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
-}
-
-void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-                               intptr_t i_stride, int i_width, int i_height, int16_t *buf )
-{
-    vec_u8_t destv;
-    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
-    vec_s16_t dest1v, dest2v;
-    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
-    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
-
-    PREP_LOAD;
-    PREP_LOAD_SRC( src);
-    PREP_STORE16;
-    PREP_STORE16_DST( dsth );
-    LOAD_ZERO;
-
-    vec_u16_t twov, fourv, fivev, sixv;
-    vec_s16_t sixteenv, thirtytwov;
-    vec_u16_u temp_u;
-
-    temp_u.s[0]=2;
-    twov = vec_splat( temp_u.v, 0 );
-    temp_u.s[0]=4;
-    fourv = vec_splat( temp_u.v, 0 );
-    temp_u.s[0]=5;
-    fivev = vec_splat( temp_u.v, 0 );
-    temp_u.s[0]=6;
-    sixv = vec_splat( temp_u.v, 0 );
-    temp_u.s[0]=16;
-    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
-    temp_u.s[0]=32;
-    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );
-
-    for( int y = 0; y < i_height; y++ )
-    {
-        int x = 0;
-
-        /* horizontal_filter */
-        HPEL_FILTER_HORIZONTAL();
-
-        /* vertical_filter */
-        HPEL_FILTER_VERTICAL();
-
-        /* central_filter */
-        tempav = tempcv;
-        tempbv = tempdv;
-        tempcv = vec_splat( temp1v, 0 ); /* first only */
-        tempdv = temp1v;
-        tempev = temp4v;
-
-        for( x = 16; x < i_width; x+=16 )
-        {
-            /* horizontal_filter */
-            HPEL_FILTER_HORIZONTAL();
-
-            /* vertical_filter */
-            HPEL_FILTER_VERTICAL();
-
-            /* central_filter */
-            tempav = tempcv;
-            tempbv = tempdv;
-            tempcv = tempev;
-            tempdv = temp1v;
-            tempev = temp4v;
-
-            HPEL_FILTER_CENTRAL();
-        }
-
-        /* Partial vertical filter */
-        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
-        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
-        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
-        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
-        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
-        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
-
-        temp1v = vec_u8_to_s16_h( src1v );
-        temp2v = vec_u8_to_s16_h( src2v );
-        temp3v = vec_u8_to_s16_h( src3v );
-        temp4v = vec_u8_to_s16_h( src4v );
-        temp5v = vec_u8_to_s16_h( src5v );
-        temp6v = vec_u8_to_s16_h( src6v );
-
-        HPEL_FILTER_1( temp1v, temp2v, temp3v, temp4v, temp5v, temp6v );
-
-        /* central_filter */
-        tempav = tempcv;
-        tempbv = tempdv;
-        tempcv = tempev;
-        tempdv = temp1v;
-        /* tempev is not used */
-
-        HPEL_FILTER_CENTRAL();
-    }
-}
-
-static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-                                            intptr_t src_stride, intptr_t dst_stride, int width, int height )
-{
-    int w = width >> 4;
-    int end = (width & 15);
-    vec_u8_t src0v, src1v, src2v;
-    vec_u8_t lv, hv, src1p1v;
-    vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
-    static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );
-#ifndef WORDS_BIGENDIAN
-    static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F );
-#endif
-
-    for( int y = 0; y < height; y++ )
-    {
-        int x;
-        uint8_t *src1 = src0+src_stride;
-        uint8_t *src2 = src1+src_stride;
-
-        src0v = vec_ld(0, src0);
-        src1v = vec_ld(0, src1);
-        src2v = vec_ld(0, src2);
-
-        avg0v = vec_avg(src0v, src1v);
-        avg1v = vec_avg(src1v, src2v);
-
-        for( x = 0; x < w; x++ )
-        {
-            lv = vec_ld(16*(x*2+1), src0);
-            src1v = vec_ld(16*(x*2+1), src1);
-            avghv = vec_avg(lv, src1v);
-
-            lv = vec_ld(16*(x*2+2), src0);
-            src1p1v = vec_ld(16*(x*2+2), src1);
-            avghp1v = vec_avg(lv, src1p1v);
-
-            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
-            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
-
-            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
-#ifdef WORDS_BIGENDIAN
-            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);
-#else
-            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth);
-#endif
-
-            avg0v = avghp1v;
-
-            hv = vec_ld(16*(x*2+1), src2);
-            avghv = vec_avg(src1v, hv);
-
-            hv = vec_ld(16*(x*2+2), src2);
-            avghp1v = vec_avg(src1p1v, hv);
-
-            avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v);
-            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
-
-            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
-#ifdef WORDS_BIGENDIAN
-            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);
-#else
-            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc);
-#endif
-
-            avg1v = avghp1v;
-
-        }
-        if( end )
-        {
-            lv = vec_ld(16*(x*2+1), src0);
-            src1v = vec_ld(16*(x*2+1), src1);
-            avghv = vec_avg(lv, src1v);
-
-            lv = vec_ld(16*(x*2+1), src2);
-            avghp1v = vec_avg(src1v, lv);
-
-            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
-            avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v);
-
-            lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
-#ifdef WORDS_BIGENDIAN
-            hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);
-#else
-            hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1);
-#endif
-
-            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0);
-            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
-            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dsth);
-            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dsth);
-
-            lv = vec_sld(lv, lv, 8);
-            hv = vec_sld(hv, hv, 8);
-
-            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dstv);
-            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dstv);
-            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dstc);
-            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dstc);
-        }
-
-        src0 += src_stride*2;
-        dst0 += dst_stride;
-        dsth += dst_stride;
-        dstv += dst_stride;
-        dstc += dst_stride;
-    }
-}
-
-static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
-                                  const x264_weight_t *weight, int i_height )
-{
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-    vec_u8_t srcv;
-    vec_s16_t weightv;
-    vec_s16_t scalev, offsetv, denomv, roundv;
-    vec_s16_u loadv;
-
-    int denom = weight->i_denom;
-
-    loadv.s[0] = weight->i_scale;
-    scalev = vec_splat( loadv.v, 0 );
-
-    loadv.s[0] = weight->i_offset;
-    offsetv = vec_splat( loadv.v, 0 );
-
-    if( denom >= 1 )
-    {
-        loadv.s[0] = denom;
-        denomv = vec_splat( loadv.v, 0 );
-
-        loadv.s[0] = 1<<(denom - 1);
-        roundv = vec_splat( loadv.v, 0 );
-
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 2, vec_u8_t, src );
-            weightv = vec_u8_to_s16( srcv );
-
-            weightv = vec_mladd( weightv, scalev, roundv );
-            weightv = vec_sra( weightv, (vec_u16_t)denomv );
-            weightv = vec_add( weightv, offsetv );
-
-            srcv = vec_packsu( weightv, zero_s16v );
-            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
-        }
-    }
-    else
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 2, vec_u8_t, src );
-            weightv = vec_u8_to_s16( srcv );
-
-            weightv = vec_mladd( weightv, scalev, offsetv );
-
-            srcv = vec_packsu( weightv, zero_s16v );
-            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
-        }
-    }
-}
-static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
-                                  const x264_weight_t *weight, int i_height )
-{
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-    vec_u8_t srcv;
-    vec_s16_t weightv;
-    vec_s16_t scalev, offsetv, denomv, roundv;
-    vec_s16_u loadv;
-
-    int denom = weight->i_denom;
-
-    loadv.s[0] = weight->i_scale;
-    scalev = vec_splat( loadv.v, 0 );
-
-    loadv.s[0] = weight->i_offset;
-    offsetv = vec_splat( loadv.v, 0 );
-
-    if( denom >= 1 )
-    {
-        loadv.s[0] = denom;
-        denomv = vec_splat( loadv.v, 0 );
-
-        loadv.s[0] = 1<<(denom - 1);
-        roundv = vec_splat( loadv.v, 0 );
-
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 4, vec_u8_t, src );
-            weightv = vec_u8_to_s16( srcv );
-
-            weightv = vec_mladd( weightv, scalev, roundv );
-            weightv = vec_sra( weightv, (vec_u16_t)denomv );
-            weightv = vec_add( weightv, offsetv );
-
-            srcv = vec_packsu( weightv, zero_s16v );
-            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
-        }
-    }
-    else
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 4, vec_u8_t, src );
-            weightv = vec_u8_to_s16( srcv );
-
-            weightv = vec_mladd( weightv, scalev, offsetv );
-
-            srcv = vec_packsu( weightv, zero_s16v );
-            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
-        }
-    }
-}
-static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
-                                  const x264_weight_t *weight, int i_height )
-{
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-    PREP_STORE8;
-    vec_u8_t srcv;
-    vec_s16_t weightv;
-    vec_s16_t scalev, offsetv, denomv, roundv;
-    vec_s16_u loadv;
-
-    int denom = weight->i_denom;
-
-    loadv.s[0] = weight->i_scale;
-    scalev = vec_splat( loadv.v, 0 );
-
-    loadv.s[0] = weight->i_offset;
-    offsetv = vec_splat( loadv.v, 0 );
-
-    if( denom >= 1 )
-    {
-        loadv.s[0] = denom;
-        denomv = vec_splat( loadv.v, 0 );
-
-        loadv.s[0] = 1<<(denom - 1);
-        roundv = vec_splat( loadv.v, 0 );
-
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 8, vec_u8_t, src );
-            weightv = vec_u8_to_s16( srcv );
-
-            weightv = vec_mladd( weightv, scalev, roundv );
-            weightv = vec_sra( weightv, (vec_u16_t)denomv );
-            weightv = vec_add( weightv, offsetv );
-
-            srcv = vec_packsu( weightv, zero_s16v );
-            VEC_STORE8( srcv, dst );
-        }
-    }
-    else
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 8, vec_u8_t, src );
-            weightv = vec_u8_to_s16( srcv );
-
-            weightv = vec_mladd( weightv, scalev, offsetv );
-
-            srcv = vec_packsu( weightv, zero_s16v );
-            VEC_STORE8( srcv, dst );
-        }
-    }
-}
-static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
-                                   const x264_weight_t *weight, int i_height )
-{
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( src );
-    vec_u8_t srcv;
-    vec_s16_t weight_lv, weight_hv;
-    vec_s16_t scalev, offsetv, denomv, roundv;
-    vec_s16_u loadv;
-
-    int denom = weight->i_denom;
-
-    loadv.s[0] = weight->i_scale;
-    scalev = vec_splat( loadv.v, 0 );
-
-    loadv.s[0] = weight->i_offset;
-    offsetv = vec_splat( loadv.v, 0 );
-
-    if( denom >= 1 )
-    {
-        loadv.s[0] = denom;
-        denomv = vec_splat( loadv.v, 0 );
-
-        loadv.s[0] = 1<<(denom - 1);
-        roundv = vec_splat( loadv.v, 0 );
-
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 16, vec_u8_t, src );
-            weight_hv = vec_u8_to_s16_h( srcv );
-            weight_lv = vec_u8_to_s16_l( srcv );
-
-            weight_hv = vec_mladd( weight_hv, scalev, roundv );
-            weight_lv = vec_mladd( weight_lv, scalev, roundv );
-            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
-            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
-            weight_hv = vec_add( weight_hv, offsetv );
-            weight_lv = vec_add( weight_lv, offsetv );
-
-            srcv = vec_packsu( weight_hv, weight_lv );
-            vec_st( srcv, 0, dst );
-        }
-    }
-    else
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            VEC_LOAD( src, srcv, 16, vec_u8_t, src );
-            weight_hv = vec_u8_to_s16_h( srcv );
-            weight_lv = vec_u8_to_s16_l( srcv );
-
-            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
-            weight_lv = vec_mladd( weight_lv, scalev, offsetv );
-
-            srcv = vec_packsu( weight_hv, weight_lv );
-            vec_st( srcv, 0, dst );
-        }
-    }
-}
-static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
-                                   const x264_weight_t *weight, int i_height )
-{
-    LOAD_ZERO;
-    PREP_LOAD_SRC( src );
-    vec_u8_t src_1v, src_2v, src_3v;
-    vec_s16_t weight_lv, weight_hv, weight_3v;
-    vec_s16_t scalev, offsetv, denomv, roundv;
-    vec_s16_u loadv;
-
-    int denom = weight->i_denom;
-
-    loadv.s[0] = weight->i_scale;
-    scalev = vec_splat( loadv.v, 0 );
-
-    loadv.s[0] = weight->i_offset;
-    offsetv = vec_splat( loadv.v, 0 );
-
-    if( denom >= 1 )
-    {
-        loadv.s[0] = denom;
-        denomv = vec_splat( loadv.v, 0 );
-
-        loadv.s[0] = 1<<(denom - 1);
-        roundv = vec_splat( loadv.v, 0 );
-
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            src_1v = vec_ld( 0,  src );
-            src_2v = vec_ld( 16, src );
-            src_3v = vec_ld( 19, src );
-            src_1v = vec_perm( src_1v, src_2v, _src_ );
-            src_3v = vec_perm( src_2v, src_3v, _src_ );
-            weight_hv = vec_u8_to_s16_h( src_1v );
-            weight_lv = vec_u8_to_s16_l( src_1v );
-            weight_3v = vec_u8_to_s16_h( src_3v );
-
-            weight_hv = vec_mladd( weight_hv, scalev, roundv );
-            weight_lv = vec_mladd( weight_lv, scalev, roundv );
-            weight_3v = vec_mladd( weight_3v, scalev, roundv );
-            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
-            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
-            weight_3v = vec_sra( weight_3v, (vec_u16_t)denomv );
-            weight_hv = vec_add( weight_hv, offsetv );
-            weight_lv = vec_add( weight_lv, offsetv );
-            weight_3v = vec_add( weight_3v, offsetv );
-
-            src_1v = vec_packsu( weight_hv, weight_lv );
-            src_3v = vec_packsu( weight_3v, zero_s16v );
-            vec_st( src_1v, 0, dst );
-            vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
-        }
-    }
-    else
-    {
-        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
-        {
-            src_1v = vec_ld( 0,  src );
-            src_2v = vec_ld( 16, src );
-            src_3v = vec_ld( 19, src );
-            src_1v = vec_perm( src_1v, src_2v, _src_ );
-            src_3v = vec_perm( src_2v, src_3v, _src_ );
-            weight_hv = vec_u8_to_s16_h( src_1v );
-            weight_lv = vec_u8_to_s16_l( src_1v );
-            weight_3v = vec_u8_to_s16_h( src_3v );
-
-            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
-            weight_lv = vec_mladd( weight_lv, scalev, offsetv );
-            weight_3v = vec_mladd( weight_3v, scalev, offsetv );
-
-            src_1v = vec_packsu( weight_hv, weight_lv );
-            src_3v = vec_packsu( weight_3v, zero_s16v );
-            vec_st( src_1v, 0, dst );
-            vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
-        }
-    }
-}
-
-static weight_fn_t x264_mc_weight_wtab_altivec[6] =
-{
-    mc_weight_w2_altivec,
-    mc_weight_w4_altivec,
-    mc_weight_w8_altivec,
-    mc_weight_w16_altivec,
-    mc_weight_w16_altivec,
-    mc_weight_w20_altivec,
-};
-
-#endif // !HIGH_BIT_DEPTH
-
-void x264_mc_altivec_init( x264_mc_functions_t *pf )
-{
-#if !HIGH_BIT_DEPTH
-    pf->mc_luma   = mc_luma_altivec;
-    pf->get_ref   = get_ref_altivec;
-    pf->mc_chroma = mc_chroma_altivec;
-
-    pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec;
-    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_altivec;
-
-    pf->hpel_filter = x264_hpel_filter_altivec;
-    pf->frame_init_lowres_core = frame_init_lowres_core_altivec;
-
-    pf->weight = x264_mc_weight_wtab_altivec;
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/mc.h b/android/src/main/libenc/jni/libx264/common/ppc/mc.h
deleted file mode 100755
index 923c66f..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/mc.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*****************************************************************************
- * mc.h: ppc motion compensation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Eric Petit <eric.petit@lapsus.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PPC_MC_H
-#define X264_PPC_MC_H
-
-void x264_mc_altivec_init( x264_mc_functions_t *pf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/pixel.c b/android/src/main/libenc/jni/libx264/common/ppc/pixel.c
deleted file mode 100755
index 5ace725..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/pixel.c
+++ /dev/null
@@ -1,2117 +0,0 @@
-/*****************************************************************************
- * pixel.c: ppc pixel metrics
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Eric Petit <eric.petit@lapsus.org>
- *          Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "ppccommon.h"
-
-#if !HIGH_BIT_DEPTH
-/***********************************************************************
- * SAD routines
- **********************************************************************/
-
-#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
-static int name( uint8_t *pix1, intptr_t i_pix1,       \
-                 uint8_t *pix2, intptr_t i_pix2 )      \
-{                                                      \
-    ALIGNED_16( int sum );                             \
-                                                       \
-    LOAD_ZERO;                                         \
-    PREP_LOAD;                                         \
-    vec_u8_t  pix1v, pix2v;                            \
-    vec_s32_t sumv = zero_s32v;                        \
-    for( int y = 0; y < ly; y++ )                      \
-    {                                                  \
-        VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t );       \
-        VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t );       \
-        sumv = (vec_s32_t) vec_sum4s(                  \
-                   vec_sub( vec_max( pix1v, pix2v ),   \
-                            vec_min( pix1v, pix2v ) ), \
-                   (vec_u32_t) sumv );                 \
-        pix1 += i_pix1;                                \
-        pix2 += i_pix2;                                \
-    }                                                  \
-    sumv = vec_sum##a( sumv, zero_s32v );              \
-    sumv = vec_splat( sumv, b );                       \
-    vec_ste( sumv, 0, &sum );                          \
-    return sum;                                        \
-}
-
-PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
-PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
-PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
-PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
-
-
-
-/***********************************************************************
- * SATD routines
- **********************************************************************/
-
-/***********************************************************************
- * VEC_HADAMAR
- ***********************************************************************
- * b[0] = a[0] + a[1] + a[2] + a[3]
- * b[1] = a[0] + a[1] - a[2] - a[3]
- * b[2] = a[0] - a[1] - a[2] + a[3]
- * b[3] = a[0] - a[1] + a[2] - a[3]
- **********************************************************************/
-#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
-    b2 = vec_add( a0, a1 ); \
-    b3 = vec_add( a2, a3 ); \
-    a0 = vec_sub( a0, a1 ); \
-    a2 = vec_sub( a2, a3 ); \
-    b0 = vec_add( b2, b3 ); \
-    b1 = vec_sub( b2, b3 ); \
-    b2 = vec_sub( a0, a2 ); \
-    b3 = vec_add( a0, a2 )
-
-/***********************************************************************
- * VEC_ABS
- ***********************************************************************
- * a: s16v
- *
- * a = abs(a)
- *
- * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
- * actually also calls vec_splat(0), but we already have a null vector.
- **********************************************************************/
-#define VEC_ABS(a)                            \
-    a = vec_max( a, vec_sub( zero_s16v, a ) );
-
-#define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) )
-
-/***********************************************************************
- * VEC_ADD_ABS
- ***********************************************************************
- * a:    s16v
- * b, c: s32v
- *
- * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + [bi]
- **********************************************************************/
-#define VEC_ADD_ABS(a,b,c) \
-    VEC_ABS( a );          \
-    c = vec_sum4s( a, b )
-
-/***********************************************************************
- * SATD 4x4
- **********************************************************************/
-static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                   uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    PREP_DIFF;
-    PREP_LOAD_SRC( pix1 );
-    vec_s16_t diff0v, diff1v, diff2v, diff3v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v;
-    vec_s32_t satdv;
-
-    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
-    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
-
-    /* Hadamar H */
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-
-    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
-                     diff0v, diff1v, diff2v, diff3v );
-    /* Hadamar V */
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-/***********************************************************************
- * SATD 4x8
- **********************************************************************/
-static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                   uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    PREP_DIFF;
-    vec_s16_t diff0v, diff1v, diff2v, diff3v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v;
-    vec_s32_t satdv;
-
-    PREP_LOAD_SRC( pix1 );
-    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
-    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
-                     diff0v, diff1v, diff2v, diff3v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
-                     diff0v, diff1v, diff2v, diff3v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-/***********************************************************************
- * SATD 8x4
- **********************************************************************/
-static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                   uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    PREP_DIFF;
-    vec_s16_t diff0v, diff1v, diff2v, diff3v,
-              diff4v, diff5v, diff6v, diff7v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v,
-              temp4v, temp5v, temp6v, temp7v;
-    vec_s32_t satdv;
-
-
-    PREP_LOAD_SRC( pix1 );
-    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
-    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
-
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    /* This causes warnings because temp4v...temp7v haven't be set,
-       but we don't care */
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diff0v, diff1v, diff2v, diff3v,
-                     diff4v, diff5v, diff6v, diff7v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-/***********************************************************************
- * SATD 8x8
- **********************************************************************/
-static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                   uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    PREP_DIFF;
-    vec_s16_t diff0v, diff1v, diff2v, diff3v,
-              diff4v, diff5v, diff6v, diff7v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v,
-              temp4v, temp5v, temp6v, temp7v;
-    vec_s32_t satdv;
-
-    vec_u8_t _offset1_1v_ = vec_lvsl(0, pix1);
-    vec_u8_t _offset1_2v_ = vec_lvsl(0, pix1 + i_pix1);
-    vec_u8_t _offset2_1v_ = vec_lvsl(0, pix2);
-    vec_u8_t _offset2_2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1_1v, offset2_1v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset1_2v, offset2_2v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1_1v, offset2_1v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset1_2v, offset2_2v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1_1v, offset2_1v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset1_2v, offset2_2v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1_1v, offset2_1v );
-    VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset1_2v, offset2_2v );
-
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diff0v, diff1v, diff2v, diff3v,
-                     diff4v, diff5v, diff6v, diff7v );
-
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-/***********************************************************************
- * SATD 8x16
- **********************************************************************/
-static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                    uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    PREP_DIFF;
-    vec_s16_t diff0v, diff1v, diff2v, diff3v,
-              diff4v, diff5v, diff6v, diff7v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v,
-              temp4v, temp5v, temp6v, temp7v;
-    vec_s32_t satdv;
-
-    PREP_LOAD_SRC( pix1 );
-    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
-    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v , offset1v);
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diff0v, diff1v, diff2v, diff3v,
-                     diff4v, diff5v, diff6v, diff7v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diff0v, diff1v, diff2v, diff3v,
-                     diff4v, diff5v, diff6v, diff7v );
-    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-/***********************************************************************
- * SATD 16x8
- **********************************************************************/
-static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                    uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    LOAD_ZERO;
-    PREP_LOAD;
-    PREP_LOAD_SRC( pix2 );
-    vec_s32_t satdv;
-    vec_s16_t pix1v, pix2v;
-    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
-              diffh4v, diffh5v, diffh6v, diffh7v;
-    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
-              diffl4v, diffl5v, diffl6v, diffl7v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v,
-              temp4v, temp5v, temp6v, temp7v;
-
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
-
-    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diffh0v, diffh1v, diffh2v, diffh3v,
-                     diffh4v, diffh5v, diffh6v, diffh7v );
-
-    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diffl0v, diffl1v, diffl2v, diffl3v,
-                     diffl4v, diffl5v, diffl6v, diffl7v );
-
-    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
-                 temp4v, temp5v, temp6v, temp7v );
-
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-/***********************************************************************
- * SATD 16x16
- **********************************************************************/
-static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                     uint8_t *pix2, intptr_t i_pix2 )
-{
-    ALIGNED_16( int i_satd );
-
-    LOAD_ZERO;
-    PREP_LOAD;
-    vec_s32_t satdv;
-    vec_s16_t pix1v, pix2v;
-    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
-              diffh4v, diffh5v, diffh6v, diffh7v;
-    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
-              diffl4v, diffl5v, diffl6v, diffl7v;
-    vec_s16_t temp0v, temp1v, temp2v, temp3v,
-              temp4v, temp5v, temp6v, temp7v;
-    PREP_LOAD_SRC( pix2 );
-
-
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
-    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diffh0v, diffh1v, diffh2v, diffh3v,
-                     diffh4v, diffh5v, diffh6v, diffh7v );
-    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diffl0v, diffl1v, diffl2v, diffl3v,
-                     diffl4v, diffl5v, diffl6v, diffl7v );
-    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
-    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
-    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diffh0v, diffh1v, diffh2v, diffh3v,
-                     diffh4v, diffh5v, diffh6v, diffh7v );
-    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
-                     temp4v, temp5v, temp6v, temp7v,
-                     diffl0v, diffl1v, diffl2v, diffl3v,
-                     diffl4v, diffl5v, diffl6v, diffl7v );
-    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
-                 temp0v, temp1v, temp2v, temp3v );
-    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
-                 temp4v, temp5v, temp6v, temp7v );
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
-    VEC_ADD_ABS( temp4v, satdv,     satdv );
-    VEC_ADD_ABS( temp5v, satdv,     satdv );
-    VEC_ADD_ABS( temp6v, satdv,     satdv );
-    VEC_ADD_ABS( temp7v, satdv,     satdv );
-
-    satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
-
-    return i_satd >> 1;
-}
-
-
-
-/***********************************************************************
-* Interleaved SAD routines
-**********************************************************************/
-
-static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
-                                        uint8_t *pix0, uint8_t *pix1,
-                                        uint8_t *pix2, uint8_t *pix3,
-                                        intptr_t i_stride, int scores[4] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-    ALIGNED_16( int sum3 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
-    //vec_u8_t perm0v, perm1v, perm2v, perm3v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
-    vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-    sum3v = vec_splat_s32(0);
-
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-    perm3vA = vec_lvsl(0, pix3);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-    perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-    for( int y = 0; y < 8; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-    }
-
-    sum0v = vec_sums( sum0v, zero_s32v );
-    sum1v = vec_sums( sum1v, zero_s32v );
-    sum2v = vec_sums( sum2v, zero_s32v );
-    sum3v = vec_sums( sum3v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-    sum3v = vec_splat( sum3v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-    vec_ste( sum3v, 0, &sum3);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-    scores[3] = sum3;
-}
-
-static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
-                                        uint8_t *pix1, uint8_t *pix2,
-                                        intptr_t i_stride, int scores[3] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv; // temporary load vectors
-    vec_u8_t fencv, pix0v, pix1v, pix2v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
-    vec_s32_t sum0v, sum1v, sum2v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-
-    for( int y = 0; y < 8; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-    }
-
-    sum0v = vec_sums( sum0v, zero_s32v );
-    sum1v = vec_sums( sum1v, zero_s32v );
-    sum2v = vec_sums( sum2v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-}
-
-static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
-                                       uint8_t *pix3, intptr_t i_stride, int scores[4] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-    ALIGNED_16( int sum3 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
-    vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-    sum3v = vec_splat_s32(0);
-
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-    perm3vA = vec_lvsl(0, pix3);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-    perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-    for( int y = 0; y < 4; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-    }
-
-    sum0v = vec_sums( sum0v, zero_s32v );
-    sum1v = vec_sums( sum1v, zero_s32v );
-    sum2v = vec_sums( sum2v, zero_s32v );
-    sum3v = vec_sums( sum3v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-    sum3v = vec_splat( sum3v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-    vec_ste( sum3v, 0, &sum3);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-    scores[3] = sum3;
-}
-
-static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
-                                       uint8_t *pix1, uint8_t *pix2,
-                                       intptr_t i_stride, int scores[3] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
-    vec_s32_t sum0v, sum1v, sum2v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-
-    for( int y = 0; y < 4; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        fencv = vec_ld(0, fenc);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-    }
-
-    sum0v = vec_sums( sum0v, zero_s32v );
-    sum1v = vec_sums( sum1v, zero_s32v );
-    sum2v = vec_sums( sum2v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-}
-
-
-static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
-                                       uint8_t *pix0, uint8_t *pix1,
-                                       uint8_t *pix2, uint8_t *pix3,
-                                       intptr_t i_stride, int scores[4] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-    ALIGNED_16( int sum3 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
-    vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-    sum3v = vec_splat_s32(0);
-
-    permEncv = vec_lvsl(0, fenc);
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-    perm3vA = vec_lvsl(0, pix3);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-    perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-    for( int y = 0; y < 8; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-    }
-
-    sum0v = vec_sum2s( sum0v, zero_s32v );
-    sum1v = vec_sum2s( sum1v, zero_s32v );
-    sum2v = vec_sum2s( sum2v, zero_s32v );
-    sum3v = vec_sum2s( sum3v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 1 );
-    sum1v = vec_splat( sum1v, 1 );
-    sum2v = vec_splat( sum2v, 1 );
-    sum3v = vec_splat( sum3v, 1 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-    vec_ste( sum3v, 0, &sum3);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-    scores[3] = sum3;
-}
-
-static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
-                                       uint8_t *pix1, uint8_t *pix2,
-                                       intptr_t i_stride, int scores[3] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB,permEncv;
-
-    vec_s32_t sum0v, sum1v, sum2v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-
-    permEncv = vec_lvsl(0, fenc);
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-
-    for( int y = 0; y < 8; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-    }
-
-    sum0v = vec_sum2s( sum0v, zero_s32v );
-    sum1v = vec_sum2s( sum1v, zero_s32v );
-    sum2v = vec_sum2s( sum2v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 1 );
-    sum1v = vec_splat( sum1v, 1 );
-    sum2v = vec_splat( sum2v, 1 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-}
-
-static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
-                                      uint8_t *pix0, uint8_t *pix1,
-                                      uint8_t *pix2, uint8_t *pix3,
-                                      intptr_t i_stride, int scores[4] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-    ALIGNED_16( int sum3 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
-    vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-    sum3v = vec_splat_s32(0);
-
-    permEncv = vec_lvsl(0, fenc);
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-    perm3vA = vec_lvsl(0, pix3);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-    perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-    for( int y = 0; y < 4; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        temp_lv = vec_ld(0, pix3);
-        temp_hv = vec_ld(16, pix3);
-        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
-        pix3 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-    }
-
-    sum0v = vec_sum2s( sum0v, zero_s32v );
-    sum1v = vec_sum2s( sum1v, zero_s32v );
-    sum2v = vec_sum2s( sum2v, zero_s32v );
-    sum3v = vec_sum2s( sum3v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 1 );
-    sum1v = vec_splat( sum1v, 1 );
-    sum2v = vec_splat( sum2v, 1 );
-    sum3v = vec_splat( sum3v, 1 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-    vec_ste( sum3v, 0, &sum3);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-    scores[3] = sum3;
-}
-
-static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
-                                      uint8_t *pix1, uint8_t *pix2,
-                                      intptr_t i_stride, int scores[3] )
-{
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-
-    LOAD_ZERO;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t fencv, pix0v, pix1v, pix2v;
-    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB,  permEncv;
-
-    vec_s32_t sum0v, sum1v, sum2v;
-
-    sum0v = vec_splat_s32(0);
-    sum1v = vec_splat_s32(0);
-    sum2v = vec_splat_s32(0);
-
-    permEncv = vec_lvsl(0, fenc);
-    perm0vA = vec_lvsl(0, pix0);
-    perm1vA = vec_lvsl(0, pix1);
-    perm2vA = vec_lvsl(0, pix2);
-
-    perm0vB = vec_lvsl(0, pix0 + i_stride);
-    perm1vB = vec_lvsl(0, pix1 + i_stride);
-    perm2vB = vec_lvsl(0, pix2 + i_stride);
-
-    for( int y = 0; y < 4; y++ )
-    {
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-        temp_lv = vec_ld(0, pix0);
-        temp_hv = vec_ld(16, pix0);
-        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
-        pix0 += i_stride;
-
-        temp_lv = vec_ld(0, pix1);
-        temp_hv = vec_ld(16, pix1);
-        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
-        pix1 += i_stride;
-
-        temp_lv = vec_ld(0, fenc);
-        fencv = vec_perm(temp_lv, temp_hv, permEncv);
-        fenc += FENC_STRIDE;
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
-        pix2 += i_stride;
-
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-    }
-
-    sum0v = vec_sum2s( sum0v, zero_s32v );
-    sum1v = vec_sum2s( sum1v, zero_s32v );
-    sum2v = vec_sum2s( sum2v, zero_s32v );
-
-    sum0v = vec_splat( sum0v, 1 );
-    sum1v = vec_splat( sum1v, 1 );
-    sum2v = vec_splat( sum2v, 1 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-}
-
-/***********************************************************************
-* SSD routines
-**********************************************************************/
-
-static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
-                                     uint8_t *pix2, intptr_t i_stride_pix2 )
-{
-    ALIGNED_16( int sum );
-
-    LOAD_ZERO;
-    vec_u8_t  pix1vA, pix2vA, pix1vB, pix2vB;
-    vec_u32_t sumv;
-    vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t permA, permB;
-
-    sumv = vec_splat_u32(0);
-
-    permA = vec_lvsl(0, pix2);
-    permB = vec_lvsl(0, pix2 + i_stride_pix2);
-
-    temp_lv = vec_ld(0, pix2);
-    temp_hv = vec_ld(16, pix2);
-    pix2vA = vec_perm(temp_lv, temp_hv, permA);
-    pix1vA = vec_ld(0, pix1);
-
-    for( int y = 0; y < 7; y++ )
-    {
-        pix1 += i_stride_pix1;
-        pix2 += i_stride_pix2;
-
-        maxA = vec_max(pix1vA, pix2vA);
-        minA = vec_min(pix1vA, pix2vA);
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2vB = vec_perm(temp_lv, temp_hv, permB);
-        pix1vB = vec_ld(0, pix1);
-
-        diffA = vec_sub(maxA, minA);
-        sumv = vec_msum(diffA, diffA, sumv);
-
-        pix1 += i_stride_pix1;
-        pix2 += i_stride_pix2;
-
-        maxB = vec_max(pix1vB, pix2vB);
-        minB = vec_min(pix1vB, pix2vB);
-
-        temp_lv = vec_ld(0, pix2);
-        temp_hv = vec_ld(16, pix2);
-        pix2vA = vec_perm(temp_lv, temp_hv, permA);
-        pix1vA = vec_ld(0, pix1);
-
-        diffB = vec_sub(maxB, minB);
-        sumv = vec_msum(diffB, diffB, sumv);
-    }
-
-    pix1 += i_stride_pix1;
-    pix2 += i_stride_pix2;
-
-    temp_lv = vec_ld(0, pix2);
-    temp_hv = vec_ld(16, pix2);
-    pix2vB = vec_perm(temp_lv, temp_hv, permB);
-    pix1vB = vec_ld(0, pix1);
-
-    maxA = vec_max(pix1vA, pix2vA);
-    minA = vec_min(pix1vA, pix2vA);
-
-    maxB = vec_max(pix1vB, pix2vB);
-    minB = vec_min(pix1vB, pix2vB);
-
-    diffA = vec_sub(maxA, minA);
-    sumv = vec_msum(diffA, diffA, sumv);
-
-    diffB = vec_sub(maxB, minB);
-    sumv = vec_msum(diffB, diffB, sumv);
-
-    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
-    sumv = vec_splat(sumv, 3);
-    vec_ste((vec_s32_t) sumv, 0, &sum);
-    return sum;
-}
-
-static int pixel_ssd_8x8_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
-                                   uint8_t *pix2, intptr_t i_stride_pix2 )
-{
-    ALIGNED_16( int sum );
-
-    LOAD_ZERO;
-    vec_u8_t  pix1v, pix2v;
-    vec_u32_t sumv;
-    vec_u8_t maxv, minv, diffv;
-    vec_u8_t temp_lv, temp_hv;
-    vec_u8_t perm1v, perm2v;
-
-    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
-
-    sumv = vec_splat_u32(0);
-
-    perm1v = vec_lvsl(0, pix1);
-    perm2v = vec_lvsl(0, pix2);
-
-    for( int y = 0; y < 8; y++ )
-    {
-        temp_hv = vec_ld(0, pix1);
-        temp_lv = vec_ld(7, pix1);
-        pix1v = vec_perm(temp_hv, temp_lv, perm1v);
-
-        temp_hv = vec_ld(0, pix2);
-        temp_lv = vec_ld(7, pix2);
-        pix2v = vec_perm(temp_hv, temp_lv, perm2v);
-
-        maxv = vec_max(pix1v, pix2v);
-        minv = vec_min(pix1v, pix2v);
-
-        diffv = vec_sub(maxv, minv);
-        sumv = vec_msum(diffv, diffv, sumv);
-
-        pix1 += i_stride_pix1;
-        pix2 += i_stride_pix2;
-    }
-
-    sumv = vec_sel( zero_u32v, sumv, sel );
-
-    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
-    sumv = vec_splat(sumv, 3);
-    vec_ste((vec_s32_t) sumv, 0, &sum);
-
-    return sum;
-}
-
-
-/****************************************************************************
- * variance
- ****************************************************************************/
-static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride )
-{
-    ALIGNED_16(uint32_t sum_tab[4]);
-    ALIGNED_16(uint32_t sqr_tab[4]);
-
-    LOAD_ZERO;
-    vec_u32_t sqr_v = zero_u32v;
-    vec_u32_t sum_v = zero_u32v;
-
-    for( int y = 0; y < 16; y++ )
-    {
-        vec_u8_t pix0_v = vec_ld(0, pix);
-        sum_v = vec_sum4s(pix0_v, sum_v);
-        sqr_v = vec_msum(pix0_v, pix0_v, sqr_v);
-
-        pix += i_stride;
-    }
-    sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
-    sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
-    vec_ste(sum_v, 12, sum_tab);
-    vec_ste(sqr_v, 12, sqr_tab);
-
-    uint32_t sum = sum_tab[3];
-    uint32_t sqr = sqr_tab[3];
-    return sum + ((uint64_t)sqr<<32);
-}
-
-static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride )
-{
-    ALIGNED_16(uint32_t sum_tab[4]);
-    ALIGNED_16(uint32_t sqr_tab[4]);
-
-    LOAD_ZERO;
-    vec_u32_t sqr_v = zero_u32v;
-    vec_u32_t sum_v = zero_u32v;
-
-    static const vec_u8_t perm_tab[] =
-    {
-        CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  /* pix=mod16, i_stride=mod16 */
-           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17),
-        CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  /* pix=mod8, i_stride=mod16  */
-           0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F),
-    };
-    vec_u8_t perm = perm_tab[ ((uintptr_t)pix & 8) >> 3 ];
-
-    for( int y = 0; y < 4; y++ )
-    {
-        vec_u8_t pix0_v = vec_ld(0, pix);
-        vec_u8_t pix1_v = vec_ld(i_stride, pix);
-        vec_u8_t pix_v = vec_perm(pix0_v, pix1_v, perm);
-        sum_v = vec_sum4s(pix_v, sum_v);
-        sqr_v = vec_msum(pix_v, pix_v, sqr_v);
-
-        pix += i_stride<<1;
-    }
-    sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
-    sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
-    vec_ste(sum_v, 12, sum_tab);
-    vec_ste(sqr_v, 12, sqr_tab);
-
-    uint32_t sum = sum_tab[3];
-    uint32_t sqr = sqr_tab[3];
-    return sum + ((uint64_t)sqr<<32);
-}
-
-
-/**********************************************************************
- * SA8D routines: sum of 8x8 Hadamard transformed differences
- **********************************************************************/
-/* SA8D_1D unrolled by 8 in Altivec */
-#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
-                         sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
-{                                                         \
-    /* int    a0  =        SRC(0) + SRC(4) */             \
-    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v);              \
-    /* int    a4  =        SRC(0) - SRC(4) */             \
-    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v);              \
-    /* int    a1  =        SRC(1) + SRC(5) */             \
-    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v);              \
-    /* int    a5  =        SRC(1) - SRC(5) */             \
-    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v);              \
-    /* int    a2  =        SRC(2) + SRC(6) */             \
-    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v);              \
-    /* int    a6  =        SRC(2) - SRC(6) */             \
-    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v);              \
-    /* int    a3  =        SRC(3) + SRC(7) */             \
-    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v);              \
-    /* int    a7  =        SRC(3) - SRC(7) */             \
-    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v);              \
-                                                          \
-    /* int    b0  =         a0 + a2  */                   \
-    vec_s16_t b0v = vec_add(a0v, a2v);                    \
-    /* int    b2  =         a0 - a2; */                   \
-    vec_s16_t  b2v = vec_sub(a0v, a2v);                   \
-    /* int    b1  =         a1 + a3; */                   \
-    vec_s16_t b1v = vec_add(a1v, a3v);                    \
-    /* int    b3  =         a1 - a3; */                   \
-    vec_s16_t b3v = vec_sub(a1v, a3v);                    \
-    /* int    b4  =         a4 + a6; */                   \
-    vec_s16_t b4v = vec_add(a4v, a6v);                    \
-    /* int    b6  =         a4 - a6; */                   \
-    vec_s16_t b6v = vec_sub(a4v, a6v);                    \
-    /* int    b5  =         a5 + a7; */                   \
-    vec_s16_t b5v = vec_add(a5v, a7v);                    \
-    /* int    b7  =         a5 - a7; */                   \
-    vec_s16_t b7v = vec_sub(a5v, a7v);                    \
-                                                          \
-    /* DST(0,        b0 + b1) */                          \
-    sa8d0v = vec_add(b0v, b1v);                           \
-    /* DST(1,        b0 - b1) */                          \
-    sa8d1v = vec_sub(b0v, b1v);                           \
-    /* DST(2,        b2 + b3) */                          \
-    sa8d2v = vec_add(b2v, b3v);                           \
-    /* DST(3,        b2 - b3) */                          \
-    sa8d3v = vec_sub(b2v, b3v);                           \
-    /* DST(4,        b4 + b5) */                          \
-    sa8d4v = vec_add(b4v, b5v);                           \
-    /* DST(5,        b4 - b5) */                          \
-    sa8d5v = vec_sub(b4v, b5v);                           \
-    /* DST(6,        b6 + b7) */                          \
-    sa8d6v = vec_add(b6v, b7v);                           \
-    /* DST(7,        b6 - b7) */                          \
-    sa8d7v = vec_sub(b6v, b7v);                           \
-}
-
-static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                        uint8_t *pix2, intptr_t i_pix2 )
-{
-    int32_t i_satd=0;
-
-    PREP_DIFF;
-    PREP_LOAD_SRC( pix1 );
-    PREP_LOAD_SRC( pix2 );
-
-    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );
-
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );
-
-    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
-
-    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
-                    diff4v, diff5v, diff6v, diff7v);
-
-    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
-                    diff4v, diff5v, diff6v, diff7v,
-                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
-                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );
-
-    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
-                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );
-
-    /* accumulation of the absolute value of all elements of the resulting bloc */
-    vec_s16_t abs0v = VEC_ABS(sa8d0v);
-    vec_s16_t abs1v = VEC_ABS(sa8d1v);
-    vec_s16_t sum01v = vec_add(abs0v, abs1v);
-
-    vec_s16_t abs2v = VEC_ABS(sa8d2v);
-    vec_s16_t abs3v = VEC_ABS(sa8d3v);
-    vec_s16_t sum23v = vec_add(abs2v, abs3v);
-
-    vec_s16_t abs4v = VEC_ABS(sa8d4v);
-    vec_s16_t abs5v = VEC_ABS(sa8d5v);
-    vec_s16_t sum45v = vec_add(abs4v, abs5v);
-
-    vec_s16_t abs6v = VEC_ABS(sa8d6v);
-    vec_s16_t abs7v = VEC_ABS(sa8d7v);
-    vec_s16_t sum67v = vec_add(abs6v, abs7v);
-
-    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
-    vec_s16_t sum4567v = vec_add(sum45v, sum67v);
-
-    vec_s32_t sumblocv;
-
-    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
-    sumblocv = vec_sum4s(sum4567v, sumblocv );
-
-    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );
-
-    sumblocv = vec_splat(sumblocv, 3);
-
-    vec_ste(sumblocv, 0, &i_satd);
-
-    return i_satd;
-}
-
-static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                   uint8_t *pix2, intptr_t i_pix2 )
-{
-    int32_t i_satd;
-    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;
-    return i_satd;
-}
-
-static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
-                                     uint8_t *pix2, intptr_t i_pix2 )
-{
-    int32_t i_satd;
-
-    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
-            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
-            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
-            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2;
-    return i_satd;
-}
-
-#define HADAMARD4_ALTIVEC(d0,d1,d2,d3,s0,s1,s2,s3) {\
-    vec_s16_t t0 = vec_add(s0, s1);                 \
-    vec_s16_t t1 = vec_sub(s0, s1);                 \
-    vec_s16_t t2 = vec_add(s2, s3);                 \
-    vec_s16_t t3 = vec_sub(s2, s3);                 \
-    d0 = vec_add(t0, t2);                           \
-    d2 = vec_sub(t0, t2);                           \
-    d1 = vec_add(t1, t3);                           \
-    d3 = vec_sub(t1, t3);                           \
-}
-
-#define VEC_LOAD_HIGH( p, num )                                    \
-    vec_u8_t pix8_##num = vec_ld( stride*num, p );                 \
-    vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm); \
-    vec_s16_t pix16_d##num;
-
-static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm )
-{
-    ALIGNED_16( int32_t sum4_tab[4] );
-    ALIGNED_16( int32_t sum8_tab[4] );
-    LOAD_ZERO;
-
-    VEC_LOAD_HIGH( pix, 0 );
-    VEC_LOAD_HIGH( pix, 1 );
-    VEC_LOAD_HIGH( pix, 2 );
-    VEC_LOAD_HIGH( pix, 3 );
-    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
-                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);
-
-    VEC_LOAD_HIGH( pix, 4 );
-    VEC_LOAD_HIGH( pix, 5 );
-    VEC_LOAD_HIGH( pix, 6 );
-    VEC_LOAD_HIGH( pix, 7 );
-    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
-                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);
-
-    VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
-                    pix16_d4, pix16_d5, pix16_d6, pix16_d7,
-                    pix16_s0, pix16_s1, pix16_s2, pix16_s3,
-                    pix16_s4, pix16_s5, pix16_s6, pix16_s7);
-
-    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
-                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);
-
-    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
-                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);
-
-    vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) );
-    vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) );
-    vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) );
-    vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) );
-
-    vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67));
-    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum4_v, zero_s32v), zero_s32v), 12, sum4_tab);
-
-    vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4);
-    vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4);
-    vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5);
-    vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5);
-    vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6);
-    vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6);
-    vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7);
-    vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7);
-
-    int sum4 = sum4_tab[3];
-
-    VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
-                    tmpi4, tmpi5, tmpi6, tmpi7,
-                    pix16_d0, pix16_d1, pix16_d2, pix16_d3,
-                    pix16_d4, pix16_d5, pix16_d6, pix16_d7);
-
-    vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ),
-                                  VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) );
-    vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ),
-                                  VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) );
-    vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ),
-                                  VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) );
-    vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ),
-                                  VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) );
-
-    vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) );
-    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum8_v, zero_s32v), zero_s32v), 12, sum8_tab);
-
-    int sum8 = sum8_tab[3];
-
-    ALIGNED_16( int16_t tmp0_4_tab[8] );
-    vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);
-
-    sum4 -= tmp0_4_tab[0];
-    sum8 -= tmp0_4_tab[0];
-    return ((uint64_t)sum8<<32) + sum4;
-}
-
-
-static const vec_u8_t hadamard_permtab[] =
-{
-    CV(0x10,0x00,0x11,0x01, 0x12,0x02,0x13,0x03,     /* pix = mod16 */
-       0x14,0x04,0x15,0x05, 0x16,0x06,0x17,0x07 ),
-    CV(0x18,0x08,0x19,0x09, 0x1A,0x0A,0x1B,0x0B,     /* pix = mod8 */
-       0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F )
- };
-
-static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride )
-{
-    int idx =  ((uintptr_t)pix & 8) >> 3;
-    vec_u8_t permh = hadamard_permtab[idx];
-    vec_u8_t perml = hadamard_permtab[!idx];
-    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
-    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
-    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, permh );
-    sum += pixel_hadamard_ac_altivec( pix+8*stride+8, stride, perml );
-    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
-}
-
-static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride )
-{
-    int idx =  ((uintptr_t)pix & 8) >> 3;
-    vec_u8_t permh = hadamard_permtab[idx];
-    vec_u8_t perml = hadamard_permtab[!idx];
-    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
-    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
-    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
-}
-
-static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride )
-{
-    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
-    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
-    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, perm );
-    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
-}
-
-static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride )
-{
-    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
-    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
-    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
-}
-
-
-/****************************************************************************
- * structural similarity metric
- ****************************************************************************/
-static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1,
-                                     const uint8_t *pix2, intptr_t stride2,
-                                     int sums[2][4] )
-{
-    ALIGNED_16( int temp[4] );
-
-    vec_u8_t pix1v, pix2v;
-    vec_u32_t s1v, s2v, ssv, s12v;
-    PREP_LOAD;
-    PREP_LOAD_SRC (pix1);
-    PREP_LOAD_SRC (pix2);
-    LOAD_ZERO;
-
-    s1v = s2v = ssv = s12v = zero_u32v;
-
-    for( int y = 0; y < 4; y++ )
-    {
-        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
-        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );
-
-        s1v = vec_sum4s( pix1v, s1v );
-        s2v = vec_sum4s( pix2v, s2v );
-        ssv = vec_msum( pix1v, pix1v, ssv );
-        ssv = vec_msum( pix2v, pix2v, ssv );
-        s12v = vec_msum( pix1v, pix2v, s12v );
-    }
-
-    vec_st( (vec_s32_t)s1v, 0, temp );
-    sums[0][0] = temp[0];
-    sums[1][0] = temp[1];
-    vec_st( (vec_s32_t)s2v, 0, temp );
-    sums[0][1] = temp[0];
-    sums[1][1] = temp[1];
-    vec_st( (vec_s32_t)ssv, 0, temp );
-    sums[0][2] = temp[0];
-    sums[1][2] = temp[1];
-    vec_st( (vec_s32_t)s12v, 0, temp );
-    sums[0][3] = temp[0];
-    sums[1][3] = temp[1];
-}
-
-#define SATD_X( size ) \
-static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
-                                            intptr_t i_stride, int scores[3] )\
-{\
-    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
-    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
-    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
-}\
-static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
-                                            uint8_t *pix3, intptr_t i_stride, int scores[4] )\
-{\
-    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
-    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
-    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
-    scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride );\
-}
-SATD_X( 16x16 )\
-SATD_X( 16x8 )\
-SATD_X( 8x16 )\
-SATD_X( 8x8 )\
-SATD_X( 8x4 )\
-SATD_X( 4x8 )\
-SATD_X( 4x4 )
-
-
-#define INTRA_MBCMP_8x8( mbcmp )\
-void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] )\
-{\
-    ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\
-    x264_predict_8x8_v_c( pix, edge );\
-    res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_8x8_h_c( pix, edge );\
-    res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_8x8_dc_c( pix, edge );\
-    res[2] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
-}
-
-INTRA_MBCMP_8x8(sad)
-INTRA_MBCMP_8x8(sa8d)
-
-#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
-void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] )\
-{\
-    x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
-    res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
-    res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
-    x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
-    res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
-}
-
-INTRA_MBCMP(satd, 4, v, h, dc, )
-INTRA_MBCMP(sad, 8, dc, h, v, c )
-INTRA_MBCMP(satd, 8, dc, h, v, c )
-INTRA_MBCMP(sad, 16, v, h, dc, )
-INTRA_MBCMP(satd, 16, v, h, dc, )
-#endif // !HIGH_BIT_DEPTH
-
-/****************************************************************************
- * x264_pixel_init:
- ****************************************************************************/
-void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
-{
-#if !HIGH_BIT_DEPTH
-    pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
-    pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
-    pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
-    pixf->sad[PIXEL_8x8]    = pixel_sad_8x8_altivec;
-
-    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
-    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
-    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
-    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;
-
-    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
-    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
-    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
-    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;
-
-    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
-    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
-    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
-    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
-    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
-    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
-    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;
-
-    pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec;
-    pixf->satd_x3[PIXEL_8x16]  = pixel_satd_x3_8x16_altivec;
-    pixf->satd_x3[PIXEL_16x8]  = pixel_satd_x3_16x8_altivec;
-    pixf->satd_x3[PIXEL_8x8]   = pixel_satd_x3_8x8_altivec;
-    pixf->satd_x3[PIXEL_8x4]   = pixel_satd_x3_8x4_altivec;
-    pixf->satd_x3[PIXEL_4x8]   = pixel_satd_x3_4x8_altivec;
-    pixf->satd_x3[PIXEL_4x4]   = pixel_satd_x3_4x4_altivec;
-
-    pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec;
-    pixf->satd_x4[PIXEL_8x16]  = pixel_satd_x4_8x16_altivec;
-    pixf->satd_x4[PIXEL_16x8]  = pixel_satd_x4_16x8_altivec;
-    pixf->satd_x4[PIXEL_8x8]   = pixel_satd_x4_8x8_altivec;
-    pixf->satd_x4[PIXEL_8x4]   = pixel_satd_x4_8x4_altivec;
-    pixf->satd_x4[PIXEL_4x8]   = pixel_satd_x4_4x8_altivec;
-    pixf->satd_x4[PIXEL_4x4]   = pixel_satd_x4_4x4_altivec;
-
-    pixf->intra_sad_x3_8x8    = intra_sad_x3_8x8_altivec;
-    pixf->intra_sad_x3_8x8c   = intra_sad_x3_8x8c_altivec;
-    pixf->intra_sad_x3_16x16  = intra_sad_x3_16x16_altivec;
-
-    pixf->intra_satd_x3_4x4   = intra_satd_x3_4x4_altivec;
-    pixf->intra_satd_x3_8x8c  = intra_satd_x3_8x8c_altivec;
-    pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec;
-
-    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
-    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;
-
-    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
-    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;
-
-    pixf->intra_sa8d_x3_8x8   = intra_sa8d_x3_8x8_altivec;
-
-    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_altivec;
-    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_altivec;
-
-    pixf->hadamard_ac[PIXEL_16x16] = x264_pixel_hadamard_ac_16x16_altivec;
-    pixf->hadamard_ac[PIXEL_16x8]  = x264_pixel_hadamard_ac_16x8_altivec;
-    pixf->hadamard_ac[PIXEL_8x16]  = x264_pixel_hadamard_ac_8x16_altivec;
-    pixf->hadamard_ac[PIXEL_8x8]   = x264_pixel_hadamard_ac_8x8_altivec;
-
-    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/pixel.h b/android/src/main/libenc/jni/libx264/common/ppc/pixel.h
deleted file mode 100755
index 5e829f9..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/pixel.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*****************************************************************************
- * pixel.h: ppc pixel metrics
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Eric Petit <eric.petit@lapsus.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PPC_PIXEL_H
-#define X264_PPC_PIXEL_H
-
-void x264_pixel_altivec_init( x264_pixel_function_t *pixf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/ppccommon.h b/android/src/main/libenc/jni/libx264/common/ppc/ppccommon.h
deleted file mode 100755
index 4c91cd2..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/ppccommon.h
+++ /dev/null
@@ -1,346 +0,0 @@
-/*****************************************************************************
- * ppccommon.h: ppc utility macros
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Eric Petit <eric.petit@lapsus.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
-/***********************************************************************
- * For constant vectors, use parentheses on OS X and braces on Linux
- **********************************************************************/
-#if defined(__APPLE__) && __GNUC__ < 4
-#define CV(a...) (a)
-#else
-#define CV(a...) {a}
-#endif
-
-/***********************************************************************
- * Vector types
- **********************************************************************/
-#define vec_u8_t  vector unsigned char
-#define vec_s8_t  vector signed char
-#define vec_u16_t vector unsigned short
-#define vec_s16_t vector signed short
-#define vec_u32_t vector unsigned int
-#define vec_s32_t vector signed int
-
-typedef union {
-  uint32_t s[4];
-  vec_u32_t v;
-} vec_u32_u;
-
-typedef union {
-  uint16_t s[8];
-  vec_u16_t v;
-} vec_u16_u;
-
-typedef union {
-  int16_t s[8];
-  vec_s16_t v;
-} vec_s16_u;
-
-typedef union {
-  uint8_t s[16];
-  vec_u8_t v;
-} vec_u8_u;
-
-/***********************************************************************
- * Null vector
- **********************************************************************/
-#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
-
-#define zero_u8v  (vec_u8_t)  zerov
-#define zero_s8v  (vec_s8_t)  zerov
-#define zero_u16v (vec_u16_t) zerov
-#define zero_s16v (vec_s16_t) zerov
-#define zero_u32v (vec_u32_t) zerov
-#define zero_s32v (vec_s32_t) zerov
-
-/***********************************************************************
- * 8 <-> 16 bits conversions
- **********************************************************************/
-#ifdef WORDS_BIGENDIAN
-#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
-#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
-#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
-#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
-#else
-#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
-#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
-#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
-#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
-#endif
-
-#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
-#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
-
-#define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
-#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )
-
-
-/***********************************************************************
- * 16 <-> 32 bits conversions
- **********************************************************************/
-#ifdef WORDS_BIGENDIAN
-#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
-#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
-#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
-#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
-#else
-#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
-#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
-#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
-#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
-#endif
-
-#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
-#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
-
-#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
-#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
-
-
-/***********************************************************************
- * PREP_LOAD: declares two vectors required to perform unaligned loads
- * VEC_LOAD:  loads n bytes from u8 * p into vector v of type t where o is from original src offset
- * VEC_LOAD:_G: loads n bytes from u8 * p into vectory v of type t - use when offset is not known
- * VEC_LOAD_OFFSET: as above, but with offset vector known in advance
- **********************************************************************/
-#define PREP_LOAD     \
-    vec_u8_t _hv, _lv
-
-#define PREP_LOAD_SRC( src )              \
-    vec_u8_t _##src##_ = vec_lvsl(0, src)
-
-#define VEC_LOAD_G( p, v, n, t )                 \
-    _hv = vec_ld( 0, p );                        \
-    v   = (t) vec_lvsl( 0, p );                  \
-    _lv = vec_ld( n - 1, p );                    \
-    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
-
-#define VEC_LOAD( p, v, n, t, g )                   \
-    _hv = vec_ld( 0, p );                           \
-    _lv = vec_ld( n - 1, p );                       \
-    v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )
-
-#define VEC_LOAD_OFFSET( p, v, n, t, o )         \
-    _hv = vec_ld( 0, p);                         \
-    _lv = vec_ld( n - 1, p );                    \
-    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) o )
-
-#define VEC_LOAD_PARTIAL( p, v, n, t, g)               \
-    _hv = vec_ld( 0, p);                               \
-    v   = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
-
-
-/***********************************************************************
- * PREP_STORE##n: declares required vectors to store n bytes to a
- *                potentially unaligned address
- * VEC_STORE##n:  stores n bytes from vector v to address p
- **********************************************************************/
-#define PREP_STORE16 \
-    vec_u8_t _tmp1v  \
-
-#define PREP_STORE16_DST( dst )             \
-    vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
-    vec_u8_t _##dst##r_ = vec_lvsr(0, dst);
-
-#define VEC_STORE16( v, p, o )                           \
-    _hv    = vec_ld( 0, p );                             \
-    _lv    = vec_ld( 15, p );                            \
-    _tmp1v = vec_perm( _lv, _hv, _##o##l_ );             \
-    _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
-    vec_st( _lv, 15, (uint8_t *) p );                    \
-    _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
-    vec_st( _hv, 0, (uint8_t *) p )
-
-
-#define PREP_STORE8 \
-    vec_u8_t _tmp3v \
-
-#define VEC_STORE8( v, p )                \
-    _tmp3v = vec_lvsl(0, p);              \
-    v = vec_perm(v, v, _tmp3v);           \
-    vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
-    vec_ste((vec_u32_t)v,4,(uint32_t*)p)
-
-
-#define PREP_STORE4                                        \
-    PREP_STORE16;                                          \
-    vec_u8_t _tmp2v, _tmp3v;                               \
-    const vec_u8_t sel =                                   \
-        (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)
-
-#define VEC_STORE4( v, p )                      \
-    _tmp3v = vec_lvsr( 0, p );                  \
-    v      = vec_perm( v, v, _tmp3v );          \
-    _lv    = vec_ld( 3, p );                    \
-    _tmp1v = vec_perm( sel, zero_u8v, _tmp3v ); \
-    _lv    = vec_sel( _lv, v, _tmp1v );         \
-    vec_st( _lv, 3, p );                        \
-    _hv    = vec_ld( 0, p );                    \
-    _tmp2v = vec_perm( zero_u8v, sel, _tmp3v ); \
-    _hv    = vec_sel( _hv, v, _tmp2v );         \
-    vec_st( _hv, 0, p )
-
-/***********************************************************************
- * VEC_TRANSPOSE_8
- ***********************************************************************
- * Transposes a 8x8 matrix of s16 vectors
- **********************************************************************/
-#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
-    b0 = vec_mergeh( a0, a4 ); \
-    b1 = vec_mergel( a0, a4 ); \
-    b2 = vec_mergeh( a1, a5 ); \
-    b3 = vec_mergel( a1, a5 ); \
-    b4 = vec_mergeh( a2, a6 ); \
-    b5 = vec_mergel( a2, a6 ); \
-    b6 = vec_mergeh( a3, a7 ); \
-    b7 = vec_mergel( a3, a7 ); \
-    a0 = vec_mergeh( b0, b4 ); \
-    a1 = vec_mergel( b0, b4 ); \
-    a2 = vec_mergeh( b1, b5 ); \
-    a3 = vec_mergel( b1, b5 ); \
-    a4 = vec_mergeh( b2, b6 ); \
-    a5 = vec_mergel( b2, b6 ); \
-    a6 = vec_mergeh( b3, b7 ); \
-    a7 = vec_mergel( b3, b7 ); \
-    b0 = vec_mergeh( a0, a4 ); \
-    b1 = vec_mergel( a0, a4 ); \
-    b2 = vec_mergeh( a1, a5 ); \
-    b3 = vec_mergel( a1, a5 ); \
-    b4 = vec_mergeh( a2, a6 ); \
-    b5 = vec_mergel( a2, a6 ); \
-    b6 = vec_mergeh( a3, a7 ); \
-    b7 = vec_mergel( a3, a7 )
-
-/***********************************************************************
- * VEC_TRANSPOSE_4
- ***********************************************************************
- * Transposes a 4x4 matrix of s16 vectors.
- * Actually source and destination are 8x4. The low elements of the
- * source are discarded and the low elements of the destination mustn't
- * be used.
- **********************************************************************/
-#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
-    b0 = vec_mergeh( a0, a0 ); \
-    b1 = vec_mergeh( a1, a0 ); \
-    b2 = vec_mergeh( a2, a0 ); \
-    b3 = vec_mergeh( a3, a0 ); \
-    a0 = vec_mergeh( b0, b2 ); \
-    a1 = vec_mergel( b0, b2 ); \
-    a2 = vec_mergeh( b1, b3 ); \
-    a3 = vec_mergel( b1, b3 ); \
-    b0 = vec_mergeh( a0, a2 ); \
-    b1 = vec_mergel( a0, a2 ); \
-    b2 = vec_mergeh( a1, a3 ); \
-    b3 = vec_mergel( a1, a3 )
-
-/***********************************************************************
- * VEC_DIFF_H
- ***********************************************************************
- * p1, p2:    u8 *
- * i1, i2, n: int
- * d:         s16v
- *
- * Loads n bytes from p1 and p2, do the diff of the high elements into
- * d, increments p1 and p2 by i1 and i2 into known offset g
- **********************************************************************/
-#define PREP_DIFF           \
-    LOAD_ZERO;              \
-    PREP_LOAD;              \
-    vec_s16_t pix1v, pix2v;
-
-
-#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g)               \
-    VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
-    pix1v = vec_u8_to_s16( pix1v );                 \
-    VEC_LOAD( p2, pix2v, n, vec_s16_t, g);          \
-    pix2v = vec_u8_to_s16( pix2v );                 \
-    d     = vec_sub( pix1v, pix2v );                \
-    p1   += i1;                                     \
-    p2   += i2
-
-#define VEC_DIFF_H_OFFSET(p1,i1,p2,i2,n,d,g1,g2)    \
-    pix1v = (vec_s16_t)vec_perm( vec_ld( 0, p1 ), zero_u8v, _##g1##_ );\
-    pix1v = vec_u8_to_s16( pix1v );                 \
-    VEC_LOAD( p2, pix2v, n, vec_s16_t, g2);         \
-    pix2v = vec_u8_to_s16( pix2v );                 \
-    d     = vec_sub( pix1v, pix2v );                \
-    p1   += i1;                                     \
-    p2   += i2
-
-
-/***********************************************************************
- * VEC_DIFF_HL
- ***********************************************************************
- * p1, p2: u8 *
- * i1, i2: int
- * dh, dl: s16v
- *
- * Loads 16 bytes from p1 and p2, do the diff of the high elements into
- * dh, the diff of the low elements into dl, increments p1 and p2 by i1
- * and i2
- **********************************************************************/
-#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl)       \
-    pix1v = (vec_s16_t)vec_ld(0, p1);        \
-    temp0v = vec_u8_to_s16_h( pix1v );       \
-    temp1v = vec_u8_to_s16_l( pix1v );       \
-    VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
-    temp2v = vec_u8_to_s16_h( pix2v );       \
-    temp3v = vec_u8_to_s16_l( pix2v );       \
-    dh     = vec_sub( temp0v, temp2v );      \
-    dl     = vec_sub( temp1v, temp3v );      \
-    p1    += i1;                             \
-    p2    += i2
-
-/***********************************************************************
-* VEC_DIFF_H_8BYTE_ALIGNED
-***********************************************************************
-* p1, p2:    u8 *
-* i1, i2, n: int
-* d:         s16v
-*
-* Loads n bytes from p1 and p2, do the diff of the high elements into
-* d, increments p1 and p2 by i1 and i2
-* Slightly faster when we know we are loading/diffing 8bytes which
-* are 8 byte aligned. Reduces need for two loads and two vec_lvsl()'s
-**********************************************************************/
-#define PREP_DIFF_8BYTEALIGNED \
-LOAD_ZERO;                     \
-vec_s16_t pix1v, pix2v;        \
-vec_u8_t pix1v8, pix2v8;       \
-vec_u8_t permPix1, permPix2;   \
-permPix1 = vec_lvsl(0, pix1);  \
-permPix2 = vec_lvsl(0, pix2);  \
-
-#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d)     \
-pix1v8 = vec_perm(vec_ld(0,p1), zero_u8v, permPix1);  \
-pix2v8 = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
-pix1v = vec_u8_to_s16( pix1v8 );                      \
-pix2v = vec_u8_to_s16( pix2v8 );                      \
-d = vec_sub( pix1v, pix2v);                           \
-p1 += i1;                                             \
-p2 += i2;
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/predict.c b/android/src/main/libenc/jni/libx264/common/ppc/predict.c
deleted file mode 100755
index f35e180..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/predict.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/*****************************************************************************
- * predict.c: ppc intra prediction
- *****************************************************************************
- * Copyright (C) 2007-2016 x264 project
- *
- * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "predict.h"
-#include "pixel.h"
-#include "ppccommon.h"
-
-#if !HIGH_BIT_DEPTH
-static void predict_8x8c_p_altivec( uint8_t *src )
-{
-    int H = 0, V = 0;
-
-    for( int i = 0; i < 4; i++ )
-    {
-        H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] );
-        V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] );
-    }
-
-    int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
-    int b = ( 17 * H + 16 ) >> 5;
-    int c = ( 17 * V + 16 ) >> 5;
-    int i00 = a -3*b -3*c + 16;
-
-    vec_s16_u i00_u, b_u, c_u;
-    i00_u.s[0] = i00;
-    b_u.s[0]   = b;
-    c_u.s[0]   = c;
-
-    vec_u16_t val5_v = vec_splat_u16(5);
-    vec_s16_t i00_v, b_v, c_v;
-    i00_v = vec_splat(i00_u.v, 0);
-    b_v = vec_splat(b_u.v, 0);
-    c_v = vec_splat(c_u.v, 0);
-
-    vec_s16_t induc_v  = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
-    vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
-
-    PREP_STORE8;
-
-    for( int i = 0; i < 8; ++i )
-    {
-        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
-        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_0_v);
-        VEC_STORE8(com_sat_v, &src[0]);
-        src += FDEC_STRIDE;
-        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
-
-    }
-}
-
-
-/****************************************************************************
- * 16x16 prediction for intra luma block
- ****************************************************************************/
-
-static void predict_16x16_p_altivec( uint8_t *src )
-{
-    int H = 0, V = 0;
-
-    for( int i = 1; i <= 8; i++ )
-    {
-        H += i * ( src[7+i - FDEC_STRIDE ]  - src[7-i - FDEC_STRIDE ] );
-        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
-    }
-
-    int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
-    int b = ( 5 * H + 32 ) >> 6;
-    int c = ( 5 * V + 32 ) >> 6;
-    int i00 = a - b * 7 - c * 7 + 16;
-
-    vec_s16_u i00_u, b_u, c_u;
-    i00_u.s[0] = i00;
-    b_u.s[0]   = b;
-    c_u.s[0]   = c;
-
-    vec_u16_t val5_v = vec_splat_u16(5);
-    vec_s16_t i00_v, b_v, c_v;
-    i00_v = vec_splat(i00_u.v, 0);
-    b_v = vec_splat(b_u.v, 0);
-    c_v = vec_splat(c_u.v, 0);
-    vec_s16_t induc_v  = (vec_s16_t) CV(0,  1,  2,  3,  4,  5,  6,  7);
-    vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3));
-    vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
-    vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v);
-
-    for( int y = 0; y < 16; y++ )
-    {
-        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
-        vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v);
-        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
-        vec_st( com_sat_v, 0, &src[0]);
-        src += FDEC_STRIDE;
-        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
-        add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
-    }
-}
-
-#define PREDICT_16x16_DC_ALTIVEC(v) \
-for( int i = 0; i < 16; i += 2)     \
-{                                   \
-    vec_st(v, 0, src);              \
-    vec_st(v, FDEC_STRIDE, src);    \
-    src += FDEC_STRIDE*2;           \
-}
-
-static void predict_16x16_dc_altivec( uint8_t *src )
-{
-    uint32_t dc = 0;
-
-    for( int i = 0; i < 16; i++ )
-    {
-        dc += src[-1 + i * FDEC_STRIDE];
-        dc += src[i - FDEC_STRIDE];
-    }
-    vec_u8_u v ; v.s[0] = (( dc + 16 ) >> 5);
-    vec_u8_t bc_v = vec_splat(v.v, 0);
-
-    PREDICT_16x16_DC_ALTIVEC(bc_v);
-}
-
-static void predict_16x16_dc_left_altivec( uint8_t *src )
-{
-    uint32_t dc = 0;
-
-    for( int i = 0; i < 16; i++ )
-        dc += src[-1 + i * FDEC_STRIDE];
-    vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4);
-    vec_u8_t bc_v = vec_splat(v.v, 0);
-
-    PREDICT_16x16_DC_ALTIVEC(bc_v);
-}
-
-static void predict_16x16_dc_top_altivec( uint8_t *src )
-{
-    uint32_t dc = 0;
-
-    for( int i = 0; i < 16; i++ )
-        dc += src[i - FDEC_STRIDE];
-    vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4);
-    vec_u8_t bc_v = vec_splat(v.v, 0);
-
-    PREDICT_16x16_DC_ALTIVEC(bc_v);
-}
-
-static void predict_16x16_dc_128_altivec( uint8_t *src )
-{
-    /* test if generating the constant is faster than loading it.
-    vector unsigned int bc_v = (vector unsigned int)CV(0x80808080, 0x80808080, 0x80808080, 0x80808080);
-    */
-    vec_u8_t bc_v = vec_vslb((vec_u8_t)vec_splat_u8(1),(vec_u8_t)vec_splat_u8(7));
-    PREDICT_16x16_DC_ALTIVEC(bc_v);
-}
-
-static void predict_16x16_h_altivec( uint8_t *src )
-{
-    for( int i = 0; i < 16; i++ )
-    {
-        vec_u8_t v = vec_ld(-1, src);
-        vec_u8_t v_v = vec_splat(v, 15);
-        vec_st(v_v, 0, src);
-
-        src += FDEC_STRIDE;
-    }
-}
-
-static void predict_16x16_v_altivec( uint8_t *src )
-{
-    vec_u32_u v;
-    v.s[0] = *(uint32_t*)&src[ 0-FDEC_STRIDE];
-    v.s[1] = *(uint32_t*)&src[ 4-FDEC_STRIDE];
-    v.s[2] = *(uint32_t*)&src[ 8-FDEC_STRIDE];
-    v.s[3] = *(uint32_t*)&src[12-FDEC_STRIDE];
-
-    for( int i = 0; i < 16; i++ )
-    {
-        vec_st(v.v, 0, (uint32_t*)src);
-        src += FDEC_STRIDE;
-    }
-}
-#endif // !HIGH_BIT_DEPTH
-
-
-/****************************************************************************
- * Exported functions:
- ****************************************************************************/
-void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
-{
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_16x16_V ]      = predict_16x16_v_altivec;
-    pf[I_PRED_16x16_H ]      = predict_16x16_h_altivec;
-    pf[I_PRED_16x16_DC]      = predict_16x16_dc_altivec;
-    pf[I_PRED_16x16_P ]      = predict_16x16_p_altivec;
-    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec;
-    pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec;
-    pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec;
-#endif // !HIGH_BIT_DEPTH
-}
-
-void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] )
-{
-#if !HIGH_BIT_DEPTH
-    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_altivec;
-#endif // !HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/predict.h b/android/src/main/libenc/jni/libx264/common/ppc/predict.h
deleted file mode 100755
index 79ebc6c..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/predict.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*****************************************************************************
- * predict.h: ppc intra prediction
- *****************************************************************************
- * Copyright (C) 2007-2016 x264 project
- *
- * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PPC_PREDICT_H
-#define X264_PPC_PREDICT_H
-
-void x264_predict_16x16_init_altivec ( x264_predict_t pf[7] );
-void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] );
-
-#endif /* X264_PPC_PREDICT_H */
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/quant.c b/android/src/main/libenc/jni/libx264/common/ppc/quant.c
deleted file mode 100755
index 52d02a5..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/quant.c
+++ /dev/null
@@ -1,378 +0,0 @@
-/*****************************************************************************
- * quant.c: ppc quantization
- *****************************************************************************
- * Copyright (C) 2007-2016 x264 project
- *
- * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "ppccommon.h"
-#include "quant.h"
-
-#if !HIGH_BIT_DEPTH
-// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
-#define QUANT_16_U( idx0, idx1 )                                    \
-{                                                                   \
-    temp1v = vec_ld((idx0), dct);                                   \
-    temp2v = vec_ld((idx1), dct);                                   \
-    mfvA = vec_ld((idx0), mf);                                      \
-    mfvB = vec_ld((idx1), mf);                                      \
-    biasvA = vec_ld((idx0), bias);                                  \
-    biasvB = vec_ld((idx1), bias);                                  \
-    mskA = vec_cmplt(temp1v, zero_s16v);                            \
-    mskB = vec_cmplt(temp2v, zero_s16v);                            \
-    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
-    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
-    coefvA = vec_adds(coefvA, biasvA);                              \
-    coefvB = vec_adds(coefvB, biasvB);                              \
-    multEvenvA = vec_mule(coefvA, mfvA);                            \
-    multOddvA = vec_mulo(coefvA, mfvA);                             \
-    multEvenvB = vec_mule(coefvB, mfvB);                            \
-    multOddvB = vec_mulo(coefvB, mfvB);                             \
-    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
-    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
-    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
-    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
-    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
-    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
-    temp1v = vec_xor(temp1v, mskA);                                 \
-    temp2v = vec_xor(temp2v, mskB);                                 \
-    temp1v = vec_adds(temp1v, vec_and(mskA, one));                  \
-    vec_st(temp1v, (idx0), dct);                                    \
-    temp2v = vec_adds(temp2v, vec_and(mskB, one));                  \
-    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
-    vec_st(temp2v, (idx1), dct);                                    \
-}
-
-int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-{
-    LOAD_ZERO;
-    vector bool short mskA;
-    vec_u32_t i_qbitsv;
-    vec_u16_t coefvA;
-    vec_u32_t multEvenvA, multOddvA;
-    vec_u16_t mfvA;
-    vec_u16_t biasvA;
-    vec_s16_t one = vec_splat_s16(1);
-    vec_s16_t nz = zero_s16v;
-
-    vector bool short mskB;
-    vec_u16_t coefvB;
-    vec_u32_t multEvenvB, multOddvB;
-    vec_u16_t mfvB;
-    vec_u16_t biasvB;
-
-    vec_s16_t temp1v, temp2v;
-
-    vec_u32_u qbits_u;
-    qbits_u.s[0]=16;
-    i_qbitsv = vec_splat(qbits_u.v, 0);
-
-    QUANT_16_U( 0, 16 );
-    return vec_any_ne(nz, zero_s16v);
-}
-
-// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
-#define QUANT_16_U_DC( idx0, idx1 )                                 \
-{                                                                   \
-    temp1v = vec_ld((idx0), dct);                                   \
-    temp2v = vec_ld((idx1), dct);                                   \
-    mskA = vec_cmplt(temp1v, zero_s16v);                            \
-    mskB = vec_cmplt(temp2v, zero_s16v);                            \
-    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
-    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
-    coefvA = vec_add(coefvA, biasv);                                \
-    coefvB = vec_add(coefvB, biasv);                                \
-    multEvenvA = vec_mule(coefvA, mfv);                             \
-    multOddvA = vec_mulo(coefvA, mfv);                              \
-    multEvenvB = vec_mule(coefvB, mfv);                             \
-    multOddvB = vec_mulo(coefvB, mfv);                              \
-    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
-    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
-    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
-    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
-    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
-    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
-    temp1v = vec_xor(temp1v, mskA);                                 \
-    temp2v = vec_xor(temp2v, mskB);                                 \
-    temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
-    vec_st(temp1v, (idx0), dct);                                    \
-    temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
-    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
-    vec_st(temp2v, (idx1), dct);                                    \
-}
-
-int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
-{
-    LOAD_ZERO;
-    vector bool short mskA;
-    vec_u32_t i_qbitsv;
-    vec_u16_t coefvA;
-    vec_u32_t multEvenvA, multOddvA;
-    vec_s16_t one = vec_splat_s16(1);
-    vec_s16_t nz = zero_s16v;
-
-    vector bool short mskB;
-    vec_u16_t coefvB;
-    vec_u32_t multEvenvB, multOddvB;
-
-    vec_s16_t temp1v, temp2v;
-
-    vec_u16_t mfv;
-    vec_u16_t biasv;
-
-    vec_u16_u mf_u;
-    mf_u.s[0]=mf;
-    mfv = vec_splat( mf_u.v, 0 );
-
-    vec_u32_u qbits_u;
-    qbits_u.s[0]=16;
-    i_qbitsv = vec_splat(qbits_u.v, 0);
-
-    vec_u16_u bias_u;
-    bias_u.s[0]=bias;
-    biasv = vec_splat(bias_u.v, 0);
-
-    QUANT_16_U_DC( 0, 16 );
-    return vec_any_ne(nz, zero_s16v);
-}
-
-// DC quant of a whole 2x2 block
-#define QUANT_4_U_DC( idx0 )                                        \
-{                                                                   \
-    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
-    temp1v = vec_ld((idx0), dct);                                   \
-    mskA = vec_cmplt(temp1v, zero_s16v);                            \
-    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
-    coefvA = vec_add(coefvA, biasv);                                \
-    multEvenvA = vec_mule(coefvA, mfv);                             \
-    multOddvA = vec_mulo(coefvA, mfv);                              \
-    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
-    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
-    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
-    temp2v = vec_xor(temp2v, mskA);                                 \
-    temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
-    temp1v = vec_sel(temp1v, temp2v, sel);                          \
-    nz = vec_or(nz, temp1v);                                        \
-    vec_st(temp1v, (idx0), dct);                                    \
-}
-
-int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
-{
-    LOAD_ZERO;
-    vector bool short mskA;
-    vec_u32_t i_qbitsv;
-    vec_u16_t coefvA;
-    vec_u32_t multEvenvA, multOddvA;
-    vec_s16_t one = vec_splat_s16(1);
-    vec_s16_t nz = zero_s16v;
-
-    vec_s16_t temp1v, temp2v;
-
-    vec_u16_t mfv;
-    vec_u16_t biasv;
-
-    vec_u16_u mf_u;
-    mf_u.s[0]=mf;
-    mfv = vec_splat( mf_u.v, 0 );
-
-    vec_u32_u qbits_u;
-    qbits_u.s[0]=16;
-    i_qbitsv = vec_splat(qbits_u.v, 0);
-
-    vec_u16_u bias_u;
-    bias_u.s[0]=bias;
-    biasv = vec_splat(bias_u.v, 0);
-
-    static const vec_s16_t mask2 = CV(-1, -1, -1, -1,  0, 0, 0, 0);
-    QUANT_4_U_DC(0);
-    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
-}
-
-int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-{
-    LOAD_ZERO;
-    vector bool short mskA;
-    vec_u32_t i_qbitsv;
-    vec_u16_t coefvA;
-    vec_u32_t multEvenvA, multOddvA;
-    vec_u16_t mfvA;
-    vec_u16_t biasvA;
-    vec_s16_t one = vec_splat_s16(1);
-    vec_s16_t nz = zero_s16v;
-
-    vector bool short mskB;
-    vec_u16_t coefvB;
-    vec_u32_t multEvenvB, multOddvB;
-    vec_u16_t mfvB;
-    vec_u16_t biasvB;
-
-    vec_s16_t temp1v, temp2v;
-
-    vec_u32_u qbits_u;
-    qbits_u.s[0]=16;
-    i_qbitsv = vec_splat(qbits_u.v, 0);
-
-    for( int i = 0; i < 4; i++ )
-        QUANT_16_U( i*2*16, i*2*16+16 );
-    return vec_any_ne(nz, zero_s16v);
-}
-
-#define DEQUANT_SHL()                                                \
-{                                                                    \
-    dctv = vec_ld(8*y, dct);                                         \
-    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                           \
-    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                        \
-    mfv  = vec_packs(mf1v, mf2v);                                    \
-                                                                     \
-    multEvenvA = vec_mule(dctv, mfv);                                \
-    multOddvA = vec_mulo(dctv, mfv);                                 \
-    dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA),  \
-                                 vec_mergel(multEvenvA, multOddvA)); \
-    dctv = vec_sl(dctv, i_qbitsv);                                   \
-    vec_st(dctv, 8*y, dct);                                          \
-}
-
-#ifdef WORDS_BIGENDIAN
-#define VEC_MULE vec_mule
-#define VEC_MULO vec_mulo
-#else
-#define VEC_MULE vec_mulo
-#define VEC_MULO vec_mule
-#endif
-
-#define DEQUANT_SHR()                                          \
-{                                                              \
-    dctv = vec_ld(8*y, dct);                                   \
-    dct1v = vec_mergeh(dctv, dctv);                            \
-    dct2v = vec_mergel(dctv, dctv);                            \
-    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                     \
-    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                  \
-                                                               \
-    multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v);             \
-    multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v);              \
-    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
-    temp1v = vec_add(temp1v, fv);                              \
-    temp1v = vec_sra(temp1v, i_qbitsv);                        \
-                                                               \
-    multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v);             \
-    multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v);              \
-    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
-    temp2v = vec_add(temp2v, fv);                              \
-    temp2v = vec_sra(temp2v, i_qbitsv);                        \
-                                                               \
-    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);               \
-    vec_st(dctv, y*8, dct);                                    \
-}
-
-void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-{
-    int i_mf = i_qp%6;
-    int i_qbits = i_qp/6 - 4;
-
-    vec_s16_t dctv;
-    vec_s16_t dct1v, dct2v;
-    vec_s32_t mf1v, mf2v;
-    vec_s16_t mfv;
-    vec_s32_t multEvenvA, multOddvA;
-    vec_s32_t temp1v, temp2v;
-
-    if( i_qbits >= 0 )
-    {
-        vec_u16_t i_qbitsv;
-        vec_u16_u qbits_u;
-        qbits_u.s[0]=i_qbits;
-        i_qbitsv = vec_splat(qbits_u.v, 0);
-
-        for( int y = 0; y < 4; y+=2 )
-            DEQUANT_SHL();
-    }
-    else
-    {
-        const int f = 1 << (-i_qbits-1);
-
-        vec_s32_t fv;
-        vec_u32_u f_u;
-        f_u.s[0]=f;
-        fv = (vec_s32_t)vec_splat(f_u.v, 0);
-
-        vec_u32_t i_qbitsv;
-        vec_u32_u qbits_u;
-        qbits_u.s[0]=-i_qbits;
-        i_qbitsv = vec_splat(qbits_u.v, 0);
-
-        vec_u32_t sixteenv;
-        vec_u32_u sixteen_u;
-        sixteen_u.s[0]=16;
-        sixteenv = vec_splat(sixteen_u.v, 0);
-
-        for( int y = 0; y < 4; y+=2 )
-            DEQUANT_SHR();
-    }
-}
-
-void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
-{
-    int i_mf = i_qp%6;
-    int i_qbits = i_qp/6 - 6;
-
-    vec_s16_t dctv;
-    vec_s16_t dct1v, dct2v;
-    vec_s32_t mf1v, mf2v;
-    vec_s16_t mfv;
-    vec_s32_t multEvenvA, multOddvA;
-    vec_s32_t temp1v, temp2v;
-
-    if( i_qbits >= 0 )
-    {
-        vec_u16_t i_qbitsv;
-        vec_u16_u qbits_u;
-        qbits_u.s[0]=i_qbits;
-        i_qbitsv = vec_splat(qbits_u.v, 0);
-
-        for( int y = 0; y < 16; y+=2 )
-            DEQUANT_SHL();
-    }
-    else
-    {
-        const int f = 1 << (-i_qbits-1);
-
-        vec_s32_t fv;
-        vec_u32_u f_u;
-        f_u.s[0]=f;
-        fv = (vec_s32_t)vec_splat(f_u.v, 0);
-
-        vec_u32_t i_qbitsv;
-        vec_u32_u qbits_u;
-        qbits_u.s[0]=-i_qbits;
-        i_qbitsv = vec_splat(qbits_u.v, 0);
-
-        vec_u32_t sixteenv;
-        vec_u32_u sixteen_u;
-        sixteen_u.s[0]=16;
-        sixteenv = vec_splat(sixteen_u.v, 0);
-
-        for( int y = 0; y < 16; y+=2 )
-            DEQUANT_SHR();
-    }
-}
-#endif // !HIGH_BIT_DEPTH
-
diff --git a/android/src/main/libenc/jni/libx264/common/ppc/quant.h b/android/src/main/libenc/jni/libx264/common/ppc/quant.h
deleted file mode 100755
index 5335e5e..0000000
--- a/android/src/main/libenc/jni/libx264/common/ppc/quant.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*****************************************************************************
- * quant.h: ppc quantization
- *****************************************************************************
- * Copyright (C) 2007-2016 x264 project
- *
- * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PPC_QUANT_H
-#define X264_PPC_QUANT_H
-
-int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
-
-int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias );
-int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias );
-
-void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/predict.c b/android/src/main/libenc/jni/libx264/common/predict.c
deleted file mode 100755
index 6b75dd8..0000000
--- a/android/src/main/libenc/jni/libx264/common/predict.c
+++ /dev/null
@@ -1,1054 +0,0 @@
-/*****************************************************************************
- * predict.c: intra prediction
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-/* predict4x4 are inspired from ffmpeg h264 decoder */
-
-
-#include "common.h"
-
-#if HAVE_MMX
-#   include "x86/predict.h"
-#endif
-#if ARCH_PPC
-#   include "ppc/predict.h"
-#endif
-#if ARCH_ARM
-#   include "arm/predict.h"
-#endif
-#if ARCH_AARCH64
-#   include "aarch64/predict.h"
-#endif
-#if ARCH_MIPS
-#   include "mips/predict.h"
-#endif
-
-/****************************************************************************
- * 16x16 prediction for intra luma block
- ****************************************************************************/
-
-#define PREDICT_16x16_DC(v)\
-    for( int i = 0; i < 16; i++ )\
-    {\
-        MPIXEL_X4( src+ 0 ) = v;\
-        MPIXEL_X4( src+ 4 ) = v;\
-        MPIXEL_X4( src+ 8 ) = v;\
-        MPIXEL_X4( src+12 ) = v;\
-        src += FDEC_STRIDE;\
-    }
-
-void x264_predict_16x16_dc_c( pixel *src )
-{
-    int dc = 0;
-
-    for( int i = 0; i < 16; i++ )
-    {
-        dc += src[-1 + i * FDEC_STRIDE];
-        dc += src[i - FDEC_STRIDE];
-    }
-    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );
-
-    PREDICT_16x16_DC( dcsplat );
-}
-static void x264_predict_16x16_dc_left_c( pixel *src )
-{
-    int dc = 0;
-
-    for( int i = 0; i < 16; i++ )
-        dc += src[-1 + i * FDEC_STRIDE];
-    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
-
-    PREDICT_16x16_DC( dcsplat );
-}
-static void x264_predict_16x16_dc_top_c( pixel *src )
-{
-    int dc = 0;
-
-    for( int i = 0; i < 16; i++ )
-        dc += src[i - FDEC_STRIDE];
-    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
-
-    PREDICT_16x16_DC( dcsplat );
-}
-static void x264_predict_16x16_dc_128_c( pixel *src )
-{
-    PREDICT_16x16_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
-}
-void x264_predict_16x16_h_c( pixel *src )
-{
-    for( int i = 0; i < 16; i++ )
-    {
-        const pixel4 v = PIXEL_SPLAT_X4( src[-1] );
-        MPIXEL_X4( src+ 0 ) = v;
-        MPIXEL_X4( src+ 4 ) = v;
-        MPIXEL_X4( src+ 8 ) = v;
-        MPIXEL_X4( src+12 ) = v;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_16x16_v_c( pixel *src )
-{
-    pixel4 v0 = MPIXEL_X4( &src[ 0-FDEC_STRIDE] );
-    pixel4 v1 = MPIXEL_X4( &src[ 4-FDEC_STRIDE] );
-    pixel4 v2 = MPIXEL_X4( &src[ 8-FDEC_STRIDE] );
-    pixel4 v3 = MPIXEL_X4( &src[12-FDEC_STRIDE] );
-
-    for( int i = 0; i < 16; i++ )
-    {
-        MPIXEL_X4( src+ 0 ) = v0;
-        MPIXEL_X4( src+ 4 ) = v1;
-        MPIXEL_X4( src+ 8 ) = v2;
-        MPIXEL_X4( src+12 ) = v3;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_16x16_p_c( pixel *src )
-{
-    int H = 0, V = 0;
-
-    /* calculate H and V */
-    for( int i = 0; i <= 7; i++ )
-    {
-        H += ( i + 1 ) * ( src[ 8 + i - FDEC_STRIDE ] - src[6 -i -FDEC_STRIDE] );
-        V += ( i + 1 ) * ( src[-1 + (8+i)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
-    }
-
-    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[15 - FDEC_STRIDE] );
-    int b = ( 5 * H + 32 ) >> 6;
-    int c = ( 5 * V + 32 ) >> 6;
-
-    int i00 = a - b * 7 - c * 7 + 16;
-
-    for( int y = 0; y < 16; y++ )
-    {
-        int pix = i00;
-        for( int x = 0; x < 16; x++ )
-        {
-            src[x] = x264_clip_pixel( pix>>5 );
-            pix += b;
-        }
-        src += FDEC_STRIDE;
-        i00 += c;
-    }
-}
-
-
-/****************************************************************************
- * 8x8 prediction for intra chroma block (4:2:0)
- ****************************************************************************/
-
-static void x264_predict_8x8c_dc_128_c( pixel *src )
-{
-    for( int y = 0; y < 8; y++ )
-    {
-        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
-        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
-        src += FDEC_STRIDE;
-    }
-}
-static void x264_predict_8x8c_dc_left_c( pixel *src )
-{
-    int dc0 = 0, dc1 = 0;
-
-    for( int y = 0; y < 4; y++ )
-    {
-        dc0 += src[y * FDEC_STRIDE     - 1];
-        dc1 += src[(y+4) * FDEC_STRIDE - 1];
-    }
-    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
-
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc0splat;
-        MPIXEL_X4( src+4 ) = dc0splat;
-        src += FDEC_STRIDE;
-    }
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc1splat;
-        MPIXEL_X4( src+4 ) = dc1splat;
-        src += FDEC_STRIDE;
-    }
-
-}
-static void x264_predict_8x8c_dc_top_c( pixel *src )
-{
-    int dc0 = 0, dc1 = 0;
-
-    for( int x = 0; x < 4; x++ )
-    {
-        dc0 += src[x     - FDEC_STRIDE];
-        dc1 += src[x + 4 - FDEC_STRIDE];
-    }
-    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
-
-    for( int y = 0; y < 8; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc0splat;
-        MPIXEL_X4( src+4 ) = dc1splat;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x8c_dc_c( pixel *src )
-{
-    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-
-    /*
-          s0 s1
-       s2
-       s3
-    */
-    for( int i = 0; i < 4; i++ )
-    {
-        s0 += src[i - FDEC_STRIDE];
-        s1 += src[i + 4 - FDEC_STRIDE];
-        s2 += src[-1 + i * FDEC_STRIDE];
-        s3 += src[-1 + (i+4)*FDEC_STRIDE];
-    }
-    /*
-       dc0 dc1
-       dc2 dc3
-     */
-    pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
-    pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
-    pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
-    pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );
-
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc0;
-        MPIXEL_X4( src+4 ) = dc1;
-        src += FDEC_STRIDE;
-    }
-
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc2;
-        MPIXEL_X4( src+4 ) = dc3;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x8c_h_c( pixel *src )
-{
-    for( int i = 0; i < 8; i++ )
-    {
-        pixel4 v = PIXEL_SPLAT_X4( src[-1] );
-        MPIXEL_X4( src+0 ) = v;
-        MPIXEL_X4( src+4 ) = v;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x8c_v_c( pixel *src )
-{
-    pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
-    pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );
-
-    for( int i = 0; i < 8; i++ )
-    {
-        MPIXEL_X4( src+0 ) = v0;
-        MPIXEL_X4( src+4 ) = v1;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x8c_p_c( pixel *src )
-{
-    int H = 0, V = 0;
-
-    for( int i = 0; i < 4; i++ )
-    {
-        H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] );
-        V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] );
-    }
-
-    int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
-    int b = ( 17 * H + 16 ) >> 5;
-    int c = ( 17 * V + 16 ) >> 5;
-    int i00 = a -3*b -3*c + 16;
-
-    for( int y = 0; y < 8; y++ )
-    {
-        int pix = i00;
-        for( int x = 0; x < 8; x++ )
-        {
-            src[x] = x264_clip_pixel( pix>>5 );
-            pix += b;
-        }
-        src += FDEC_STRIDE;
-        i00 += c;
-    }
-}
-
-/****************************************************************************
- * 8x16 prediction for intra chroma block (4:2:2)
- ****************************************************************************/
-
-static void x264_predict_8x16c_dc_128_c( pixel *src )
-{
-    for( int y = 0; y < 16; y++ )
-    {
-        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
-        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
-        src += FDEC_STRIDE;
-    }
-}
-static void x264_predict_8x16c_dc_left_c( pixel *src )
-{
-    for( int i = 0; i < 4; i++ )
-    {
-        int dc = 0;
-
-        for( int y = 0; y < 4; y++ )
-            dc += src[y*FDEC_STRIDE - 1];
-
-        pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 );
-
-        for( int y = 0; y < 4; y++ )
-        {
-            MPIXEL_X4( src+0 ) = dcsplat;
-            MPIXEL_X4( src+4 ) = dcsplat;
-            src += FDEC_STRIDE;
-        }
-    }
-}
-static void x264_predict_8x16c_dc_top_c( pixel *src )
-{
-    int dc0 = 0, dc1 = 0;
-
-    for(int  x = 0; x < 4; x++ )
-    {
-        dc0 += src[x     - FDEC_STRIDE];
-        dc1 += src[x + 4 - FDEC_STRIDE];
-    }
-    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
-
-    for( int y = 0; y < 16; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc0splat;
-        MPIXEL_X4( src+4 ) = dc1splat;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x16c_dc_c( pixel *src )
-{
-    int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0;
-
-    /*
-          s0 s1
-       s2
-       s3
-       s4
-       s5
-    */
-    for( int i = 0; i < 4; i++ )
-    {
-        s0 += src[i+0 - FDEC_STRIDE];
-        s1 += src[i+4 - FDEC_STRIDE];
-        s2 += src[-1 + (i+0)  * FDEC_STRIDE];
-        s3 += src[-1 + (i+4)  * FDEC_STRIDE];
-        s4 += src[-1 + (i+8)  * FDEC_STRIDE];
-        s5 += src[-1 + (i+12) * FDEC_STRIDE];
-    }
-    /*
-       dc0 dc1
-       dc2 dc3
-       dc4 dc5
-       dc6 dc7
-    */
-    pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
-    pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
-    pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
-    pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );
-    pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 );
-    pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 );
-    pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 );
-    pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 );
-
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc0;
-        MPIXEL_X4( src+4 ) = dc1;
-        src += FDEC_STRIDE;
-    }
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc2;
-        MPIXEL_X4( src+4 ) = dc3;
-        src += FDEC_STRIDE;
-    }
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc4;
-        MPIXEL_X4( src+4 ) = dc5;
-        src += FDEC_STRIDE;
-    }
-    for( int y = 0; y < 4; y++ )
-    {
-        MPIXEL_X4( src+0 ) = dc6;
-        MPIXEL_X4( src+4 ) = dc7;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x16c_h_c( pixel *src )
-{
-    for( int i = 0; i < 16; i++ )
-    {
-        pixel4 v = PIXEL_SPLAT_X4( src[-1] );
-        MPIXEL_X4( src+0 ) = v;
-        MPIXEL_X4( src+4 ) = v;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x16c_v_c( pixel *src )
-{
-    pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
-    pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );
-
-    for( int i = 0; i < 16; i++ )
-    {
-        MPIXEL_X4( src+0 ) = v0;
-        MPIXEL_X4( src+4 ) = v1;
-        src += FDEC_STRIDE;
-    }
-}
-void x264_predict_8x16c_p_c( pixel *src )
-{
-    int H = 0;
-    int V = 0;
-
-    for( int i = 0; i < 4; i++ )
-        H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );
-    for( int i = 0; i < 8; i++ )
-        V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
-
-    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
-    int b = ( 17 * H + 16 ) >> 5;
-    int c = ( 5 * V + 32 ) >> 6;
-    int i00 = a -3*b -7*c + 16;
-
-    for( int y = 0; y < 16; y++ )
-    {
-        int pix = i00;
-        for( int x = 0; x < 8; x++ )
-        {
-            src[x] = x264_clip_pixel( pix>>5 );
-            pix += b;
-        }
-        src += FDEC_STRIDE;
-        i00 += c;
-    }
-}
-
-/****************************************************************************
- * 4x4 prediction for intra luma block
- ****************************************************************************/
-
-#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC_X4(x,y) MPIXEL_X4( &SRC(x,y) )
-
-#define PREDICT_4x4_DC(v)\
-    SRC_X4(0,0) = SRC_X4(0,1) = SRC_X4(0,2) = SRC_X4(0,3) = v;
-
-static void x264_predict_4x4_dc_128_c( pixel *src )
-{
-    PREDICT_4x4_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
-}
-static void x264_predict_4x4_dc_left_c( pixel *src )
-{
-    pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) + 2) >> 2 );
-    PREDICT_4x4_DC( dc );
-}
-static void x264_predict_4x4_dc_top_c( pixel *src )
-{
-    pixel4 dc = PIXEL_SPLAT_X4( (SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 2) >> 2 );
-    PREDICT_4x4_DC( dc );
-}
-void x264_predict_4x4_dc_c( pixel *src )
-{
-    pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) +
-                                 SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 4) >> 3 );
-    PREDICT_4x4_DC( dc );
-}
-void x264_predict_4x4_h_c( pixel *src )
-{
-    SRC_X4(0,0) = PIXEL_SPLAT_X4( SRC(-1,0) );
-    SRC_X4(0,1) = PIXEL_SPLAT_X4( SRC(-1,1) );
-    SRC_X4(0,2) = PIXEL_SPLAT_X4( SRC(-1,2) );
-    SRC_X4(0,3) = PIXEL_SPLAT_X4( SRC(-1,3) );
-}
-void x264_predict_4x4_v_c( pixel *src )
-{
-    PREDICT_4x4_DC(SRC_X4(0,-1));
-}
-
-#define PREDICT_4x4_LOAD_LEFT\
-    int l0 = SRC(-1,0);\
-    int l1 = SRC(-1,1);\
-    int l2 = SRC(-1,2);\
-    UNUSED int l3 = SRC(-1,3);
-
-#define PREDICT_4x4_LOAD_TOP\
-    int t0 = SRC(0,-1);\
-    int t1 = SRC(1,-1);\
-    int t2 = SRC(2,-1);\
-    UNUSED int t3 = SRC(3,-1);
-
-#define PREDICT_4x4_LOAD_TOP_RIGHT\
-    int t4 = SRC(4,-1);\
-    int t5 = SRC(5,-1);\
-    int t6 = SRC(6,-1);\
-    UNUSED int t7 = SRC(7,-1);
-
-#define F1(a,b)   (((a)+(b)+1)>>1)
-#define F2(a,b,c) (((a)+2*(b)+(c)+2)>>2)
-
-static void x264_predict_4x4_ddl_c( pixel *src )
-{
-    PREDICT_4x4_LOAD_TOP
-    PREDICT_4x4_LOAD_TOP_RIGHT
-    SRC(0,0)= F2(t0,t1,t2);
-    SRC(1,0)=SRC(0,1)= F2(t1,t2,t3);
-    SRC(2,0)=SRC(1,1)=SRC(0,2)= F2(t2,t3,t4);
-    SRC(3,0)=SRC(2,1)=SRC(1,2)=SRC(0,3)= F2(t3,t4,t5);
-    SRC(3,1)=SRC(2,2)=SRC(1,3)= F2(t4,t5,t6);
-    SRC(3,2)=SRC(2,3)= F2(t5,t6,t7);
-    SRC(3,3)= F2(t6,t7,t7);
-}
-static void x264_predict_4x4_ddr_c( pixel *src )
-{
-    int lt = SRC(-1,-1);
-    PREDICT_4x4_LOAD_LEFT
-    PREDICT_4x4_LOAD_TOP
-    SRC(3,0)= F2(t3,t2,t1);
-    SRC(2,0)=SRC(3,1)= F2(t2,t1,t0);
-    SRC(1,0)=SRC(2,1)=SRC(3,2)= F2(t1,t0,lt);
-    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)= F2(t0,lt,l0);
-    SRC(0,1)=SRC(1,2)=SRC(2,3)= F2(lt,l0,l1);
-    SRC(0,2)=SRC(1,3)= F2(l0,l1,l2);
-    SRC(0,3)= F2(l1,l2,l3);
-}
-
-static void x264_predict_4x4_vr_c( pixel *src )
-{
-    int lt = SRC(-1,-1);
-    PREDICT_4x4_LOAD_LEFT
-    PREDICT_4x4_LOAD_TOP
-    SRC(0,3)= F2(l2,l1,l0);
-    SRC(0,2)= F2(l1,l0,lt);
-    SRC(0,1)=SRC(1,3)= F2(l0,lt,t0);
-    SRC(0,0)=SRC(1,2)= F1(lt,t0);
-    SRC(1,1)=SRC(2,3)= F2(lt,t0,t1);
-    SRC(1,0)=SRC(2,2)= F1(t0,t1);
-    SRC(2,1)=SRC(3,3)= F2(t0,t1,t2);
-    SRC(2,0)=SRC(3,2)= F1(t1,t2);
-    SRC(3,1)= F2(t1,t2,t3);
-    SRC(3,0)= F1(t2,t3);
-}
-
-static void x264_predict_4x4_hd_c( pixel *src )
-{
-    int lt= SRC(-1,-1);
-    PREDICT_4x4_LOAD_LEFT
-    PREDICT_4x4_LOAD_TOP
-    SRC(0,3)= F1(l2,l3);
-    SRC(1,3)= F2(l1,l2,l3);
-    SRC(0,2)=SRC(2,3)= F1(l1,l2);
-    SRC(1,2)=SRC(3,3)= F2(l0,l1,l2);
-    SRC(0,1)=SRC(2,2)= F1(l0,l1);
-    SRC(1,1)=SRC(3,2)= F2(lt,l0,l1);
-    SRC(0,0)=SRC(2,1)= F1(lt,l0);
-    SRC(1,0)=SRC(3,1)= F2(t0,lt,l0);
-    SRC(2,0)= F2(t1,t0,lt);
-    SRC(3,0)= F2(t2,t1,t0);
-}
-
-static void x264_predict_4x4_vl_c( pixel *src )
-{
-    PREDICT_4x4_LOAD_TOP
-    PREDICT_4x4_LOAD_TOP_RIGHT
-    SRC(0,0)= F1(t0,t1);
-    SRC(0,1)= F2(t0,t1,t2);
-    SRC(1,0)=SRC(0,2)= F1(t1,t2);
-    SRC(1,1)=SRC(0,3)= F2(t1,t2,t3);
-    SRC(2,0)=SRC(1,2)= F1(t2,t3);
-    SRC(2,1)=SRC(1,3)= F2(t2,t3,t4);
-    SRC(3,0)=SRC(2,2)= F1(t3,t4);
-    SRC(3,1)=SRC(2,3)= F2(t3,t4,t5);
-    SRC(3,2)= F1(t4,t5);
-    SRC(3,3)= F2(t4,t5,t6);
-}
-
-static void x264_predict_4x4_hu_c( pixel *src )
-{
-    PREDICT_4x4_LOAD_LEFT
-    SRC(0,0)= F1(l0,l1);
-    SRC(1,0)= F2(l0,l1,l2);
-    SRC(2,0)=SRC(0,1)= F1(l1,l2);
-    SRC(3,0)=SRC(1,1)= F2(l1,l2,l3);
-    SRC(2,1)=SRC(0,2)= F1(l2,l3);
-    SRC(3,1)=SRC(1,2)= F2(l2,l3,l3);
-    SRC(3,2)=SRC(1,3)=SRC(0,3)=
-    SRC(2,2)=SRC(2,3)=SRC(3,3)= l3;
-}
-
-/****************************************************************************
- * 8x8 prediction for intra luma block
- ****************************************************************************/
-
-#define PL(y) \
-    edge[14-y] = F2(SRC(-1,y-1), SRC(-1,y), SRC(-1,y+1));
-#define PT(x) \
-    edge[16+x] = F2(SRC(x-1,-1), SRC(x,-1), SRC(x+1,-1));
-
-static void x264_predict_8x8_filter_c( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
-{
-    /* edge[7..14] = l7..l0
-     * edge[15] = lt
-     * edge[16..31] = t0 .. t15
-     * edge[32] = t15 */
-
-    int have_lt = i_neighbor & MB_TOPLEFT;
-    if( i_filters & MB_LEFT )
-    {
-        edge[15] = (SRC(0,-1) + 2*SRC(-1,-1) + SRC(-1,0) + 2) >> 2;
-        edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0))
-                 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;
-        PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)
-        edge[6] =
-        edge[7] = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
-    }
-
-    if( i_filters & MB_TOP )
-    {
-        int have_tr = i_neighbor & MB_TOPRIGHT;
-        edge[16] = ((have_lt ? SRC(-1,-1) : SRC(0,-1))
-                 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2;
-        PT(1) PT(2) PT(3) PT(4) PT(5) PT(6)
-        edge[23] = (SRC(6,-1) + 2*SRC(7,-1)
-                 + (have_tr ? SRC(8,-1) : SRC(7,-1)) + 2) >> 2;
-
-        if( i_filters & MB_TOPRIGHT )
-        {
-            if( have_tr )
-            {
-                PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14)
-                edge[31] =
-                edge[32] = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2;
-            }
-            else
-            {
-                MPIXEL_X4( edge+24 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
-                MPIXEL_X4( edge+28 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
-                edge[32] = SRC(7,-1);
-            }
-        }
-    }
-}
-
-#undef PL
-#undef PT
-
-#define PL(y) \
-    UNUSED int l##y = edge[14-y];
-#define PT(x) \
-    UNUSED int t##x = edge[16+x];
-#define PREDICT_8x8_LOAD_TOPLEFT \
-    int lt = edge[15];
-#define PREDICT_8x8_LOAD_LEFT \
-    PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
-#define PREDICT_8x8_LOAD_TOP \
-    PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
-#define PREDICT_8x8_LOAD_TOPRIGHT \
-    PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14) PT(15)
-
-#define PREDICT_8x8_DC(v) \
-    for( int y = 0; y < 8; y++ ) { \
-        MPIXEL_X4( src+0 ) = v; \
-        MPIXEL_X4( src+4 ) = v; \
-        src += FDEC_STRIDE; \
-    }
-
-static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
-}
-static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_LEFT
-    pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3 );
-    PREDICT_8x8_DC( dc );
-}
-static void x264_predict_8x8_dc_top_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_TOP
-    pixel4 dc = PIXEL_SPLAT_X4( (t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3 );
-    PREDICT_8x8_DC( dc );
-}
-void x264_predict_8x8_dc_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_LEFT
-    PREDICT_8x8_LOAD_TOP
-    pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4 );
-    PREDICT_8x8_DC( dc );
-}
-void x264_predict_8x8_h_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_LEFT
-#define ROW(y) MPIXEL_X4( src+y*FDEC_STRIDE+0 ) =\
-               MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = PIXEL_SPLAT_X4( l##y );
-    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
-#undef ROW
-}
-void x264_predict_8x8_v_c( pixel *src, pixel edge[36] )
-{
-    pixel4 top[2] = { MPIXEL_X4( edge+16 ),
-                      MPIXEL_X4( edge+20 ) };
-    for( int y = 0; y < 8; y++ )
-    {
-        MPIXEL_X4( src+y*FDEC_STRIDE+0 ) = top[0];
-        MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top[1];
-    }
-}
-static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_TOP
-    PREDICT_8x8_LOAD_TOPRIGHT
-    SRC(0,0)= F2(t0,t1,t2);
-    SRC(0,1)=SRC(1,0)= F2(t1,t2,t3);
-    SRC(0,2)=SRC(1,1)=SRC(2,0)= F2(t2,t3,t4);
-    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= F2(t3,t4,t5);
-    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= F2(t4,t5,t6);
-    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= F2(t5,t6,t7);
-    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= F2(t6,t7,t8);
-    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= F2(t7,t8,t9);
-    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= F2(t8,t9,t10);
-    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= F2(t9,t10,t11);
-    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= F2(t10,t11,t12);
-    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= F2(t11,t12,t13);
-    SRC(5,7)=SRC(6,6)=SRC(7,5)= F2(t12,t13,t14);
-    SRC(6,7)=SRC(7,6)= F2(t13,t14,t15);
-    SRC(7,7)= F2(t14,t15,t15);
-}
-static void x264_predict_8x8_ddr_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_TOP
-    PREDICT_8x8_LOAD_LEFT
-    PREDICT_8x8_LOAD_TOPLEFT
-    SRC(0,7)= F2(l7,l6,l5);
-    SRC(0,6)=SRC(1,7)= F2(l6,l5,l4);
-    SRC(0,5)=SRC(1,6)=SRC(2,7)= F2(l5,l4,l3);
-    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= F2(l4,l3,l2);
-    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= F2(l3,l2,l1);
-    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= F2(l2,l1,l0);
-    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= F2(l1,l0,lt);
-    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= F2(l0,lt,t0);
-    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= F2(lt,t0,t1);
-    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= F2(t0,t1,t2);
-    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= F2(t1,t2,t3);
-    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= F2(t2,t3,t4);
-    SRC(5,0)=SRC(6,1)=SRC(7,2)= F2(t3,t4,t5);
-    SRC(6,0)=SRC(7,1)= F2(t4,t5,t6);
-    SRC(7,0)= F2(t5,t6,t7);
-
-}
-static void x264_predict_8x8_vr_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_TOP
-    PREDICT_8x8_LOAD_LEFT
-    PREDICT_8x8_LOAD_TOPLEFT
-    SRC(0,6)= F2(l5,l4,l3);
-    SRC(0,7)= F2(l6,l5,l4);
-    SRC(0,4)=SRC(1,6)= F2(l3,l2,l1);
-    SRC(0,5)=SRC(1,7)= F2(l4,l3,l2);
-    SRC(0,2)=SRC(1,4)=SRC(2,6)= F2(l1,l0,lt);
-    SRC(0,3)=SRC(1,5)=SRC(2,7)= F2(l2,l1,l0);
-    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= F2(l0,lt,t0);
-    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= F1(lt,t0);
-    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= F2(lt,t0,t1);
-    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= F1(t0,t1);
-    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= F2(t0,t1,t2);
-    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= F1(t1,t2);
-    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= F2(t1,t2,t3);
-    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= F1(t2,t3);
-    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= F2(t2,t3,t4);
-    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= F1(t3,t4);
-    SRC(5,1)=SRC(6,3)=SRC(7,5)= F2(t3,t4,t5);
-    SRC(5,0)=SRC(6,2)=SRC(7,4)= F1(t4,t5);
-    SRC(6,1)=SRC(7,3)= F2(t4,t5,t6);
-    SRC(6,0)=SRC(7,2)= F1(t5,t6);
-    SRC(7,1)= F2(t5,t6,t7);
-    SRC(7,0)= F1(t6,t7);
-}
-static void x264_predict_8x8_hd_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_TOP
-    PREDICT_8x8_LOAD_LEFT
-    PREDICT_8x8_LOAD_TOPLEFT
-    int p1 = pack_pixel_1to2(F1(l6,l7), F2(l5,l6,l7));
-    int p2 = pack_pixel_1to2(F1(l5,l6), F2(l4,l5,l6));
-    int p3 = pack_pixel_1to2(F1(l4,l5), F2(l3,l4,l5));
-    int p4 = pack_pixel_1to2(F1(l3,l4), F2(l2,l3,l4));
-    int p5 = pack_pixel_1to2(F1(l2,l3), F2(l1,l2,l3));
-    int p6 = pack_pixel_1to2(F1(l1,l2), F2(l0,l1,l2));
-    int p7 = pack_pixel_1to2(F1(l0,l1), F2(lt,l0,l1));
-    int p8 = pack_pixel_1to2(F1(lt,l0), F2(l0,lt,t0));
-    int p9 = pack_pixel_1to2(F2(t1,t0,lt), F2(t2,t1,t0));
-    int p10 = pack_pixel_1to2(F2(t3,t2,t1), F2(t4,t3,t2));
-    int p11 = pack_pixel_1to2(F2(t5,t4,t3), F2(t6,t5,t4));
-    SRC_X4(0,7)= pack_pixel_2to4(p1,p2);
-    SRC_X4(0,6)= pack_pixel_2to4(p2,p3);
-    SRC_X4(4,7)=SRC_X4(0,5)= pack_pixel_2to4(p3,p4);
-    SRC_X4(4,6)=SRC_X4(0,4)= pack_pixel_2to4(p4,p5);
-    SRC_X4(4,5)=SRC_X4(0,3)= pack_pixel_2to4(p5,p6);
-    SRC_X4(4,4)=SRC_X4(0,2)= pack_pixel_2to4(p6,p7);
-    SRC_X4(4,3)=SRC_X4(0,1)= pack_pixel_2to4(p7,p8);
-    SRC_X4(4,2)=SRC_X4(0,0)= pack_pixel_2to4(p8,p9);
-    SRC_X4(4,1)= pack_pixel_2to4(p9,p10);
-    SRC_X4(4,0)= pack_pixel_2to4(p10,p11);
-}
-static void x264_predict_8x8_vl_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_TOP
-    PREDICT_8x8_LOAD_TOPRIGHT
-    SRC(0,0)= F1(t0,t1);
-    SRC(0,1)= F2(t0,t1,t2);
-    SRC(0,2)=SRC(1,0)= F1(t1,t2);
-    SRC(0,3)=SRC(1,1)= F2(t1,t2,t3);
-    SRC(0,4)=SRC(1,2)=SRC(2,0)= F1(t2,t3);
-    SRC(0,5)=SRC(1,3)=SRC(2,1)= F2(t2,t3,t4);
-    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= F1(t3,t4);
-    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= F2(t3,t4,t5);
-    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= F1(t4,t5);
-    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= F2(t4,t5,t6);
-    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= F1(t5,t6);
-    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= F2(t5,t6,t7);
-    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= F1(t6,t7);
-    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= F2(t6,t7,t8);
-    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= F1(t7,t8);
-    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= F2(t7,t8,t9);
-    SRC(5,6)=SRC(6,4)=SRC(7,2)= F1(t8,t9);
-    SRC(5,7)=SRC(6,5)=SRC(7,3)= F2(t8,t9,t10);
-    SRC(6,6)=SRC(7,4)= F1(t9,t10);
-    SRC(6,7)=SRC(7,5)= F2(t9,t10,t11);
-    SRC(7,6)= F1(t10,t11);
-    SRC(7,7)= F2(t10,t11,t12);
-}
-static void x264_predict_8x8_hu_c( pixel *src, pixel edge[36] )
-{
-    PREDICT_8x8_LOAD_LEFT
-    int p1 = pack_pixel_1to2(F1(l0,l1), F2(l0,l1,l2));
-    int p2 = pack_pixel_1to2(F1(l1,l2), F2(l1,l2,l3));
-    int p3 = pack_pixel_1to2(F1(l2,l3), F2(l2,l3,l4));
-    int p4 = pack_pixel_1to2(F1(l3,l4), F2(l3,l4,l5));
-    int p5 = pack_pixel_1to2(F1(l4,l5), F2(l4,l5,l6));
-    int p6 = pack_pixel_1to2(F1(l5,l6), F2(l5,l6,l7));
-    int p7 = pack_pixel_1to2(F1(l6,l7), F2(l6,l7,l7));
-    int p8 = pack_pixel_1to2(l7,l7);
-    SRC_X4(0,0)= pack_pixel_2to4(p1,p2);
-    SRC_X4(0,1)= pack_pixel_2to4(p2,p3);
-    SRC_X4(4,0)=SRC_X4(0,2)= pack_pixel_2to4(p3,p4);
-    SRC_X4(4,1)=SRC_X4(0,3)= pack_pixel_2to4(p4,p5);
-    SRC_X4(4,2)=SRC_X4(0,4)= pack_pixel_2to4(p5,p6);
-    SRC_X4(4,3)=SRC_X4(0,5)= pack_pixel_2to4(p6,p7);
-    SRC_X4(4,4)=SRC_X4(0,6)= pack_pixel_2to4(p7,p8);
-    SRC_X4(4,5)=SRC_X4(4,6)= SRC_X4(0,7) = SRC_X4(4,7) = pack_pixel_2to4(p8,p8);
-}
-
-/****************************************************************************
- * Exported functions:
- ****************************************************************************/
-void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
-{
-    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_c;
-    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_c;
-    pf[I_PRED_16x16_DC]     = x264_predict_16x16_dc_c;
-    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_c;
-    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_c;
-    pf[I_PRED_16x16_DC_TOP ]= x264_predict_16x16_dc_top_c;
-    pf[I_PRED_16x16_DC_128 ]= x264_predict_16x16_dc_128_c;
-
-#if HAVE_MMX
-    x264_predict_16x16_init_mmx( cpu, pf );
-#endif
-
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-        x264_predict_16x16_init_altivec( pf );
-#endif
-
-#if HAVE_ARMV6
-    x264_predict_16x16_init_arm( cpu, pf );
-#endif
-
-#if ARCH_AARCH64
-    x264_predict_16x16_init_aarch64( cpu, pf );
-#endif
-
-#if !HIGH_BIT_DEPTH
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        pf[I_PRED_16x16_V ]     = x264_intra_predict_vert_16x16_msa;
-        pf[I_PRED_16x16_H ]     = x264_intra_predict_hor_16x16_msa;
-        pf[I_PRED_16x16_DC]     = x264_intra_predict_dc_16x16_msa;
-        pf[I_PRED_16x16_P ]     = x264_intra_predict_plane_16x16_msa;
-        pf[I_PRED_16x16_DC_LEFT]= x264_intra_predict_dc_left_16x16_msa;
-        pf[I_PRED_16x16_DC_TOP ]= x264_intra_predict_dc_top_16x16_msa;
-        pf[I_PRED_16x16_DC_128 ]= x264_intra_predict_dc_128_16x16_msa;
-    }
-#endif
-#endif
-}
-
-void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
-{
-    pf[I_PRED_CHROMA_V ]     = x264_predict_8x8c_v_c;
-    pf[I_PRED_CHROMA_H ]     = x264_predict_8x8c_h_c;
-    pf[I_PRED_CHROMA_DC]     = x264_predict_8x8c_dc_c;
-    pf[I_PRED_CHROMA_P ]     = x264_predict_8x8c_p_c;
-    pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x8c_dc_left_c;
-    pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x8c_dc_top_c;
-    pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x8c_dc_128_c;
-
-#if HAVE_MMX
-    x264_predict_8x8c_init_mmx( cpu, pf );
-#endif
-
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-        x264_predict_8x8c_init_altivec( pf );
-#endif
-
-#if HAVE_ARMV6
-    x264_predict_8x8c_init_arm( cpu, pf );
-#endif
-
-#if ARCH_AARCH64
-    x264_predict_8x8c_init_aarch64( cpu, pf );
-#endif
-
-#if !HIGH_BIT_DEPTH
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        pf[I_PRED_CHROMA_P ]     = x264_intra_predict_plane_8x8_msa;
-    }
-#endif
-#endif
-}
-
-void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
-{
-    pf[I_PRED_CHROMA_V ]     = x264_predict_8x16c_v_c;
-    pf[I_PRED_CHROMA_H ]     = x264_predict_8x16c_h_c;
-    pf[I_PRED_CHROMA_DC]     = x264_predict_8x16c_dc_c;
-    pf[I_PRED_CHROMA_P ]     = x264_predict_8x16c_p_c;
-    pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c;
-    pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c;
-    pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c;
-
-#if HAVE_MMX
-    x264_predict_8x16c_init_mmx( cpu, pf );
-#endif
-
-#if HAVE_ARMV6
-    x264_predict_8x16c_init_arm( cpu, pf );
-#endif
-
-#if ARCH_AARCH64
-    x264_predict_8x16c_init_aarch64( cpu, pf );
-#endif
-}
-
-void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
-{
-    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_c;
-    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_c;
-    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_c;
-    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_c;
-    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_c;
-    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_c;
-    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_c;
-    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_c;
-    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_c;
-    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_c;
-    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_c;
-    pf[I_PRED_8x8_DC_128] = x264_predict_8x8_dc_128_c;
-    *predict_filter       = x264_predict_8x8_filter_c;
-
-#if HAVE_MMX
-    x264_predict_8x8_init_mmx( cpu, pf, predict_filter );
-#endif
-
-#if HAVE_ARMV6
-    x264_predict_8x8_init_arm( cpu, pf, predict_filter );
-#endif
-
-#if ARCH_AARCH64
-    x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
-#endif
-
-#if !HIGH_BIT_DEPTH
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        pf[I_PRED_8x8_DDL]    = x264_intra_predict_ddl_8x8_msa;
-    }
-#endif
-#endif
-}
-
-void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
-{
-    pf[I_PRED_4x4_V]      = x264_predict_4x4_v_c;
-    pf[I_PRED_4x4_H]      = x264_predict_4x4_h_c;
-    pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_c;
-    pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_c;
-    pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_c;
-    pf[I_PRED_4x4_VR]     = x264_predict_4x4_vr_c;
-    pf[I_PRED_4x4_HD]     = x264_predict_4x4_hd_c;
-    pf[I_PRED_4x4_VL]     = x264_predict_4x4_vl_c;
-    pf[I_PRED_4x4_HU]     = x264_predict_4x4_hu_c;
-    pf[I_PRED_4x4_DC_LEFT]= x264_predict_4x4_dc_left_c;
-    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_c;
-    pf[I_PRED_4x4_DC_128] = x264_predict_4x4_dc_128_c;
-
-#if HAVE_MMX
-    x264_predict_4x4_init_mmx( cpu, pf );
-#endif
-
-#if HAVE_ARMV6
-    x264_predict_4x4_init_arm( cpu, pf );
-#endif
-
-#if ARCH_AARCH64
-    x264_predict_4x4_init_aarch64( cpu, pf );
-#endif
-}
-
diff --git a/android/src/main/libenc/jni/libx264/common/predict.h b/android/src/main/libenc/jni/libx264/common/predict.h
deleted file mode 100755
index 9ec9d1d..0000000
--- a/android/src/main/libenc/jni/libx264/common/predict.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*****************************************************************************
- * predict.h: intra prediction
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_PREDICT_H
-#define X264_PREDICT_H
-
-typedef void (*x264_predict_t)( pixel *src );
-typedef void (*x264_predict8x8_t)( pixel *src, pixel edge[36] );
-typedef void (*x264_predict_8x8_filter_t) ( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
-
-enum intra_chroma_pred_e
-{
-    I_PRED_CHROMA_DC = 0,
-    I_PRED_CHROMA_H  = 1,
-    I_PRED_CHROMA_V  = 2,
-    I_PRED_CHROMA_P  = 3,
-
-    I_PRED_CHROMA_DC_LEFT = 4,
-    I_PRED_CHROMA_DC_TOP  = 5,
-    I_PRED_CHROMA_DC_128  = 6
-};
-static const uint8_t x264_mb_chroma_pred_mode_fix[7] =
-{
-    I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
-    I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
-};
-
-enum intra16x16_pred_e
-{
-    I_PRED_16x16_V  = 0,
-    I_PRED_16x16_H  = 1,
-    I_PRED_16x16_DC = 2,
-    I_PRED_16x16_P  = 3,
-
-    I_PRED_16x16_DC_LEFT = 4,
-    I_PRED_16x16_DC_TOP  = 5,
-    I_PRED_16x16_DC_128  = 6,
-};
-static const uint8_t x264_mb_pred_mode16x16_fix[7] =
-{
-    I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P,
-    I_PRED_16x16_DC,I_PRED_16x16_DC,I_PRED_16x16_DC
-};
-
-enum intra4x4_pred_e
-{
-    I_PRED_4x4_V  = 0,
-    I_PRED_4x4_H  = 1,
-    I_PRED_4x4_DC = 2,
-    I_PRED_4x4_DDL= 3,
-    I_PRED_4x4_DDR= 4,
-    I_PRED_4x4_VR = 5,
-    I_PRED_4x4_HD = 6,
-    I_PRED_4x4_VL = 7,
-    I_PRED_4x4_HU = 8,
-
-    I_PRED_4x4_DC_LEFT = 9,
-    I_PRED_4x4_DC_TOP  = 10,
-    I_PRED_4x4_DC_128  = 11,
-};
-static const int8_t x264_mb_pred_mode4x4_fix[13] =
-{
-    -1,
-    I_PRED_4x4_V,   I_PRED_4x4_H,   I_PRED_4x4_DC,
-    I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR,
-    I_PRED_4x4_HD,  I_PRED_4x4_VL,  I_PRED_4x4_HU,
-    I_PRED_4x4_DC,  I_PRED_4x4_DC,  I_PRED_4x4_DC
-};
-#define x264_mb_pred_mode4x4_fix(t) x264_mb_pred_mode4x4_fix[(t)+1]
-
-/* must use the same numbering as intra4x4_pred_e */
-enum intra8x8_pred_e
-{
-    I_PRED_8x8_V  = 0,
-    I_PRED_8x8_H  = 1,
-    I_PRED_8x8_DC = 2,
-    I_PRED_8x8_DDL= 3,
-    I_PRED_8x8_DDR= 4,
-    I_PRED_8x8_VR = 5,
-    I_PRED_8x8_HD = 6,
-    I_PRED_8x8_VL = 7,
-    I_PRED_8x8_HU = 8,
-
-    I_PRED_8x8_DC_LEFT = 9,
-    I_PRED_8x8_DC_TOP  = 10,
-    I_PRED_8x8_DC_128  = 11,
-};
-
-void x264_predict_8x8_dc_c  ( pixel *src, pixel edge[36] );
-void x264_predict_8x8_h_c   ( pixel *src, pixel edge[36] );
-void x264_predict_8x8_v_c   ( pixel *src, pixel edge[36] );
-void x264_predict_4x4_dc_c  ( pixel *src );
-void x264_predict_4x4_h_c   ( pixel *src );
-void x264_predict_4x4_v_c   ( pixel *src );
-void x264_predict_16x16_dc_c( pixel *src );
-void x264_predict_16x16_h_c ( pixel *src );
-void x264_predict_16x16_v_c ( pixel *src );
-void x264_predict_16x16_p_c ( pixel *src );
-void x264_predict_8x8c_dc_c ( pixel *src );
-void x264_predict_8x8c_h_c  ( pixel *src );
-void x264_predict_8x8c_v_c  ( pixel *src );
-void x264_predict_8x8c_p_c  ( pixel *src );
-void x264_predict_8x16c_dc_c( pixel *src );
-void x264_predict_8x16c_h_c ( pixel *src );
-void x264_predict_8x16c_v_c ( pixel *src );
-void x264_predict_8x16c_p_c ( pixel *src );
-
-void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x8c_init  ( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x16c_init ( int cpu, x264_predict_t pf[7] );
-void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] );
-void x264_predict_8x8_init   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
-
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/quant.c b/android/src/main/libenc/jni/libx264/common/quant.c
deleted file mode 100755
index 312f7cd..0000000
--- a/android/src/main/libenc/jni/libx264/common/quant.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/*****************************************************************************
- * quant.c: quantization and level-run
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Christian Heine <sennindemokrit@gmx.net>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#if HAVE_MMX
-#include "x86/quant.h"
-#endif
-#if ARCH_PPC
-#   include "ppc/quant.h"
-#endif
-#if ARCH_ARM
-#   include "arm/quant.h"
-#endif
-#if ARCH_AARCH64
-#   include "aarch64/quant.h"
-#endif
-#if ARCH_MIPS
-#   include "mips/quant.h"
-#endif
-
-#define QUANT_ONE( coef, mf, f ) \
-{ \
-    if( (coef) > 0 ) \
-        (coef) = (f + (coef)) * (mf) >> 16; \
-    else \
-        (coef) = - ((f - (coef)) * (mf) >> 16); \
-    nz |= (coef); \
-}
-
-static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
-{
-    int nz = 0;
-    for( int i = 0; i < 64; i++ )
-        QUANT_ONE( dct[i], mf[i], bias[i] );
-    return !!nz;
-}
-
-static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
-{
-    int nz = 0;
-    for( int i = 0; i < 16; i++ )
-        QUANT_ONE( dct[i], mf[i], bias[i] );
-    return !!nz;
-}
-
-static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
-{
-    int nza = 0;
-    for( int j = 0; j < 4; j++ )
-    {
-        int nz = 0;
-        for( int i = 0; i < 16; i++ )
-            QUANT_ONE( dct[j][i], mf[i], bias[i] );
-        nza |= (!!nz)<<j;
-    }
-    return nza;
-}
-
-static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
-{
-    int nz = 0;
-    for( int i = 0; i < 16; i++ )
-        QUANT_ONE( dct[i], mf, bias );
-    return !!nz;
-}
-
-static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
-{
-    int nz = 0;
-    QUANT_ONE( dct[0], mf, bias );
-    QUANT_ONE( dct[1], mf, bias );
-    QUANT_ONE( dct[2], mf, bias );
-    QUANT_ONE( dct[3], mf, bias );
-    return !!nz;
-}
-
-#define DEQUANT_SHL( x ) \
-    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) << i_qbits
-
-#define DEQUANT_SHR( x ) \
-    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)
-
-static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
-{
-    const int i_mf = i_qp%6;
-    const int i_qbits = i_qp/6 - 4;
-
-    if( i_qbits >= 0 )
-    {
-        for( int i = 0; i < 16; i++ )
-            DEQUANT_SHL( i );
-    }
-    else
-    {
-        const int f = 1 << (-i_qbits-1);
-        for( int i = 0; i < 16; i++ )
-            DEQUANT_SHR( i );
-    }
-}
-
-static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
-{
-    const int i_mf = i_qp%6;
-    const int i_qbits = i_qp/6 - 6;
-
-    if( i_qbits >= 0 )
-    {
-        for( int i = 0; i < 64; i++ )
-            DEQUANT_SHL( i );
-    }
-    else
-    {
-        const int f = 1 << (-i_qbits-1);
-        for( int i = 0; i < 64; i++ )
-            DEQUANT_SHR( i );
-    }
-}
-
-static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
-{
-    const int i_qbits = i_qp/6 - 6;
-
-    if( i_qbits >= 0 )
-    {
-        const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
-        for( int i = 0; i < 16; i++ )
-            dct[i] *= i_dmf;
-    }
-    else
-    {
-        const int i_dmf = dequant_mf[i_qp%6][0];
-        const int f = 1 << (-i_qbits-1);
-        for( int i = 0; i < 16; i++ )
-            dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
-    }
-}
-
-#define IDCT_DEQUANT_2X4_START \
-    int a0 = dct[0] + dct[1]; \
-    int a1 = dct[2] + dct[3]; \
-    int a2 = dct[4] + dct[5]; \
-    int a3 = dct[6] + dct[7]; \
-    int a4 = dct[0] - dct[1]; \
-    int a5 = dct[2] - dct[3]; \
-    int a6 = dct[4] - dct[5]; \
-    int a7 = dct[6] - dct[7]; \
-    int b0 = a0 + a1; \
-    int b1 = a2 + a3; \
-    int b2 = a4 + a5; \
-    int b3 = a6 + a7; \
-    int b4 = a0 - a1; \
-    int b5 = a2 - a3; \
-    int b6 = a4 - a5; \
-    int b7 = a6 - a7;
-
-static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
-{
-    IDCT_DEQUANT_2X4_START
-    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
-    dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
-    dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
-    dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
-    dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
-    dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
-    dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
-    dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
-    dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
-}
-
-static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
-{
-    IDCT_DEQUANT_2X4_START
-    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
-    dct[0] = ((b0 + b1) * dmf + 32) >> 6;
-    dct[1] = ((b2 + b3) * dmf + 32) >> 6;
-    dct[2] = ((b0 - b1) * dmf + 32) >> 6;
-    dct[3] = ((b2 - b3) * dmf + 32) >> 6;
-    dct[4] = ((b4 - b5) * dmf + 32) >> 6;
-    dct[5] = ((b6 - b7) * dmf + 32) >> 6;
-    dct[6] = ((b4 + b5) * dmf + 32) >> 6;
-    dct[7] = ((b6 + b7) * dmf + 32) >> 6;
-}
-
-static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
-{
-    IDCT_DEQUANT_2X4_START
-    out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
-    out[1] = ((b2 + b3) * dmf + 2080) >> 6;
-    out[2] = ((b0 - b1) * dmf + 2080) >> 6;
-    out[3] = ((b2 - b3) * dmf + 2080) >> 6;
-    out[4] = ((b4 - b5) * dmf + 2080) >> 6;
-    out[5] = ((b6 - b7) * dmf + 2080) >> 6;
-    out[6] = ((b4 + b5) * dmf + 2080) >> 6;
-    out[7] = ((b6 + b7) * dmf + 2080) >> 6;
-}
-#undef IDCT_DEQUANT_2X4_START
-
-static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
-{
-    int d0 = dct[0] + dct[1];
-    int d1 = dct[2] + dct[3];
-    int d2 = dct[0] - dct[1];
-    int d3 = dct[2] - dct[3];
-    out[0] = ((d0 + d1) * dmf >> 5) + 32;
-    out[1] = ((d0 - d1) * dmf >> 5) + 32;
-    out[2] = ((d2 + d3) * dmf >> 5) + 32;
-    out[3] = ((d2 - d3) * dmf >> 5) + 32;
-}
-
-static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
-{
-    dctcoef out[8];
-
-    if( chroma422 )
-        optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
-    else
-        optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
-
-    int sum = 0;
-    for( int i = 0; i < (chroma422?8:4); i++ )
-        sum |= ref[i] ^ out[i];
-    return sum >> 6;
-}
-
-static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
-{
-    /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
-    dctcoef dct_orig[8];
-    int coeff, nz;
-
-    if( chroma422 )
-        optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
-    else
-        optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
-
-    /* If the DC coefficients already round to zero, terminate early. */
-    int sum = 0;
-    for( int i = 0; i < (chroma422?8:4); i++ )
-        sum |= dct_orig[i];
-    if( !(sum >> 6) )
-        return 0;
-
-    /* Start with the highest frequency coefficient... is this the best option? */
-    for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
-    {
-        int level = dct[coeff];
-        int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
-
-        while( level )
-        {
-            dct[coeff] = level - sign;
-            if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
-            {
-                nz = 1;
-                dct[coeff] = level;
-                break;
-            }
-            level -= sign;
-        }
-    }
-
-    return nz;
-}
-
-static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
-{
-    return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
-}
-
-static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
-{
-    return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
-}
-
-static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
-{
-    for( int i = 0; i < size; i++ )
-    {
-        int level = dct[i];
-        int sign = level>>31;
-        level = (level+sign)^sign;
-        sum[i] += level;
-        level -= offset[i];
-        dct[i] = level<0 ? 0 : (level^sign)-sign;
-    }
-}
-
-/* (ref: JVT-B118)
- * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
- * to 0 (low score means set it to null)
- * Used in inter macroblock (luma and chroma)
- *  luma: for a 8x8 block: if score < 4 -> null
- *        for the complete mb: if score < 6 -> null
- *  chroma: for the complete mb: if score < 7 -> null
- */
-
-const uint8_t x264_decimate_table4[16] =
-{
-    3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0
-};
-const uint8_t x264_decimate_table8[64] =
-{
-    3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
-    1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-};
-
-static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
-{
-    const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
-    int i_score = 0;
-    int idx = i_max - 1;
-
-    while( idx >= 0 && dct[idx] == 0 )
-        idx--;
-    while( idx >= 0 )
-    {
-        int i_run;
-
-        if( (unsigned)(dct[idx--] + 1) > 2 )
-            return 9;
-
-        i_run = 0;
-        while( idx >= 0 && dct[idx] == 0 )
-        {
-            idx--;
-            i_run++;
-        }
-        i_score += ds_table[i_run];
-    }
-
-    return i_score;
-}
-
-static int x264_decimate_score15( dctcoef *dct )
-{
-    return x264_decimate_score_internal( dct+1, 15 );
-}
-static int x264_decimate_score16( dctcoef *dct )
-{
-    return x264_decimate_score_internal( dct, 16 );
-}
-static int x264_decimate_score64( dctcoef *dct )
-{
-    return x264_decimate_score_internal( dct, 64 );
-}
-
-#define last(num)\
-static int x264_coeff_last##num( dctcoef *l )\
-{\
-    int i_last = num-1;\
-    while( i_last >= 0 && l[i_last] == 0 )\
-        i_last--;\
-    return i_last;\
-}
-
-last(4)
-last(8)
-last(15)
-last(16)
-last(64)
-
-#define level_run(num)\
-static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
-{\
-    int i_last = runlevel->last = x264_coeff_last##num(dct);\
-    int i_total = 0;\
-    int mask = 0;\
-    do\
-    {\
-        runlevel->level[i_total++] = dct[i_last];\
-        mask |= 1 << (i_last);\
-        while( --i_last >= 0 && dct[i_last] == 0 );\
-    } while( i_last >= 0 );\
-    runlevel->mask = mask;\
-    return i_total;\
-}
-
-level_run(4)
-level_run(8)
-level_run(15)
-level_run(16)
-
-#if ARCH_X86_64
-#define INIT_TRELLIS(cpu)\
-    pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\
-    pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\
-    pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\
-    pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\
-    pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\
-    pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu;
-#else
-#define INIT_TRELLIS(...)
-#endif
-
-void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
-{
-    pf->quant_8x8 = quant_8x8;
-    pf->quant_4x4 = quant_4x4;
-    pf->quant_4x4x4 = quant_4x4x4;
-    pf->quant_4x4_dc = quant_4x4_dc;
-    pf->quant_2x2_dc = quant_2x2_dc;
-
-    pf->dequant_4x4 = dequant_4x4;
-    pf->dequant_4x4_dc = dequant_4x4_dc;
-    pf->dequant_8x8 = dequant_8x8;
-
-    pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
-    pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;
-
-    pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
-    pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;
-
-    pf->denoise_dct = x264_denoise_dct;
-    pf->decimate_score15 = x264_decimate_score15;
-    pf->decimate_score16 = x264_decimate_score16;
-    pf->decimate_score64 = x264_decimate_score64;
-
-    pf->coeff_last4 = x264_coeff_last4;
-    pf->coeff_last8 = x264_coeff_last8;
-    pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
-    pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
-    pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
-    pf->coeff_level_run4 = x264_coeff_level_run4;
-    pf->coeff_level_run8 = x264_coeff_level_run8;
-    pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
-    pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
-
-#if HIGH_BIT_DEPTH
-#if HAVE_MMX
-    INIT_TRELLIS( sse2 );
-    if( cpu&X264_CPU_MMX2 )
-    {
-#if ARCH_X86
-        pf->denoise_dct = x264_denoise_dct_mmx;
-        pf->decimate_score15 = x264_decimate_score15_mmx2;
-        pf->decimate_score16 = x264_decimate_score16_mmx2;
-        pf->decimate_score64 = x264_decimate_score64_mmx2;
-        pf->coeff_last8 = x264_coeff_last8_mmx2;
-        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
-        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
-        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
-        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
-        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
-        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
-#endif
-        pf->coeff_last4 = x264_coeff_last4_mmx2;
-        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
-        if( cpu&X264_CPU_LZCNT )
-            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
-    }
-    if( cpu&X264_CPU_SSE2 )
-    {
-        pf->quant_4x4 = x264_quant_4x4_sse2;
-        pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
-        pf->quant_8x8 = x264_quant_8x8_sse2;
-        pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
-        pf->dequant_4x4 = x264_dequant_4x4_sse2;
-        pf->dequant_8x8 = x264_dequant_8x8_sse2;
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
-        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
-        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
-        pf->denoise_dct = x264_denoise_dct_sse2;
-        pf->decimate_score15 = x264_decimate_score15_sse2;
-        pf->decimate_score16 = x264_decimate_score16_sse2;
-        pf->decimate_score64 = x264_decimate_score64_sse2;
-        pf->coeff_last8 = x264_coeff_last8_sse2;
-        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
-        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
-        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
-        pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
-        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
-        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
-            pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
-            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
-            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
-            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
-            pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
-            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
-            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
-        }
-    }
-    if( cpu&X264_CPU_SSSE3 )
-    {
-        pf->quant_4x4 = x264_quant_4x4_ssse3;
-        pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
-        pf->quant_8x8 = x264_quant_8x8_ssse3;
-        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
-        pf->denoise_dct = x264_denoise_dct_ssse3;
-        pf->decimate_score15 = x264_decimate_score15_ssse3;
-        pf->decimate_score16 = x264_decimate_score16_ssse3;
-        pf->decimate_score64 = x264_decimate_score64_ssse3;
-        INIT_TRELLIS( ssse3 );
-    }
-    if( cpu&X264_CPU_SSE4 )
-    {
-        pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
-        pf->quant_4x4 = x264_quant_4x4_sse4;
-        pf->quant_4x4x4 = x264_quant_4x4x4_sse4;
-        pf->quant_8x8 = x264_quant_8x8_sse4;
-    }
-    if( cpu&X264_CPU_AVX )
-    {
-        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
-        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
-        pf->denoise_dct = x264_denoise_dct_avx;
-    }
-    if( cpu&X264_CPU_XOP )
-    {
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
-        if( h->param.i_cqm_preset != X264_CQM_FLAT )
-        {
-            pf->dequant_4x4 = x264_dequant_4x4_xop;
-            pf->dequant_8x8 = x264_dequant_8x8_xop;
-        }
-    }
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf->quant_4x4 = x264_quant_4x4_avx2;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
-        pf->quant_8x8 = x264_quant_8x8_avx2;
-        pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
-        pf->dequant_4x4 = x264_dequant_4x4_avx2;
-        pf->dequant_8x8 = x264_dequant_8x8_avx2;
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
-        pf->denoise_dct = x264_denoise_dct_avx2;
-        if( cpu&X264_CPU_LZCNT )
-            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
-    }
-#endif // HAVE_MMX
-#else // !HIGH_BIT_DEPTH
-#if HAVE_MMX
-    INIT_TRELLIS( sse2 );
-    if( cpu&X264_CPU_MMX )
-    {
-#if ARCH_X86
-        pf->dequant_4x4 = x264_dequant_4x4_mmx;
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
-        pf->dequant_8x8 = x264_dequant_8x8_mmx;
-        if( h->param.i_cqm_preset == X264_CQM_FLAT )
-        {
-            pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
-            pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
-        }
-        pf->denoise_dct = x264_denoise_dct_mmx;
-#endif
-    }
-
-    if( cpu&X264_CPU_MMX2 )
-    {
-        pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
-#if ARCH_X86
-        pf->quant_4x4 = x264_quant_4x4_mmx2;
-        pf->quant_8x8 = x264_quant_8x8_mmx2;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
-        pf->decimate_score15 = x264_decimate_score15_mmx2;
-        pf->decimate_score16 = x264_decimate_score16_mmx2;
-        pf->decimate_score64 = x264_decimate_score64_mmx2;
-        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
-        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
-        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
-        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
-        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
-#endif
-        pf->coeff_last4 = x264_coeff_last4_mmx2;
-        pf->coeff_last8 = x264_coeff_last8_mmx2;
-        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
-        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
-            pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
-            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
-            pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
-        }
-    }
-
-    if( cpu&X264_CPU_SSE2 )
-    {
-        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
-        pf->quant_4x4 = x264_quant_4x4_sse2;
-        pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
-        pf->quant_8x8 = x264_quant_8x8_sse2;
-        pf->dequant_4x4 = x264_dequant_4x4_sse2;
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
-        pf->dequant_8x8 = x264_dequant_8x8_sse2;
-        if( h->param.i_cqm_preset == X264_CQM_FLAT )
-        {
-            pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
-            pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
-        }
-        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
-        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
-        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
-        pf->denoise_dct = x264_denoise_dct_sse2;
-        pf->decimate_score15 = x264_decimate_score15_sse2;
-        pf->decimate_score16 = x264_decimate_score16_sse2;
-        pf->decimate_score64 = x264_decimate_score64_sse2;
-        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
-        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
-        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
-        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
-        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
-            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
-            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
-            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
-            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
-        }
-    }
-
-    if( cpu&X264_CPU_SSSE3 )
-    {
-        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
-        pf->quant_4x4 = x264_quant_4x4_ssse3;
-        pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
-        pf->quant_8x8 = x264_quant_8x8_ssse3;
-        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
-        pf->denoise_dct = x264_denoise_dct_ssse3;
-        pf->decimate_score15 = x264_decimate_score15_ssse3;
-        pf->decimate_score16 = x264_decimate_score16_ssse3;
-        pf->decimate_score64 = x264_decimate_score64_ssse3;
-        INIT_TRELLIS( ssse3 );
-        pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
-        pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
-        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
-        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
-            pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
-            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
-            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
-        }
-    }
-
-    if( cpu&X264_CPU_SSE4 )
-    {
-        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
-        pf->quant_4x4 = x264_quant_4x4_sse4;
-        pf->quant_8x8 = x264_quant_8x8_sse4;
-        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
-    }
-
-    if( cpu&X264_CPU_AVX )
-    {
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
-        if( h->param.i_cqm_preset != X264_CQM_FLAT )
-        {
-            pf->dequant_4x4 = x264_dequant_4x4_avx;
-            pf->dequant_8x8 = x264_dequant_8x8_avx;
-        }
-        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
-        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
-        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
-        pf->denoise_dct = x264_denoise_dct_avx;
-    }
-
-    if( cpu&X264_CPU_XOP )
-    {
-        if( h->param.i_cqm_preset != X264_CQM_FLAT )
-        {
-            pf->dequant_4x4 = x264_dequant_4x4_xop;
-            pf->dequant_8x8 = x264_dequant_8x8_xop;
-        }
-    }
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf->quant_4x4 = x264_quant_4x4_avx2;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
-        pf->quant_8x8 = x264_quant_8x8_avx2;
-        pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
-        pf->dequant_4x4 = x264_dequant_4x4_avx2;
-        pf->dequant_8x8 = x264_dequant_8x8_avx2;
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
-        if( h->param.i_cqm_preset == X264_CQM_FLAT )
-        {
-            pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
-            pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2;
-        }
-        pf->decimate_score64 = x264_decimate_score64_avx2;
-        pf->denoise_dct = x264_denoise_dct_avx2;
-        if( cpu&X264_CPU_LZCNT )
-        {
-            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
-            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
-            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
-        }
-    }
-#endif // HAVE_MMX
-
-#if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC )
-    {
-        pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
-        pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
-        pf->quant_4x4 = x264_quant_4x4_altivec;
-        pf->quant_8x8 = x264_quant_8x8_altivec;
-
-        pf->dequant_4x4 = x264_dequant_4x4_altivec;
-        pf->dequant_8x8 = x264_dequant_8x8_altivec;
-    }
-#endif
-
-#if HAVE_ARMV6
-    if( cpu&X264_CPU_ARMV6 )
-    {
-        pf->coeff_last4 = x264_coeff_last4_arm;
-        pf->coeff_last8 = x264_coeff_last8_arm;
-    }
-#endif
-#if HAVE_ARMV6 || ARCH_AARCH64
-    if( cpu&X264_CPU_NEON )
-    {
-        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
-        pf->quant_4x4      = x264_quant_4x4_neon;
-        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
-        pf->quant_4x4x4    = x264_quant_4x4x4_neon;
-        pf->quant_8x8      = x264_quant_8x8_neon;
-        pf->dequant_4x4    = x264_dequant_4x4_neon;
-        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
-        pf->dequant_8x8    = x264_dequant_8x8_neon;
-        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
-        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
-        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
-        pf->denoise_dct = x264_denoise_dct_neon;
-        pf->decimate_score15 = x264_decimate_score15_neon;
-        pf->decimate_score16 = x264_decimate_score16_neon;
-        pf->decimate_score64 = x264_decimate_score64_neon;
-    }
-#endif
-#if ARCH_AARCH64
-    if( cpu&X264_CPU_ARMV8 )
-    {
-        pf->coeff_last4 = x264_coeff_last4_aarch64;
-        pf->coeff_last8 = x264_coeff_last8_aarch64;
-        pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
-    }
-    if( cpu&X264_CPU_NEON )
-    {
-        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
-        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
-        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
-    }
-#endif
-
-#if HAVE_MSA
-    if( cpu&X264_CPU_MSA )
-    {
-        pf->quant_4x4      = x264_quant_4x4_msa;
-        pf->quant_4x4_dc   = x264_quant_4x4_dc_msa;
-        pf->quant_4x4x4    = x264_quant_4x4x4_msa;
-        pf->quant_8x8      = x264_quant_8x8_msa;
-        pf->dequant_4x4    = x264_dequant_4x4_msa;
-        pf->dequant_4x4_dc = x264_dequant_4x4_dc_msa;
-        pf->dequant_8x8    = x264_dequant_8x8_msa;
-        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_msa;
-        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa;
-    }
-#endif
-#endif // HIGH_BIT_DEPTH
-    pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
-    pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
-    pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
-    pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
-    pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];
-
-    pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
-    pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
-    pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
-    pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
-}
diff --git a/android/src/main/libenc/jni/libx264/common/quant.h b/android/src/main/libenc/jni/libx264/common/quant.h
deleted file mode 100755
index edb0f78..0000000
--- a/android/src/main/libenc/jni/libx264/common/quant.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*****************************************************************************
- * quant.h: quantization and level-run
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_QUANT_H
-#define X264_QUANT_H
-
-typedef struct
-{
-    int (*quant_8x8)  ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-    int (*quant_4x4)  ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-    int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
-    int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias );
-    int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
-
-    void (*dequant_8x8)( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
-    void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-    void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-
-    void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
-    void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
-
-    int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf );
-    int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf );
-
-    void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-
-    int (*decimate_score15)( dctcoef *dct );
-    int (*decimate_score16)( dctcoef *dct );
-    int (*decimate_score64)( dctcoef *dct );
-    int (*coeff_last[14])( dctcoef *dct );
-    int (*coeff_last4)( dctcoef *dct );
-    int (*coeff_last8)( dctcoef *dct );
-    int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel );
-    int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel );
-    int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel );
-
-#define TRELLIS_PARAMS const int *unquant_mf, const uint8_t *zigzag, int lambda2,\
-                       int last_nnz, dctcoef *coefs, dctcoef *quant_coefs, dctcoef *dct,\
-                       uint8_t *cabac_state_sig, uint8_t *cabac_state_last,\
-                       uint64_t level_state0, uint16_t level_state1
-    int (*trellis_cabac_4x4)( TRELLIS_PARAMS, int b_ac );
-    int (*trellis_cabac_8x8)( TRELLIS_PARAMS, int b_interlaced );
-    int (*trellis_cabac_4x4_psy)( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int psy_trellis );
-    int (*trellis_cabac_8x8_psy)( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int psy_trellis );
-    int (*trellis_cabac_dc)( TRELLIS_PARAMS, int num_coefs );
-    int (*trellis_cabac_chroma_422_dc)( TRELLIS_PARAMS );
-} x264_quant_function_t;
-
-void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/rectangle.c b/android/src/main/libenc/jni/libx264/common/rectangle.c
deleted file mode 100755
index 2e30c8a..0000000
--- a/android/src/main/libenc/jni/libx264/common/rectangle.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*****************************************************************************
- * rectangle.c: rectangle filling
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#define CACHE_FUNC(name,size,width,height)\
-static void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
-{\
-    x264_macroblock_cache_rect( target, width*size, height, size, val );\
-}
-
-#define CACHE_FUNCS(name,size)\
-CACHE_FUNC(name,size,4,4)\
-CACHE_FUNC(name,size,2,4)\
-CACHE_FUNC(name,size,4,2)\
-CACHE_FUNC(name,size,2,2)\
-CACHE_FUNC(name,size,2,1)\
-CACHE_FUNC(name,size,1,2)\
-CACHE_FUNC(name,size,1,1)\
-void (*x264_cache_##name##_func_table[10])(void *, uint32_t) =\
-{\
-    x264_macroblock_cache_##name##_1_1,\
-    x264_macroblock_cache_##name##_2_1,\
-    x264_macroblock_cache_##name##_1_2,\
-    x264_macroblock_cache_##name##_2_2,\
-    NULL,\
-    x264_macroblock_cache_##name##_4_2,\
-    NULL,\
-    x264_macroblock_cache_##name##_2_4,\
-    NULL,\
-    x264_macroblock_cache_##name##_4_4\
-};\
-
-CACHE_FUNCS(mv, 4)
-CACHE_FUNCS(mvd, 2)
-CACHE_FUNCS(ref, 1)
diff --git a/android/src/main/libenc/jni/libx264/common/rectangle.h b/android/src/main/libenc/jni/libx264/common/rectangle.h
deleted file mode 100755
index c8dd9f5..0000000
--- a/android/src/main/libenc/jni/libx264/common/rectangle.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*****************************************************************************
- * rectangle.h: rectangle filling
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Fiona Glaser <fiona@x264.com>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-/* This function should only be called with constant w / h / s arguments! */
-static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, int s, uint32_t v )
-{
-    uint8_t *d = dst;
-    uint16_t v2 = s == 2 ? v : v * 0x101;
-    uint32_t v4 = s == 4 ? v : s == 2 ? v * 0x10001 : v * 0x1010101;
-    uint64_t v8 = v4 + ((uint64_t)v4 << 32);
-    s *= 8;
-
-    if( w == 2 )
-    {
-        M16( d+s*0 ) = v2;
-        if( h == 1 ) return;
-        M16( d+s*1 ) = v2;
-        if( h == 2 ) return;
-        M16( d+s*2 ) = v2;
-        M16( d+s*3 ) = v2;
-    }
-    else if( w == 4 )
-    {
-        M32( d+s*0 ) = v4;
-        if( h == 1 ) return;
-        M32( d+s*1 ) = v4;
-        if( h == 2 ) return;
-        M32( d+s*2 ) = v4;
-        M32( d+s*3 ) = v4;
-    }
-    else if( w == 8 )
-    {
-        if( WORD_SIZE == 8 )
-        {
-            M64( d+s*0 ) = v8;
-            if( h == 1 ) return;
-            M64( d+s*1 ) = v8;
-            if( h == 2 ) return;
-            M64( d+s*2 ) = v8;
-            M64( d+s*3 ) = v8;
-        }
-        else
-        {
-            M32( d+s*0+0 ) = v4;
-            M32( d+s*0+4 ) = v4;
-            if( h == 1 ) return;
-            M32( d+s*1+0 ) = v4;
-            M32( d+s*1+4 ) = v4;
-            if( h == 2 ) return;
-            M32( d+s*2+0 ) = v4;
-            M32( d+s*2+4 ) = v4;
-            M32( d+s*3+0 ) = v4;
-            M32( d+s*3+4 ) = v4;
-        }
-    }
-    else if( w == 16 )
-    {
-        /* height 1, width 16 doesn't occur */
-        assert( h != 1 );
-#if HAVE_VECTOREXT && defined(__SSE__)
-        v4si v16 = {v,v,v,v};
-
-        M128( d+s*0+0 ) = (__m128)v16;
-        M128( d+s*1+0 ) = (__m128)v16;
-        if( h == 2 ) return;
-        M128( d+s*2+0 ) = (__m128)v16;
-        M128( d+s*3+0 ) = (__m128)v16;
-#else
-        if( WORD_SIZE == 8 )
-        {
-            do
-            {
-                M64( d+s*0+0 ) = v8;
-                M64( d+s*0+8 ) = v8;
-                M64( d+s*1+0 ) = v8;
-                M64( d+s*1+8 ) = v8;
-                h -= 2;
-                d += s*2;
-            } while( h );
-        }
-        else
-        {
-            do
-            {
-                M32( d+ 0 ) = v4;
-                M32( d+ 4 ) = v4;
-                M32( d+ 8 ) = v4;
-                M32( d+12 ) = v4;
-                d += s;
-            } while( --h );
-        }
-#endif
-    }
-    else
-        assert(0);
-}
-
-extern void (*x264_cache_mv_func_table[10])(void *, uint32_t);\
-extern void (*x264_cache_mvd_func_table[10])(void *, uint32_t);\
-extern void (*x264_cache_ref_func_table[10])(void *, uint32_t);\
-
-#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
-static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
-{
-    void *mv_cache = &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y];
-    if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
-        x264_cache_mv_func_table[width + (height<<1)-3]( mv_cache, mv );
-    else
-        x264_macroblock_cache_rect( mv_cache, width*4, height, 4, mv );
-}
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mvd )
-{
-    void *mvd_cache = &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y];
-    if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
-        x264_cache_mvd_func_table[width + (height<<1)-3]( mvd_cache, mvd );
-    else
-        x264_macroblock_cache_rect( mvd_cache, width*2, height, 2, mvd );
-}
-static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
-{
-    void *ref_cache = &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y];
-    if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
-        x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, ref );
-    else
-        x264_macroblock_cache_rect( ref_cache, width, height, 1, ref );
-}
-static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
-{
-    x264_macroblock_cache_rect( &h->mb.cache.skip[X264_SCAN8_0+x+8*y], width, height, 1, b_skip );
-}
-static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x, int y, int i_mode )
-{
-    x264_macroblock_cache_rect( &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y], 2, 2, 1, i_mode );
-}
diff --git a/android/src/main/libenc/jni/libx264/common/set.c b/android/src/main/libenc/jni/libx264/common/set.c
deleted file mode 100755
index 7853551..0000000
--- a/android/src/main/libenc/jni/libx264/common/set.c
+++ /dev/null
@@ -1,379 +0,0 @@
-/*****************************************************************************
- * set.c: quantization init
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-#define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
-#define DIV(n,d) (((n) + ((d)>>1)) / (d))
-
-static const uint8_t dequant4_scale[6][3] =
-{
-    { 10, 13, 16 },
-    { 11, 14, 18 },
-    { 13, 16, 20 },
-    { 14, 18, 23 },
-    { 16, 20, 25 },
-    { 18, 23, 29 }
-};
-static const uint16_t quant4_scale[6][3] =
-{
-    { 13107, 8066, 5243 },
-    { 11916, 7490, 4660 },
-    { 10082, 6554, 4194 },
-    {  9362, 5825, 3647 },
-    {  8192, 5243, 3355 },
-    {  7282, 4559, 2893 },
-};
-
-static const uint8_t quant8_scan[16] =
-{
-    0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
-};
-static const uint8_t dequant8_scale[6][6] =
-{
-    { 20, 18, 32, 19, 25, 24 },
-    { 22, 19, 35, 21, 28, 26 },
-    { 26, 23, 42, 24, 33, 31 },
-    { 28, 25, 45, 26, 35, 33 },
-    { 32, 28, 51, 30, 40, 38 },
-    { 36, 32, 58, 34, 46, 43 },
-};
-static const uint16_t quant8_scale[6][6] =
-{
-    { 13107, 11428, 20972, 12222, 16777, 15481 },
-    { 11916, 10826, 19174, 11058, 14980, 14290 },
-    { 10082,  8943, 15978,  9675, 12710, 11985 },
-    {  9362,  8228, 14913,  8931, 11984, 11259 },
-    {  8192,  7346, 13159,  7740, 10486,  9777 },
-    {  7282,  6428, 11570,  6830,  9118,  8640 }
-};
-
-int x264_cqm_init( x264_t *h )
-{
-    int def_quant4[6][16];
-    int def_quant8[6][64];
-    int def_dequant4[6][16];
-    int def_dequant8[6][64];
-    int quant4_mf[4][6][16];
-    int quant8_mf[4][6][64];
-    int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
-                        32 - h->param.analyse.i_luma_deadzone[0],
-                        32 - 11, 32 - 21 };
-    int max_qp_err = -1;
-    int max_chroma_qp_err = -1;
-    int min_qp_err = QP_MAX+1;
-    int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4
-                      : h->param.analyse.b_transform_8x8 ? 2 : 0; /* Checkasm may segfault if optimized out by --chroma-format */
-
-#define CQM_ALLOC( w, count )\
-    for( int i = 0; i < count; i++ )\
-    {\
-        int size = w*w;\
-        int start = w == 8 ? 4 : 0;\
-        int j;\
-        for( j = 0; j < i; j++ )\
-            if( !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\
-                break;\
-        if( j < i )\
-        {\
-            h->  quant##w##_mf[i] = h->  quant##w##_mf[j];\
-            h->dequant##w##_mf[i] = h->dequant##w##_mf[j];\
-            h->unquant##w##_mf[i] = h->unquant##w##_mf[j];\
-        }\
-        else\
-        {\
-            CHECKED_MALLOC( h->  quant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\
-            CHECKED_MALLOC( h->dequant##w##_mf[i],  6*size*sizeof(int) );\
-            CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(int) );\
-        }\
-        for( j = 0; j < i; j++ )\
-            if( deadzone[j] == deadzone[i] &&\
-                !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\
-                break;\
-        if( j < i )\
-        {\
-            h->quant##w##_bias[i] = h->quant##w##_bias[j];\
-            h->quant##w##_bias0[i] = h->quant##w##_bias0[j];\
-        }\
-        else\
-        {\
-            CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\
-            CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\
-        }\
-    }
-
-    CQM_ALLOC( 4, 4 )
-    CQM_ALLOC( 8, num_8x8_lists )
-
-    for( int q = 0; q < 6; q++ )
-    {
-        for( int i = 0; i < 16; i++ )
-        {
-            int j = (i&1) + ((i>>2)&1);
-            def_dequant4[q][i] = dequant4_scale[q][j];
-            def_quant4[q][i]   =   quant4_scale[q][j];
-        }
-        for( int i = 0; i < 64; i++ )
-        {
-            int j = quant8_scan[((i>>1)&12) | (i&3)];
-            def_dequant8[q][i] = dequant8_scale[q][j];
-            def_quant8[q][i]   =   quant8_scale[q][j];
-        }
-    }
-
-    for( int q = 0; q < 6; q++ )
-    {
-        for( int i_list = 0; i_list < 4; i_list++ )
-            for( int i = 0; i < 16; i++ )
-            {
-                h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
-                     quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
-            }
-        for( int i_list = 0; i_list < num_8x8_lists; i_list++ )
-            for( int i = 0; i < 64; i++ )
-            {
-                h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
-                     quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
-            }
-    }
-    for( int q = 0; q <= QP_MAX_SPEC; q++ )
-    {
-        int j;
-        for( int i_list = 0; i_list < 4; i_list++ )
-            for( int i = 0; i < 16; i++ )
-            {
-                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
-                h->quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
-                if( !j )
-                {
-                    min_qp_err = X264_MIN( min_qp_err, q );
-                    continue;
-                }
-                // round to nearest, unless that would cause the deadzone to be negative
-                h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
-                h->quant4_bias0[i_list][q][i] = (1<<15)/j;
-                if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
-                    max_qp_err = q;
-                if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) )
-                    max_chroma_qp_err = q;
-            }
-        if( h->param.analyse.b_transform_8x8 )
-            for( int i_list = 0; i_list < num_8x8_lists; i_list++ )
-                for( int i = 0; i < 64; i++ )
-                {
-                    h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
-                    j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
-                    h->quant8_mf[i_list][q][i] = (uint16_t)j;
-
-                    if( !j )
-                    {
-                        min_qp_err = X264_MIN( min_qp_err, q );
-                        continue;
-                    }
-                    h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
-                    h->quant8_bias0[i_list][q][i] = (1<<15)/j;
-                    if( j > 0xffff && q > max_qp_err && (i_list == CQM_8IY || i_list == CQM_8PY) )
-                        max_qp_err = q;
-                    if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_8IC || i_list == CQM_8PC) )
-                        max_chroma_qp_err = q;
-                }
-    }
-
-    /* Emergency mode denoising. */
-    x264_emms();
-    CHECKED_MALLOC( h->nr_offset_emergency, sizeof(*h->nr_offset_emergency)*(QP_MAX-QP_MAX_SPEC) );
-    for( int q = 0; q < QP_MAX - QP_MAX_SPEC; q++ )
-        for( int cat = 0; cat < 3 + CHROMA444; cat++ )
-        {
-            int dct8x8 = cat&1;
-            if( !h->param.analyse.b_transform_8x8 && dct8x8 )
-                continue;
-
-            int size = dct8x8 ? 64 : 16;
-            udctcoef *nr_offset = h->nr_offset_emergency[q][cat];
-            /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
-            int dc_threshold =    (QP_MAX-QP_MAX_SPEC)*2/3;
-            int luma_threshold =  (QP_MAX-QP_MAX_SPEC)*2/3;
-            int chroma_threshold = 0;
-
-            for( int i = 0; i < size; i++ )
-            {
-                int max = (1 << (7 + BIT_DEPTH)) - 1;
-                /* True "emergency mode": remove all DCT coefficients */
-                if( q == QP_MAX - QP_MAX_SPEC - 1 )
-                {
-                    nr_offset[i] = max;
-                    continue;
-                }
-
-                int thresh = i == 0 ? dc_threshold : cat >= 2 ? chroma_threshold : luma_threshold;
-                if( q < thresh )
-                {
-                    nr_offset[i] = 0;
-                    continue;
-                }
-                double pos = (double)(q-thresh+1) / (QP_MAX - QP_MAX_SPEC - thresh);
-
-                /* XXX: this math is largely tuned for /dev/random input. */
-                double start = dct8x8 ? h->unquant8_mf[CQM_8PY][QP_MAX_SPEC][i]
-                                      : h->unquant4_mf[CQM_4PY][QP_MAX_SPEC][i];
-                /* Formula chosen as an exponential scale to vaguely mimic the effects
-                 * of a higher quantizer. */
-                double bias = (pow( 2, pos*(QP_MAX - QP_MAX_SPEC)/10. )*0.003-0.003) * start;
-                nr_offset[i] = X264_MIN( bias + 0.5, max );
-            }
-        }
-
-    if( !h->mb.b_lossless )
-    {
-        while( h->chroma_qp_table[SPEC_QP(h->param.rc.i_qp_min)] <= max_chroma_qp_err )
-            h->param.rc.i_qp_min++;
-        if( min_qp_err <= h->param.rc.i_qp_max )
-            h->param.rc.i_qp_max = min_qp_err-1;
-        if( max_qp_err >= h->param.rc.i_qp_min )
-            h->param.rc.i_qp_min = max_qp_err+1;
-        /* If long level-codes aren't allowed, we need to allow QP high enough to avoid them. */
-        if( !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH )
-            while( h->chroma_qp_table[SPEC_QP(h->param.rc.i_qp_max)] <= 12 || h->param.rc.i_qp_max <= 12 )
-                h->param.rc.i_qp_max++;
-        if( h->param.rc.i_qp_min > h->param.rc.i_qp_max )
-        {
-            x264_log( h, X264_LOG_ERROR, "Impossible QP constraints for CQM (min=%d, max=%d)\n", h->param.rc.i_qp_min, h->param.rc.i_qp_max );
-            return -1;
-        }
-    }
-    return 0;
-fail:
-    x264_cqm_delete( h );
-    return -1;
-}
-
-#define CQM_DELETE( n, max )\
-    for( int i = 0; i < (max); i++ )\
-    {\
-        int j;\
-        for( j = 0; j < i; j++ )\
-            if( h->quant##n##_mf[i] == h->quant##n##_mf[j] )\
-                break;\
-        if( j == i )\
-        {\
-            x264_free( h->  quant##n##_mf[i] );\
-            x264_free( h->dequant##n##_mf[i] );\
-            x264_free( h->unquant##n##_mf[i] );\
-        }\
-        for( j = 0; j < i; j++ )\
-            if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\
-                break;\
-        if( j == i )\
-        {\
-            x264_free( h->quant##n##_bias[i] );\
-            x264_free( h->quant##n##_bias0[i] );\
-        }\
-    }
-
-void x264_cqm_delete( x264_t *h )
-{
-    CQM_DELETE( 4, 4 );
-    CQM_DELETE( 8, CHROMA444 ? 4 : 2 );
-    x264_free( h->nr_offset_emergency );
-}
-
-static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
-                                  uint8_t *cqm, const uint8_t *jvt, int length )
-{
-    int i;
-
-    char *p = strstr( buf, name );
-    if( !p )
-    {
-        memset( cqm, 16, length );
-        return 0;
-    }
-
-    p += strlen( name );
-    if( *p == 'U' || *p == 'V' )
-        p++;
-
-    char *nextvar = strstr( p, "INT" );
-
-    for( i = 0; i < length && (p = strpbrk( p, " \t\n," )) && (p = strpbrk( p, "0123456789" )); i++ )
-    {
-        int coef = -1;
-        sscanf( p, "%d", &coef );
-        if( i == 0 && coef == 0 )
-        {
-            memcpy( cqm, jvt, length );
-            return 0;
-        }
-        if( coef < 1 || coef > 255 )
-        {
-            x264_log( h, X264_LOG_ERROR, "bad coefficient in list '%s'\n", name );
-            return -1;
-        }
-        cqm[i] = coef;
-    }
-
-    if( (nextvar && p > nextvar) || i != length )
-    {
-        x264_log( h, X264_LOG_ERROR, "not enough coefficients in list '%s'\n", name );
-        return -1;
-    }
-
-    return 0;
-}
-
-int x264_cqm_parse_file( x264_t *h, const char *filename )
-{
-    char *p;
-    int b_error = 0;
-
-    h->param.i_cqm_preset = X264_CQM_CUSTOM;
-
-    char *buf = x264_slurp_file( filename );
-    if( !buf )
-    {
-        x264_log( h, X264_LOG_ERROR, "can't open file '%s'\n", filename );
-        return -1;
-    }
-
-    while( (p = strchr( buf, '#' )) != NULL )
-        memset( p, ' ', strcspn( p, "\n" ) );
-
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_LUMA",   h->param.cqm_4iy, x264_cqm_jvt4i, 16 );
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTER4X4_LUMA",   h->param.cqm_4py, x264_cqm_jvt4p, 16 );
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_CHROMA", h->param.cqm_4ic, x264_cqm_jvt4i, 16 );
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTER4X4_CHROMA", h->param.cqm_4pc, x264_cqm_jvt4p, 16 );
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_LUMA",   h->param.cqm_8iy, x264_cqm_jvt8i, 64 );
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_LUMA",   h->param.cqm_8py, x264_cqm_jvt8p, 64 );
-    if( CHROMA444 )
-    {
-        b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8ic, x264_cqm_jvt8i, 64 );
-        b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8pc, x264_cqm_jvt8p, 64 );
-    }
-
-    x264_free( buf );
-    return b_error;
-}
-
diff --git a/android/src/main/libenc/jni/libx264/common/set.h b/android/src/main/libenc/jni/libx264/common/set.h
deleted file mode 100755
index 69ec67c..0000000
--- a/android/src/main/libenc/jni/libx264/common/set.h
+++ /dev/null
@@ -1,347 +0,0 @@
-/*****************************************************************************
- * set.h: quantization init
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_SET_H
-#define X264_SET_H
-
-enum profile_e
-{
-    PROFILE_BASELINE = 66,
-    PROFILE_MAIN     = 77,
-    PROFILE_HIGH    = 100,
-    PROFILE_HIGH10  = 110,
-    PROFILE_HIGH422 = 122,
-    PROFILE_HIGH444_PREDICTIVE = 244,
-};
-
-enum chroma_format_e
-{
-    CHROMA_400 = 0,
-    CHROMA_420 = 1,
-    CHROMA_422 = 2,
-    CHROMA_444 = 3,
-};
-
-enum cqm4_e
-{
-    CQM_4IY = 0,
-    CQM_4PY = 1,
-    CQM_4IC = 2,
-    CQM_4PC = 3
-};
-enum cqm8_e
-{
-    CQM_8IY = 0,
-    CQM_8PY = 1,
-    CQM_8IC = 2,
-    CQM_8PC = 3,
-};
-
-typedef struct
-{
-    int i_id;
-
-    int i_profile_idc;
-    int i_level_idc;
-
-    int b_constraint_set0;
-    int b_constraint_set1;
-    int b_constraint_set2;
-    int b_constraint_set3;
-
-    int i_log2_max_frame_num;
-
-    int i_poc_type;
-    /* poc 0 */
-    int i_log2_max_poc_lsb;
-
-    int i_num_ref_frames;
-    int b_gaps_in_frame_num_value_allowed;
-    int i_mb_width;
-    int i_mb_height;
-    int b_frame_mbs_only;
-    int b_mb_adaptive_frame_field;
-    int b_direct8x8_inference;
-
-    int b_crop;
-    struct
-    {
-        int i_left;
-        int i_right;
-        int i_top;
-        int i_bottom;
-    } crop;
-
-    int b_vui;
-    struct
-    {
-        int b_aspect_ratio_info_present;
-        int i_sar_width;
-        int i_sar_height;
-
-        int b_overscan_info_present;
-        int b_overscan_info;
-
-        int b_signal_type_present;
-        int i_vidformat;
-        int b_fullrange;
-        int b_color_description_present;
-        int i_colorprim;
-        int i_transfer;
-        int i_colmatrix;
-
-        int b_chroma_loc_info_present;
-        int i_chroma_loc_top;
-        int i_chroma_loc_bottom;
-
-        int b_timing_info_present;
-        uint32_t i_num_units_in_tick;
-        uint32_t i_time_scale;
-        int b_fixed_frame_rate;
-
-        int b_nal_hrd_parameters_present;
-        int b_vcl_hrd_parameters_present;
-
-        struct
-        {
-            int i_cpb_cnt;
-            int i_bit_rate_scale;
-            int i_cpb_size_scale;
-            int i_bit_rate_value;
-            int i_cpb_size_value;
-            int i_bit_rate_unscaled;
-            int i_cpb_size_unscaled;
-            int b_cbr_hrd;
-
-            int i_initial_cpb_removal_delay_length;
-            int i_cpb_removal_delay_length;
-            int i_dpb_output_delay_length;
-            int i_time_offset_length;
-        } hrd;
-
-        int b_pic_struct_present;
-        int b_bitstream_restriction;
-        int b_motion_vectors_over_pic_boundaries;
-        int i_max_bytes_per_pic_denom;
-        int i_max_bits_per_mb_denom;
-        int i_log2_max_mv_length_horizontal;
-        int i_log2_max_mv_length_vertical;
-        int i_num_reorder_frames;
-        int i_max_dec_frame_buffering;
-
-        /* FIXME to complete */
-    } vui;
-
-    int b_qpprime_y_zero_transform_bypass;
-    int i_chroma_format_idc;
-
-} x264_sps_t;
-
-typedef struct
-{
-    int i_id;
-    int i_sps_id;
-
-    int b_cabac;
-
-    int b_pic_order;
-    int i_num_slice_groups;
-
-    int i_num_ref_idx_l0_default_active;
-    int i_num_ref_idx_l1_default_active;
-
-    int b_weighted_pred;
-    int b_weighted_bipred;
-
-    int i_pic_init_qp;
-    int i_pic_init_qs;
-
-    int i_chroma_qp_index_offset;
-
-    int b_deblocking_filter_control;
-    int b_constrained_intra_pred;
-    int b_redundant_pic_cnt;
-
-    int b_transform_8x8_mode;
-
-    int i_cqm_preset;
-    const uint8_t *scaling_list[8]; /* could be 12, but we don't allow separate Cb/Cr lists */
-
-} x264_pps_t;
-
-/* default quant matrices */
-static const uint8_t x264_cqm_jvt4i[16] =
-{
-      6,13,20,28,
-     13,20,28,32,
-     20,28,32,37,
-     28,32,37,42
-};
-static const uint8_t x264_cqm_jvt4p[16] =
-{
-    10,14,20,24,
-    14,20,24,27,
-    20,24,27,30,
-    24,27,30,34
-};
-static const uint8_t x264_cqm_jvt8i[64] =
-{
-     6,10,13,16,18,23,25,27,
-    10,11,16,18,23,25,27,29,
-    13,16,18,23,25,27,29,31,
-    16,18,23,25,27,29,31,33,
-    18,23,25,27,29,31,33,36,
-    23,25,27,29,31,33,36,38,
-    25,27,29,31,33,36,38,40,
-    27,29,31,33,36,38,40,42
-};
-static const uint8_t x264_cqm_jvt8p[64] =
-{
-     9,13,15,17,19,21,22,24,
-    13,13,17,19,21,22,24,25,
-    15,17,19,21,22,24,25,27,
-    17,19,21,22,24,25,27,28,
-    19,21,22,24,25,27,28,30,
-    21,22,24,25,27,28,30,32,
-    22,24,25,27,28,30,32,33,
-    24,25,27,28,30,32,33,35
-};
-static const uint8_t x264_cqm_flat16[64] =
-{
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16,
-    16,16,16,16,16,16,16,16
-};
-static const uint8_t * const x264_cqm_jvt[8] =
-{
-    x264_cqm_jvt4i, x264_cqm_jvt4p,
-    x264_cqm_jvt4i, x264_cqm_jvt4p,
-    x264_cqm_jvt8i, x264_cqm_jvt8p,
-    x264_cqm_jvt8i, x264_cqm_jvt8p
-};
-
-// 1080i25_avci50, 1080p25_avci50
-static const uint8_t x264_cqm_avci50_4ic[16] =
-{
-    16,22,28,40,
-    22,28,40,44,
-    28,40,44,48,
-    40,44,48,60
-};
-
-//  1080i25_avci50,
-static const uint8_t x264_cqm_avci50_1080i_8iy[64] =
-{
-    16,18,19,21,27,33,81,87,
-    18,19,21,24,30,33,81,87,
-    19,21,24,27,30,78,84,90,
-    21,24,27,30,33,78,84,90,
-    24,27,30,33,78,81,84,90,
-    24,27,30,33,78,81,84,93,
-    27,30,33,78,78,81,87,93,
-    30,33,33,78,81,84,87,96
-};
-
-//  1080p25_avci50, 720p25_avci50, 720p50_avci50
-static const uint8_t x264_cqm_avci50_p_8iy[64] =
-{
-    16,18,19,21,24,27,30,33,
-    18,19,21,24,27,30,33,78,
-    19,21,24,27,30,33,78,81,
-    21,24,27,30,33,78,81,84,
-    24,27,30,33,78,81,84,87,
-    27,30,33,78,81,84,87,90,
-    30,33,78,81,84,87,90,93,
-    33,78,81,84,87,90,93,96
-};
-
-//  1080i25_avci100, 1080p25_avci100
-static const uint8_t x264_cqm_avci100_1080_4ic[16] =
-{
-    16,20,26,32,
-    20,26,32,38,
-    26,32,38,44,
-    32,38,44,50
-};
-
-// 720p25_avci100, 720p50_avci100
-static const uint8_t x264_cqm_avci100_720p_4ic[16] =
-{
-    16,21,27,34,
-    21,27,34,41,
-    27,34,41,46,
-    34,41,46,54
-};
-
-//  1080i25_avci100,
-static const uint8_t x264_cqm_avci100_1080i_8iy[64] =
-{
-    16,19,20,23,24,26,32,42,
-    18,19,22,24,26,32,36,42,
-    18,20,23,24,26,32,36,63,
-    19,20,23,26,32,36,42,63,
-    20,22,24,26,32,36,59,63,
-    22,23,24,26,32,36,59,68,
-    22,23,24,26,32,42,59,68,
-    22,23,24,26,36,42,59,72
-};
-
-// 1080p25_avci100,
-static const uint8_t x264_cqm_avci100_1080p_8iy[64] =
-{
-    16,18,19,20,22,23,24,26,
-    18,19,20,22,23,24,26,32,
-    19,20,22,23,24,26,32,36,
-    20,22,23,24,26,32,36,42,
-    22,23,24,26,32,36,42,59,
-    23,24,26,32,36,42,59,63,
-    24,26,32,36,42,59,63,68,
-    26,32,36,42,59,63,68,72
-};
-
-// 720p25_avci100, 720p50_avci100
-static const uint8_t x264_cqm_avci100_720p_8iy[64] =
-{
-    16,18,19,21,22,24,26,32,
-    18,19,19,21,22,24,26,32,
-    19,19,21,22,22,24,26,32,
-    21,21,22,22,23,24,26,34,
-    22,22,22,23,24,25,26,34,
-    24,24,24,24,25,26,34,36,
-    26,26,26,26,26,34,36,38,
-    32,32,32,34,34,36,38,42
-};
-
-int  x264_cqm_init( x264_t *h );
-void x264_cqm_delete( x264_t *h );
-int  x264_cqm_parse_file( x264_t *h, const char *filename );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/threadpool.c b/android/src/main/libenc/jni/libx264/common/threadpool.c
deleted file mode 100755
index bb4dd5e..0000000
--- a/android/src/main/libenc/jni/libx264/common/threadpool.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*****************************************************************************
- * threadpool.c: thread pooling
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-typedef struct
-{
-    void *(*func)(void *);
-    void *arg;
-    void *ret;
-} x264_threadpool_job_t;
-
-struct x264_threadpool_t
-{
-    int            exit;
-    int            threads;
-    x264_pthread_t *thread_handle;
-    void           (*init_func)(void *);
-    void           *init_arg;
-
-    /* requires a synchronized list structure and associated methods,
-       so use what is already implemented for frames */
-    x264_sync_frame_list_t uninit; /* list of jobs that are awaiting use */
-    x264_sync_frame_list_t run;    /* list of jobs that are queued for processing by the pool */
-    x264_sync_frame_list_t done;   /* list of jobs that have finished processing */
-};
-
-static void *x264_threadpool_thread( x264_threadpool_t *pool )
-{
-    if( pool->init_func )
-        pool->init_func( pool->init_arg );
-
-    while( !pool->exit )
-    {
-        x264_threadpool_job_t *job = NULL;
-        x264_pthread_mutex_lock( &pool->run.mutex );
-        while( !pool->exit && !pool->run.i_size )
-            x264_pthread_cond_wait( &pool->run.cv_fill, &pool->run.mutex );
-        if( pool->run.i_size )
-        {
-            job = (void*)x264_frame_shift( pool->run.list );
-            pool->run.i_size--;
-        }
-        x264_pthread_mutex_unlock( &pool->run.mutex );
-        if( !job )
-            continue;
-        job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */
-        x264_sync_frame_list_push( &pool->done, (void*)job );
-    }
-    return NULL;
-}
-
-int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
-                          void (*init_func)(void *), void *init_arg )
-{
-    if( threads <= 0 )
-        return -1;
-
-    x264_threadpool_t *pool;
-    CHECKED_MALLOCZERO( pool, sizeof(x264_threadpool_t) );
-    *p_pool = pool;
-
-    pool->init_func = init_func;
-    pool->init_arg  = init_arg;
-    pool->threads   = threads;
-
-    CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );
-
-    if( x264_sync_frame_list_init( &pool->uninit, pool->threads ) ||
-        x264_sync_frame_list_init( &pool->run, pool->threads ) ||
-        x264_sync_frame_list_init( &pool->done, pool->threads ) )
-        goto fail;
-
-    for( int i = 0; i < pool->threads; i++ )
-    {
-       x264_threadpool_job_t *job;
-       CHECKED_MALLOC( job, sizeof(x264_threadpool_job_t) );
-       x264_sync_frame_list_push( &pool->uninit, (void*)job );
-    }
-    for( int i = 0; i < pool->threads; i++ )
-        if( x264_pthread_create( pool->thread_handle+i, NULL, (void*)x264_threadpool_thread, pool ) )
-            goto fail;
-
-    return 0;
-fail:
-    return -1;
-}
-
-void x264_threadpool_run( x264_threadpool_t *pool, void *(*func)(void *), void *arg )
-{
-    x264_threadpool_job_t *job = (void*)x264_sync_frame_list_pop( &pool->uninit );
-    job->func = func;
-    job->arg  = arg;
-    x264_sync_frame_list_push( &pool->run, (void*)job );
-}
-
-void *x264_threadpool_wait( x264_threadpool_t *pool, void *arg )
-{
-    x264_pthread_mutex_lock( &pool->done.mutex );
-    while( 1 )
-    {
-        for( int i = 0; i < pool->done.i_size; i++ )
-            if( ((x264_threadpool_job_t*)pool->done.list[i])->arg == arg )
-            {
-                x264_threadpool_job_t *job = (void*)x264_frame_shift( pool->done.list+i );
-                pool->done.i_size--;
-                x264_pthread_mutex_unlock( &pool->done.mutex );
-
-                void *ret = job->ret;
-                x264_sync_frame_list_push( &pool->uninit, (void*)job );
-                return ret;
-            }
-
-        x264_pthread_cond_wait( &pool->done.cv_fill, &pool->done.mutex );
-    }
-}
-
-static void x264_threadpool_list_delete( x264_sync_frame_list_t *slist )
-{
-    for( int i = 0; slist->list[i]; i++ )
-    {
-        x264_free( slist->list[i] );
-        slist->list[i] = NULL;
-    }
-    x264_sync_frame_list_delete( slist );
-}
-
-void x264_threadpool_delete( x264_threadpool_t *pool )
-{
-    x264_pthread_mutex_lock( &pool->run.mutex );
-    pool->exit = 1;
-    x264_pthread_cond_broadcast( &pool->run.cv_fill );
-    x264_pthread_mutex_unlock( &pool->run.mutex );
-    for( int i = 0; i < pool->threads; i++ )
-        x264_pthread_join( pool->thread_handle[i], NULL );
-
-    x264_threadpool_list_delete( &pool->uninit );
-    x264_threadpool_list_delete( &pool->run );
-    x264_threadpool_list_delete( &pool->done );
-    x264_free( pool->thread_handle );
-    x264_free( pool );
-}
diff --git a/android/src/main/libenc/jni/libx264/common/threadpool.h b/android/src/main/libenc/jni/libx264/common/threadpool.h
deleted file mode 100755
index 26c2207..0000000
--- a/android/src/main/libenc/jni/libx264/common/threadpool.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*****************************************************************************
- * threadpool.h: thread pooling
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_THREADPOOL_H
-#define X264_THREADPOOL_H
-
-typedef struct x264_threadpool_t x264_threadpool_t;
-
-#if HAVE_THREAD
-int   x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
-                            void (*init_func)(void *), void *init_arg );
-void  x264_threadpool_run( x264_threadpool_t *pool, void *(*func)(void *), void *arg );
-void *x264_threadpool_wait( x264_threadpool_t *pool, void *arg );
-void  x264_threadpool_delete( x264_threadpool_t *pool );
-#else
-#define x264_threadpool_init(p,t,f,a) -1
-#define x264_threadpool_run(p,f,a)
-#define x264_threadpool_wait(p,a)     NULL
-#define x264_threadpool_delete(p)
-#endif
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/vlc.c b/android/src/main/libenc/jni/libx264/common/vlc.c
deleted file mode 100755
index cc9ed9d..0000000
--- a/android/src/main/libenc/jni/libx264/common/vlc.c
+++ /dev/null
@@ -1,869 +0,0 @@
-/*****************************************************************************
- * vlc.c : vlc tables
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common.h"
-
-/* [nC] */
-const vlc_t x264_coeff0_token[6] =
-{
-    { 0x1, 1 }, /* str=1 */
-    { 0x3, 2 }, /* str=11 */
-    { 0xf, 4 }, /* str=1111 */
-    { 0x3, 6 }, /* str=000011 */
-    { 0x1, 2 }, /* str=01 */
-    { 0x1, 1 }, /* str=1 */
-};
-
-/* [nC][i_total_coeff-1][i_trailing] */
-const vlc_t x264_coeff_token[6][16][4] =
-{
-    { /* table 0 */
-        { /* i_total 1 */
-            { 0x5, 6 }, /* str=000101 */
-            { 0x1, 2 }, /* str=01 */
-        },
-        { /* i_total 2 */
-            { 0x7, 8 }, /* str=00000111 */
-            { 0x4, 6 }, /* str=000100 */
-            { 0x1, 3 }, /* str=001 */
-        },
-        { /* i_total 3 */
-            { 0x7, 9 }, /* str=000000111 */
-            { 0x6, 8 }, /* str=00000110 */
-            { 0x5, 7 }, /* str=0000101 */
-            { 0x3, 5 }, /* str=00011 */
-        },
-        { /* i_total 4 */
-            { 0x7, 10 }, /* str=0000000111 */
-            { 0x6, 9 },  /* str=000000110 */
-            { 0x5, 8 },  /* str=00000101 */
-            { 0x3, 6 },  /* str=000011 */
-        },
-        { /* i_total 5 */
-            { 0x7, 11 }, /* str=00000000111 */
-            { 0x6, 10 }, /* str=0000000110 */
-            { 0x5, 9 },  /* str=000000101 */
-            { 0x4, 7 },  /* str=0000100 */
-        },
-        { /* i_total 6 */
-            { 0xf, 13 }, /* str=0000000001111 */
-            { 0x6, 11 }, /* str=00000000110 */
-            { 0x5, 10 }, /* str=0000000101 */
-            { 0x4, 8 },  /* str=00000100 */
-        },
-        { /* i_total 7 */
-            { 0xb, 13 }, /* str=0000000001011 */
-            { 0xe, 13 }, /* str=0000000001110 */
-            { 0x5, 11 }, /* str=00000000101 */
-            { 0x4, 9 },  /* str=000000100 */
-        },
-        { /* i_total 8 */
-            { 0x8, 13 }, /* str=0000000001000 */
-            { 0xa, 13 }, /* str=0000000001010 */
-            { 0xd, 13 }, /* str=0000000001101 */
-            { 0x4, 10 }, /* str=0000000100 */
-        },
-        { /* i_total 9 */
-            { 0xf, 14 }, /* str=00000000001111 */
-            { 0xe, 14 }, /* str=00000000001110 */
-            { 0x9, 13 }, /* str=0000000001001 */
-            { 0x4, 11 }, /* str=00000000100 */
-        },
-        { /* i_total 10 */
-            { 0xb, 14 }, /* str=00000000001011 */
-            { 0xa, 14 }, /* str=00000000001010 */
-            { 0xd, 14 }, /* str=00000000001101 */
-            { 0xc, 13 }, /* str=0000000001100 */
-        },
-        { /* i_total 14 */
-            { 0xf, 15 }, /* str=000000000001111 */
-            { 0xe, 15 }, /* str=000000000001110 */
-            { 0x9, 14 }, /* str=00000000001001 */
-            { 0xc, 14 }, /* str=00000000001100 */
-        },
-        { /* i_total 12 */
-            { 0xb, 15 }, /* str=000000000001011 */
-            { 0xa, 15 }, /* str=000000000001010 */
-            { 0xd, 15 }, /* str=000000000001101 */
-            { 0x8, 14 }, /* str=00000000001000 */
-        },
-        { /* i_total 13 */
-            { 0xf, 16 }, /* str=0000000000001111 */
-            { 0x1, 15 }, /* str=000000000000001 */
-            { 0x9, 15 }, /* str=000000000001001 */
-            { 0xc, 15 }, /* str=000000000001100 */
-        },
-        { /* i_total 14 */
-            { 0xb, 16 }, /* str=0000000000001011 */
-            { 0xe, 16 }, /* str=0000000000001110 */
-            { 0xd, 16 }, /* str=0000000000001101 */
-            { 0x8, 15 }, /* str=000000000001000 */
-        },
-        { /* i_total 15 */
-            { 0x7, 16 }, /* str=0000000000000111 */
-            { 0xa, 16 }, /* str=0000000000001010 */
-            { 0x9, 16 }, /* str=0000000000001001 */
-            { 0xc, 16 }, /* str=0000000000001100 */
-        },
-        { /* i_total 16 */
-            { 0x4, 16 }, /* str=0000000000000100 */
-            { 0x6, 16 }, /* str=0000000000000110 */
-            { 0x5, 16 }, /* str=0000000000000101 */
-            { 0x8, 16 }, /* str=0000000000001000 */
-        },
-    },
-    { /* table 1 */
-        { /* i_total 1 */
-            { 0xb, 6 }, /* str=001011 */
-            { 0x2, 2 }, /* str=10 */
-        },
-        { /* i_total 2 */
-            { 0x7, 6 }, /* str=000111 */
-            { 0x7, 5 }, /* str=00111 */
-            { 0x3, 3 }, /* str=011 */
-        },
-        { /* i_total 3 */
-            { 0x7, 7 }, /* str=0000111 */
-            { 0xa, 6 }, /* str=001010 */
-            { 0x9, 6 }, /* str=001001 */
-            { 0x5, 4 }, /* str=0101 */
-        },
-        { /* i_total 4 */
-            { 0x7, 8 }, /* str=00000111 */
-            { 0x6, 6 }, /* str=000110 */
-            { 0x5, 6 }, /* str=000101 */
-            { 0x4, 4 }, /* str=0100 */
-        },
-        { /* i_total 5 */
-            { 0x4, 8 }, /* str=00000100 */
-            { 0x6, 7 }, /* str=0000110 */
-            { 0x5, 7 }, /* str=0000101 */
-            { 0x6, 5 }, /* str=00110 */
-        },
-        { /* i_total 6 */
-            { 0x7, 9 }, /* str=000000111 */
-            { 0x6, 8 }, /* str=00000110 */
-            { 0x5, 8 }, /* str=00000101 */
-            { 0x8, 6 }, /* str=001000 */
-        },
-        { /* i_total 7 */
-            { 0xf, 11 }, /* str=00000001111 */
-            { 0x6, 9 },  /* str=000000110 */
-            { 0x5, 9 },  /* str=000000101 */
-            { 0x4, 6 },  /* str=000100 */
-        },
-        { /* i_total 8 */
-            { 0xb, 11 }, /* str=00000001011 */
-            { 0xe, 11 }, /* str=00000001110 */
-            { 0xd, 11 }, /* str=00000001101 */
-            { 0x4, 7 },  /* str=0000100 */
-        },
-        { /* i_total 9 */
-            { 0xf, 12 }, /* str=000000001111 */
-            { 0xa, 11 }, /* str=00000001010 */
-            { 0x9, 11 }, /* str=00000001001 */
-            { 0x4, 9 },  /* str=000000100 */
-        },
-        { /* i_total 10 */
-            { 0xb, 12 }, /* str=000000001011 */
-            { 0xe, 12 }, /* str=000000001110 */
-            { 0xd, 12 }, /* str=000000001101 */
-            { 0xc, 11 }, /* str=00000001100 */
-        },
-        { /* i_total 11 */
-            { 0x8, 12 }, /* str=000000001000 */
-            { 0xa, 12 }, /* str=000000001010 */
-            { 0x9, 12 }, /* str=000000001001 */
-            { 0x8, 11 }, /* str=00000001000 */
-        },
-        { /* i_total 12 */
-            { 0xf, 13 }, /* str=0000000001111 */
-            { 0xe, 13 }, /* str=0000000001110 */
-            { 0xd, 13 }, /* str=0000000001101 */
-            { 0xc, 12 }, /* str=000000001100 */
-        },
-        { /* i_total 13 */
-            { 0xb, 13 }, /* str=0000000001011 */
-            { 0xa, 13 }, /* str=0000000001010 */
-            { 0x9, 13 }, /* str=0000000001001 */
-            { 0xc, 13 }, /* str=0000000001100 */
-        },
-        { /* i_total 14 */
-            { 0x7, 13 }, /* str=0000000000111 */
-            { 0xb, 14 }, /* str=00000000001011 */
-            { 0x6, 13 }, /* str=0000000000110 */
-            { 0x8, 13 }, /* str=0000000001000 */
-        },
-        { /* i_total 15 */
-            { 0x9, 14 }, /* str=00000000001001 */
-            { 0x8, 14 }, /* str=00000000001000 */
-            { 0xa, 14 }, /* str=00000000001010 */
-            { 0x1, 13 }, /* str=0000000000001 */
-        },
-        { /* i_total 16 */
-            { 0x7, 14 }, /* str=00000000000111 */
-            { 0x6, 14 }, /* str=00000000000110 */
-            { 0x5, 14 }, /* str=00000000000101 */
-            { 0x4, 14 }, /* str=00000000000100 */
-        },
-    },
-    { /* table 2 */
-        { /* i_total 1 */
-            { 0xf, 6 }, /* str=001111 */
-            { 0xe, 4 }, /* str=1110 */
-        },
-        { /* i_total 2 */
-            { 0xb, 6 }, /* str=001011 */
-            { 0xf, 5 }, /* str=01111 */
-            { 0xd, 4 }, /* str=1101 */
-        },
-        { /* i_total 3 */
-            { 0x8, 6 }, /* str=001000 */
-            { 0xc, 5 }, /* str=01100 */
-            { 0xe, 5 }, /* str=01110 */
-            { 0xc, 4 }, /* str=1100 */
-        },
-        { /* i_total 4 */
-            { 0xf, 7 }, /* str=0001111 */
-            { 0xa, 5 }, /* str=01010 */
-            { 0xb, 5 }, /* str=01011 */
-            { 0xb, 4 }, /* str=1011 */
-        },
-        { /* i_total 5 */
-            { 0xb, 7 }, /* str=0001011 */
-            { 0x8, 5 }, /* str=01000 */
-            { 0x9, 5 }, /* str=01001 */
-            { 0xa, 4 }, /* str=1010 */
-        },
-        { /* i_total 6 */
-            { 0x9, 7 }, /* str=0001001 */
-            { 0xe, 6 }, /* str=001110 */
-            { 0xd, 6 }, /* str=001101 */
-            { 0x9, 4 }, /* str=1001 */
-        },
-        { /* i_total 7 */
-            { 0x8, 7 }, /* str=0001000 */
-            { 0xa, 6 }, /* str=001010 */
-            { 0x9, 6 }, /* str=001001 */
-            { 0x8, 4 }, /* str=1000 */
-        },
-        { /* i_total 8 */
-            { 0xf, 8 }, /* str=00001111 */
-            { 0xe, 7 }, /* str=0001110 */
-            { 0xd, 7 }, /* str=0001101 */
-            { 0xd, 5 }, /* str=01101 */
-        },
-        { /* i_total 9 */
-            { 0xb, 8 }, /* str=00001011 */
-            { 0xe, 8 }, /* str=00001110 */
-            { 0xa, 7 }, /* str=0001010 */
-            { 0xc, 6 }, /* str=001100 */
-        },
-        { /* i_total 10 */
-            { 0xf, 9 }, /* str=000001111 */
-            { 0xa, 8 }, /* str=00001010 */
-            { 0xd, 8 }, /* str=00001101 */
-            { 0xc, 7 }, /* str=0001100 */
-        },
-        { /* i_total 11 */
-            { 0xb, 9 }, /* str=000001011 */
-            { 0xe, 9 }, /* str=000001110 */
-            { 0x9, 8 }, /* str=00001001 */
-            { 0xc, 8 }, /* str=00001100 */
-        },
-        { /* i_total 12 */
-            { 0x8, 9 }, /* str=000001000 */
-            { 0xa, 9 }, /* str=000001010 */
-            { 0xd, 9 }, /* str=000001101 */
-            { 0x8, 8 }, /* str=00001000 */
-        },
-        { /* i_total 13 */
-            { 0xd, 10 }, /* str=0000001101 */
-            { 0x7, 9 },  /* str=000000111 */
-            { 0x9, 9 },  /* str=000001001 */
-            { 0xc, 9 },  /* str=000001100 */
-        },
-        { /* i_total 14 */
-            { 0x9, 10 }, /* str=0000001001 */
-            { 0xc, 10 }, /* str=0000001100 */
-            { 0xb, 10 }, /* str=0000001011 */
-            { 0xa, 10 }, /* str=0000001010 */
-        },
-        { /* i_total 15 */
-            { 0x5, 10 }, /* str=0000000101 */
-            { 0x8, 10 }, /* str=0000001000 */
-            { 0x7, 10 }, /* str=0000000111 */
-            { 0x6, 10 }, /* str=0000000110 */
-        },
-        { /* i_total 16 */
-            { 0x1, 10 }, /* str=0000000001 */
-            { 0x4, 10 }, /* str=0000000100 */
-            { 0x3, 10 }, /* str=0000000011 */
-            { 0x2, 10 }, /* str=0000000010 */
-        },
-    },
-    { /* table 3 */
-        { /* i_total 1 */
-            { 0x0, 6 }, /* str=000000 */
-            { 0x1, 6 }, /* str=000001 */
-        },
-        { /* i_total 2 */
-            { 0x4, 6 }, /* str=000100 */
-            { 0x5, 6 }, /* str=000101 */
-            { 0x6, 6 }, /* str=000110 */
-        },
-        { /* i_total 3 */
-            { 0x8, 6 }, /* str=001000 */
-            { 0x9, 6 }, /* str=001001 */
-            { 0xa, 6 }, /* str=001010 */
-            { 0xb, 6 }, /* str=001011 */
-        },
-        { /* i_total 4 */
-            { 0xc, 6 }, /* str=001100 */
-            { 0xd, 6 }, /* str=001101 */
-            { 0xe, 6 }, /* str=001110 */
-            { 0xf, 6 }, /* str=001111 */
-        },
-        { /* i_total 5 */
-            { 0x10, 6 }, /* str=010000 */
-            { 0x11, 6 }, /* str=010001 */
-            { 0x12, 6 }, /* str=010010 */
-            { 0x13, 6 }, /* str=010011 */
-        },
-        { /* i_total 6 */
-            { 0x14, 6 }, /* str=010100 */
-            { 0x15, 6 }, /* str=010101 */
-            { 0x16, 6 }, /* str=010110 */
-            { 0x17, 6 }, /* str=010111 */
-        },
-        { /* i_total 7 */
-            { 0x18, 6 }, /* str=011000 */
-            { 0x19, 6 }, /* str=011001 */
-            { 0x1a, 6 }, /* str=011010 */
-            { 0x1b, 6 }, /* str=011011 */
-        },
-        { /* i_total 8 */
-            { 0x1c, 6 }, /* str=011100 */
-            { 0x1d, 6 }, /* str=011101 */
-            { 0x1e, 6 }, /* str=011110 */
-            { 0x1f, 6 }, /* str=011111 */
-        },
-        { /* i_total 9 */
-            { 0x20, 6 }, /* str=100000 */
-            { 0x21, 6 }, /* str=100001 */
-            { 0x22, 6 }, /* str=100010 */
-            { 0x23, 6 }, /* str=100011 */
-        },
-        { /* i_total 10 */
-            { 0x24, 6 }, /* str=100100 */
-            { 0x25, 6 }, /* str=100101 */
-            { 0x26, 6 }, /* str=100110 */
-            { 0x27, 6 }, /* str=100111 */
-        },
-        { /* i_total 11 */
-            { 0x28, 6 }, /* str=101000 */
-            { 0x29, 6 }, /* str=101001 */
-            { 0x2a, 6 }, /* str=101010 */
-            { 0x2b, 6 }, /* str=101011 */
-        },
-        { /* i_total 12 */
-            { 0x2c, 6 }, /* str=101100 */
-            { 0x2d, 6 }, /* str=101101 */
-            { 0x2e, 6 }, /* str=101110 */
-            { 0x2f, 6 }, /* str=101111 */
-        },
-        { /* i_total 13 */
-            { 0x30, 6 }, /* str=110000 */
-            { 0x31, 6 }, /* str=110001 */
-            { 0x32, 6 }, /* str=110010 */
-            { 0x33, 6 }, /* str=110011 */
-        },
-        { /* i_total 14 */
-            { 0x34, 6 }, /* str=110100 */
-            { 0x35, 6 }, /* str=110101 */
-            { 0x36, 6 }, /* str=110110 */
-            { 0x37, 6 }, /* str=110111 */
-        },
-        { /* i_total 15 */
-            { 0x38, 6 }, /* str=111000 */
-            { 0x39, 6 }, /* str=111001 */
-            { 0x3a, 6 }, /* str=111010 */
-            { 0x3b, 6 }, /* str=111011 */
-        },
-        { /* i_total 16 */
-            { 0x3c, 6 }, /* str=111100 */
-            { 0x3d, 6 }, /* str=111101 */
-            { 0x3e, 6 }, /* str=111110 */
-            { 0x3f, 6 }, /* str=111111 */
-        },
-    },
-    { /* table 4 */
-        { /* i_total 1 */
-            { 0x7, 6 }, /* str=000111 */
-            { 0x1, 1 }, /* str=1 */
-        },
-        { /* i_total 2 */
-            { 0x4, 6 }, /* str=000100 */
-            { 0x6, 6 }, /* str=000110 */
-            { 0x1, 3 }, /* str=001 */
-        },
-        { /* i_total 3 */
-            { 0x3, 6 }, /* str=000011 */
-            { 0x3, 7 }, /* str=0000011 */
-            { 0x2, 7 }, /* str=0000010 */
-            { 0x5, 6 }, /* str=000101 */
-        },
-        { /* i_total 4 */
-            { 0x2, 6 }, /* str=000010 */
-            { 0x3, 8 }, /* str=00000011 */
-            { 0x2, 8 }, /* str=00000010 */
-            { 0x0, 7 }, /* str=0000000 */
-        },
-    },
-    { /* table 5 */
-        { /* i_total 1 */
-            { 0xf, 7 }, /* str=0001111 */
-            { 0x1, 2 }, /* str=01 */
-        },
-        { /* i_total 2 */
-            { 0xe, 7 }, /* str=0001110 */
-            { 0xd, 7 }, /* str=0001101 */
-            { 0x1, 3 }, /* str=001 */
-        },
-        { /* i_total 3 */
-            { 0x7, 9 }, /* str=000000111 */
-            { 0xc, 7 }, /* str=0001100 */
-            { 0xb, 7 }, /* str=0001011 */
-            { 0x1, 5 }, /* str=00001 */
-        },
-        { /* i_total 4 */
-            { 0x6, 9 }, /* str=000000110 */
-            { 0x5, 9 }, /* str=000000101 */
-            { 0xa, 7 }, /* str=0001010 */
-            { 0x1, 6 }, /* str=000001 */
-        },
-        { /* i_total 5 */
-            { 0x7, 10 }, /* str=0000000111 */
-            { 0x6, 10 }, /* str=0000000110 */
-            { 0x4, 9 },  /* str=000000100 */
-            { 0x9, 7 },  /* str=0001001 */
-        },
-        { /* i_total 6 */
-            { 0x7, 11 }, /* str=00000000111 */
-            { 0x6, 11 }, /* str=00000000110 */
-            { 0x5, 10 }, /* str=0000000101 */
-            { 0x8, 7 },  /* str=0001000 */
-        },
-        { /* i_total 7 */
-            { 0x7, 12 }, /* str=000000000111 */
-            { 0x6, 12 }, /* str=000000000110 */
-            { 0x5, 11 }, /* str=00000000101 */
-            { 0x4, 10 }, /* str=0000000100 */
-        },
-        { /* i_total 8 */
-            { 0x7, 13 }, /* str=0000000000111 */
-            { 0x5, 12 }, /* str=000000000101 */
-            { 0x4, 12 }, /* str=000000000100 */
-            { 0x4, 11 }, /* str=00000000100 */
-        },
-    },
-};
-
-/* [i_total_coeff-1][i_total_zeros] */
-const vlc_t x264_total_zeros[15][16] =
-{
-    { /* i_total 1 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x3, 4 }, /* str=0011 */
-        { 0x2, 4 }, /* str=0010 */
-        { 0x3, 5 }, /* str=00011 */
-        { 0x2, 5 }, /* str=00010 */
-        { 0x3, 6 }, /* str=000011 */
-        { 0x2, 6 }, /* str=000010 */
-        { 0x3, 7 }, /* str=0000011 */
-        { 0x2, 7 }, /* str=0000010 */
-        { 0x3, 8 }, /* str=00000011 */
-        { 0x2, 8 }, /* str=00000010 */
-        { 0x3, 9 }, /* str=000000011 */
-        { 0x2, 9 }, /* str=000000010 */
-        { 0x1, 9 }, /* str=000000001 */
-    },
-    { /* i_total 2 */
-        { 0x7, 3 }, /* str=111 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x5, 4 }, /* str=0101 */
-        { 0x4, 4 }, /* str=0100 */
-        { 0x3, 4 }, /* str=0011 */
-        { 0x2, 4 }, /* str=0010 */
-        { 0x3, 5 }, /* str=00011 */
-        { 0x2, 5 }, /* str=00010 */
-        { 0x3, 6 }, /* str=000011 */
-        { 0x2, 6 }, /* str=000010 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x0, 6 }, /* str=000000 */
-    },
-    { /* i_total 3 */
-        { 0x5, 4 }, /* str=0101 */
-        { 0x7, 3 }, /* str=111 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 4 }, /* str=0100 */
-        { 0x3, 4 }, /* str=0011 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 4 }, /* str=0010 */
-        { 0x3, 5 }, /* str=00011 */
-        { 0x2, 5 }, /* str=00010 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x0, 6 }, /* str=000000 */
-    },
-    { /* i_total 4 */
-        { 0x3, 5 }, /* str=00011 */
-        { 0x7, 3 }, /* str=111 */
-        { 0x5, 4 }, /* str=0101 */
-        { 0x4, 4 }, /* str=0100 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 4 }, /* str=0011 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 4 }, /* str=0010 */
-        { 0x2, 5 }, /* str=00010 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x0, 5 }, /* str=00000 */
-    },
-    { /* i_total 5 */
-        { 0x5, 4 }, /* str=0101 */
-        { 0x4, 4 }, /* str=0100 */
-        { 0x3, 4 }, /* str=0011 */
-        { 0x7, 3 }, /* str=111 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 4 }, /* str=0010 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x0, 5 }, /* str=00000 */
-    },
-    { /* i_total 6 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x7, 3 }, /* str=111 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x0, 6 }, /* str=000000 */
-    },
-    { /* i_total 7 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x0, 6 }, /* str=000000 */
-    },
-    { /* i_total 8 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x0, 6 }, /* str=000000 */
-    },
-    { /* i_total 9 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x0, 6 }, /* str=000000 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 5 }, /* str=00001 */
-    },
-    { /* i_total 10 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x0, 5 }, /* str=00000 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 4 }, /* str=0001 */
-    },
-    { /* i_total 11 */
-        { 0x0, 4 }, /* str=0000 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x3, 3 }, /* str=011 */
-    },
-    { /* i_total 12 */
-        { 0x0, 4 }, /* str=0000 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x1, 3 }, /* str=001 */
-    },
-    { /* i_total 13 */
-        { 0x0, 3 }, /* str=000 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x1, 2 }, /* str=01 */
-    },
-    { /* i_total 14 */
-        { 0x0, 2 }, /* str=00 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 1 }, /* str=1 */
-    },
-    { /* i_total 15 */
-        { 0x0, 1 }, /* str=0 */
-        { 0x1, 1 }, /* str=1 */
-    },
-};
-
-/* [i_total_coeff-1][i_total_zeros] */
-const vlc_t x264_total_zeros_2x2_dc[3][4] =
-{
-    { /* i_total 1 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x0, 3 }  /* str=000 */
-    },
-    { /* i_total 2 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x0, 2 }, /* str=00 */
-    },
-    { /* i_total 3 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x0, 1 }, /* str=0 */
-    },
-};
-
-/* [i_total_coeff-1][i_total_zeros] */
-const vlc_t x264_total_zeros_2x4_dc[7][8] =
-{
-    { /* i_total 1 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 4 }, /* str=0010 */
-        { 0x3, 4 }, /* str=0011 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x0, 5 }, /* str=00000 */
-    },
-    { /* i_total 2 */
-        { 0x0, 3 }, /* str=000 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x7, 3 }, /* str=111 */
-    },
-    { /* i_total 3 */
-        { 0x0, 3 }, /* str=000 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x7, 3 }, /* str=111 */
-    },
-    { /* i_total 4 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x0, 2 }, /* str=00 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x7, 3 }, /* str=111 */
-    },
-    { /* i_total 5 */
-        { 0x0, 2 }, /* str=00 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x3, 2 }, /* str=11 */
-    },
-    { /* i_total 6 */
-        { 0x0, 2 }, /* str=00 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 1 }, /* str=1 */
-    },
-    { /* i_total 7 */
-        { 0x0, 1 }, /* str=0 */
-        { 0x1, 1 }, /* str=1 */
-    }
-};
-
-/* [MIN( i_zero_left-1, 6 )][run_before] */
-static const vlc_t run_before[7][16] =
-{
-    { /* i_zero_left 1 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x0, 1 }, /* str=0 */
-    },
-    { /* i_zero_left 2 */
-        { 0x1, 1 }, /* str=1 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x0, 2 }, /* str=00 */
-    },
-    { /* i_zero_left 3 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x0, 2 }, /* str=00 */
-    },
-    { /* i_zero_left 4 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x1, 2 }, /* str=01 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x0, 3 }, /* str=000 */
-    },
-    { /* i_zero_left 5 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x2, 2 }, /* str=10 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x0, 3 }, /* str=000 */
-    },
-    { /* i_zero_left 6 */
-        { 0x3, 2 }, /* str=11 */
-        { 0x0, 3 }, /* str=000 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-    },
-    { /* i_zero_left >6 */
-        { 0x7, 3 }, /* str=111 */
-        { 0x6, 3 }, /* str=110 */
-        { 0x5, 3 }, /* str=101 */
-        { 0x4, 3 }, /* str=100 */
-        { 0x3, 3 }, /* str=011 */
-        { 0x2, 3 }, /* str=010 */
-        { 0x1, 3 }, /* str=001 */
-        { 0x1, 4 }, /* str=0001 */
-        { 0x1, 5 }, /* str=00001 */
-        { 0x1, 6 }, /* str=000001 */
-        { 0x1, 7 }, /* str=0000001 */
-        { 0x1, 8 }, /* str=00000001 */
-        { 0x1, 9 }, /* str=000000001 */
-        { 0x1, 10 }, /* str=0000000001 */
-        { 0x1, 11 }, /* str=00000000001 */
-    },
-};
-
-vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
-uint32_t x264_run_before[1<<16];
-
-void x264_cavlc_init( x264_t *h )
-{
-    for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
-        for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
-        {
-            int mask = level >> 15;
-            int abs_level = (level^mask)-mask;
-            int i_level_code = abs_level*2-mask-2;
-            int i_next = i_suffix;
-            vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2];
-
-            if( ( i_level_code >> i_suffix ) < 14 )
-            {
-                vlc->i_size = (i_level_code >> i_suffix) + 1 + i_suffix;
-                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
-            }
-            else if( i_suffix == 0 && i_level_code < 30 )
-            {
-                vlc->i_size = 19;
-                vlc->i_bits = (1<<4) + (i_level_code - 14);
-            }
-            else if( i_suffix > 0 && ( i_level_code >> i_suffix ) == 14 )
-            {
-                vlc->i_size = 15 + i_suffix;
-                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
-            }
-            else
-            {
-                i_level_code -= 15 << i_suffix;
-                if( i_suffix == 0 )
-                    i_level_code -= 15;
-                vlc->i_size = 28;
-                vlc->i_bits = (1<<12) + i_level_code;
-            }
-            if( i_next == 0 )
-                i_next++;
-            if( abs_level > (3 << (i_next-1)) && i_next < 6 )
-                i_next++;
-            vlc->i_next = i_next;
-        }
-
-    for( int i = 1; i < (1<<16); i++ )
-    {
-        x264_run_level_t runlevel;
-        ALIGNED_ARRAY_16( dctcoef, dct, [16] );
-        int size = 0;
-        int bits = 0;
-        for( int j = 0; j < 16; j++ )
-            dct[j] = i&(1<<j);
-        int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
-        int zeros = runlevel.last + 1 - total;
-        uint32_t mask = i << (x264_clz( i ) + 1);
-        for( int j = 0; j < total-1 && zeros > 0; j++ )
-        {
-            int idx = X264_MIN(zeros, 7) - 1;
-            int run = x264_clz( mask );
-            int len = run_before[idx][run].i_size;
-            size += len;
-            bits <<= len;
-            bits |= run_before[idx][run].i_bits;
-            zeros -= run;
-            mask <<= run + 1;
-        }
-        x264_run_before[i] = (bits << 5) + size;
-    }
-}
diff --git a/android/src/main/libenc/jni/libx264/common/win32thread.c b/android/src/main/libenc/jni/libx264/common/win32thread.c
deleted file mode 100755
index 8c43938..0000000
--- a/android/src/main/libenc/jni/libx264/common/win32thread.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*****************************************************************************
- * win32thread.c: windows threading
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *          Pegasys Inc. <http://www.pegasys-inc.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-/* Microsoft's way of supporting systems with >64 logical cpus can be found at
- * http://www.microsoft.com/whdc/system/Sysinternals/MoreThan64proc.mspx */
-
-/* Based on the agreed standing that x264 does not need to utilize >64 logical cpus,
- * this API does not detect nor utilize more than 64 cpus for systems that have them. */
-
-#include "common.h"
-
-#if HAVE_WINRT
-/* _beginthreadex() is technically the correct option, but it's only available for Desktop applications.
- * Using CreateThread() as an alternative works on Windows Store and Windows Phone 8.1+ as long as we're
- * using a dynamically linked MSVCRT which happens to be a requirement for WinRT applications anyway */
-#define _beginthreadex CreateThread
-#define InitializeCriticalSectionAndSpinCount(a, b) InitializeCriticalSectionEx(a, b, CRITICAL_SECTION_NO_DEBUG_INFO)
-#define WaitForSingleObject(a, b) WaitForSingleObjectEx(a, b, FALSE)
-#else
-#include <process.h>
-#endif
-
-/* number of times to spin a thread about to block on a locked mutex before retrying and sleeping if still locked */
-#define X264_SPIN_COUNT 0
-
-/* global mutex for replacing MUTEX_INITIALIZER instances */
-static x264_pthread_mutex_t static_mutex;
-
-/* _beginthreadex requires that the start routine is __stdcall */
-static unsigned __stdcall x264_win32thread_worker( void *arg )
-{
-    x264_pthread_t *h = arg;
-    *h->p_ret = h->func( h->arg );
-    return 0;
-}
-
-int x264_pthread_create( x264_pthread_t *thread, const x264_pthread_attr_t *attr,
-                         void *(*start_routine)( void* ), void *arg )
-{
-    thread->func   = start_routine;
-    thread->arg    = arg;
-    thread->p_ret  = &thread->ret;
-    thread->ret    = NULL;
-    thread->handle = (void*)_beginthreadex( NULL, 0, x264_win32thread_worker, thread, 0, NULL );
-    return !thread->handle;
-}
-
-int x264_pthread_join( x264_pthread_t thread, void **value_ptr )
-{
-    DWORD ret = WaitForSingleObject( thread.handle, INFINITE );
-    if( ret != WAIT_OBJECT_0 )
-        return -1;
-    if( value_ptr )
-        *value_ptr = *thread.p_ret;
-    CloseHandle( thread.handle );
-    return 0;
-}
-
-int x264_pthread_mutex_init( x264_pthread_mutex_t *mutex, const x264_pthread_mutexattr_t *attr )
-{
-    return !InitializeCriticalSectionAndSpinCount( mutex, X264_SPIN_COUNT );
-}
-
-int x264_pthread_mutex_destroy( x264_pthread_mutex_t *mutex )
-{
-    DeleteCriticalSection( mutex );
-    return 0;
-}
-
-int x264_pthread_mutex_lock( x264_pthread_mutex_t *mutex )
-{
-    static const x264_pthread_mutex_t init = X264_PTHREAD_MUTEX_INITIALIZER;
-    if( !memcmp( mutex, &init, sizeof(x264_pthread_mutex_t) ) )
-        *mutex = static_mutex;
-    EnterCriticalSection( mutex );
-    return 0;
-}
-
-int x264_pthread_mutex_unlock( x264_pthread_mutex_t *mutex )
-{
-    LeaveCriticalSection( mutex );
-    return 0;
-}
-
-void x264_win32_threading_destroy( void )
-{
-    x264_pthread_mutex_destroy( &static_mutex );
-    memset( &static_mutex, 0, sizeof(static_mutex) );
-}
-
-#if HAVE_WINRT
-int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr )
-{
-    InitializeConditionVariable( cond );
-    return 0;
-}
-
-int x264_pthread_cond_destroy( x264_pthread_cond_t *cond )
-{
-    return 0;
-}
-
-int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond )
-{
-    WakeAllConditionVariable( cond );
-    return 0;
-}
-
-int x264_pthread_cond_signal( x264_pthread_cond_t *cond )
-{
-    WakeConditionVariable( cond );
-    return 0;
-}
-
-int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex )
-{
-    return !SleepConditionVariableCS( cond, mutex, INFINITE );
-}
-
-int x264_win32_threading_init( void )
-{
-    return x264_pthread_mutex_init( &static_mutex, NULL );
-}
-
-int x264_pthread_num_processors_np( void )
-{
-    SYSTEM_INFO si;
-    GetNativeSystemInfo(&si);
-    return si.dwNumberOfProcessors;
-}
-
-#else
-
-static struct
-{
-    /* function pointers to conditional variable API on windows 6.0+ kernels */
-    void (WINAPI *cond_broadcast)( x264_pthread_cond_t *cond );
-    void (WINAPI *cond_init)( x264_pthread_cond_t *cond );
-    void (WINAPI *cond_signal)( x264_pthread_cond_t *cond );
-    BOOL (WINAPI *cond_wait)( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex, DWORD milliseconds );
-} thread_control;
-
-/* for pre-Windows 6.0 platforms we need to define and use our own condition variable and api */
-typedef struct
-{
-    x264_pthread_mutex_t mtx_broadcast;
-    x264_pthread_mutex_t mtx_waiter_count;
-    volatile int waiter_count;
-    HANDLE semaphore;
-    HANDLE waiters_done;
-    volatile int is_broadcast;
-} x264_win32_cond_t;
-
-int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr )
-{
-    if( thread_control.cond_init )
-    {
-        thread_control.cond_init( cond );
-        return 0;
-    }
-
-    /* non native condition variables */
-    x264_win32_cond_t *win32_cond = calloc( 1, sizeof(x264_win32_cond_t) );
-    if( !win32_cond )
-        return -1;
-    cond->Ptr = win32_cond;
-    win32_cond->semaphore = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL );
-    if( !win32_cond->semaphore )
-        return -1;
-
-    if( x264_pthread_mutex_init( &win32_cond->mtx_waiter_count, NULL ) )
-        return -1;
-    if( x264_pthread_mutex_init( &win32_cond->mtx_broadcast, NULL ) )
-        return -1;
-
-    win32_cond->waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL );
-    if( !win32_cond->waiters_done )
-        return -1;
-
-    return 0;
-}
-
-int x264_pthread_cond_destroy( x264_pthread_cond_t *cond )
-{
-    /* native condition variables do not destroy */
-    if( thread_control.cond_init )
-        return 0;
-
-    /* non native condition variables */
-    x264_win32_cond_t *win32_cond = cond->Ptr;
-    CloseHandle( win32_cond->semaphore );
-    CloseHandle( win32_cond->waiters_done );
-    x264_pthread_mutex_destroy( &win32_cond->mtx_broadcast );
-    x264_pthread_mutex_destroy( &win32_cond->mtx_waiter_count );
-    free( win32_cond );
-
-    return 0;
-}
-
-int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond )
-{
-    if( thread_control.cond_broadcast )
-    {
-        thread_control.cond_broadcast( cond );
-        return 0;
-    }
-
-    /* non native condition variables */
-    x264_win32_cond_t *win32_cond = cond->Ptr;
-    x264_pthread_mutex_lock( &win32_cond->mtx_broadcast );
-    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
-    int have_waiter = 0;
-
-    if( win32_cond->waiter_count )
-    {
-        win32_cond->is_broadcast = 1;
-        have_waiter = 1;
-    }
-
-    if( have_waiter )
-    {
-        ReleaseSemaphore( win32_cond->semaphore, win32_cond->waiter_count, NULL );
-        x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
-        WaitForSingleObject( win32_cond->waiters_done, INFINITE );
-        win32_cond->is_broadcast = 0;
-    }
-    else
-        x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
-    return x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast );
-}
-
-int x264_pthread_cond_signal( x264_pthread_cond_t *cond )
-{
-    if( thread_control.cond_signal )
-    {
-        thread_control.cond_signal( cond );
-        return 0;
-    }
-
-    /* non-native condition variables */
-    x264_win32_cond_t *win32_cond = cond->Ptr;
-
-    x264_pthread_mutex_lock( &win32_cond->mtx_broadcast );
-    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
-    int have_waiter = win32_cond->waiter_count;
-    x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
-
-    if( have_waiter )
-    {
-        ReleaseSemaphore( win32_cond->semaphore, 1, NULL );
-        WaitForSingleObject( win32_cond->waiters_done, INFINITE );
-    }
-
-    return x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast );
-}
-
-int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex )
-{
-    if( thread_control.cond_wait )
-        return !thread_control.cond_wait( cond, mutex, INFINITE );
-
-    /* non native condition variables */
-    x264_win32_cond_t *win32_cond = cond->Ptr;
-
-    x264_pthread_mutex_lock( &win32_cond->mtx_broadcast );
-    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
-    win32_cond->waiter_count++;
-    x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
-    x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast );
-
-    // unlock the external mutex
-    x264_pthread_mutex_unlock( mutex );
-    WaitForSingleObject( win32_cond->semaphore, INFINITE );
-
-    x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count );
-    win32_cond->waiter_count--;
-    int last_waiter = !win32_cond->waiter_count || !win32_cond->is_broadcast;
-    x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count );
-
-    if( last_waiter )
-        SetEvent( win32_cond->waiters_done );
-
-    // lock the external mutex
-    return x264_pthread_mutex_lock( mutex );
-}
-
-int x264_win32_threading_init( void )
-{
-    /* find function pointers to API functions, if they exist */
-    HANDLE kernel_dll = GetModuleHandleW( L"kernel32.dll" );
-    thread_control.cond_init = (void*)GetProcAddress( kernel_dll, "InitializeConditionVariable" );
-    if( thread_control.cond_init )
-    {
-        /* we're on a windows 6.0+ kernel, acquire the rest of the functions */
-        thread_control.cond_broadcast = (void*)GetProcAddress( kernel_dll, "WakeAllConditionVariable" );
-        thread_control.cond_signal = (void*)GetProcAddress( kernel_dll, "WakeConditionVariable" );
-        thread_control.cond_wait = (void*)GetProcAddress( kernel_dll, "SleepConditionVariableCS" );
-    }
-    return x264_pthread_mutex_init( &static_mutex, NULL );
-}
-
-int x264_pthread_num_processors_np( void )
-{
-    DWORD_PTR system_cpus, process_cpus = 0;
-    int cpus = 0;
-
-    /* GetProcessAffinityMask returns affinities of 0 when the process has threads in multiple processor groups.
-     * On platforms that support processor grouping, use GetThreadGroupAffinity to get the current thread's affinity instead. */
-#if ARCH_X86_64
-    /* find function pointers to API functions specific to x86_64 platforms, if they exist */
-    HANDLE kernel_dll = GetModuleHandleW( L"kernel32.dll" );
-    BOOL (*get_thread_affinity)( HANDLE thread, void *group_affinity ) = (void*)GetProcAddress( kernel_dll, "GetThreadGroupAffinity" );
-    if( get_thread_affinity )
-    {
-        /* running on a platform that supports >64 logical cpus */
-        struct /* GROUP_AFFINITY */
-        {
-            ULONG_PTR mask; // KAFFINITY = ULONG_PTR
-            USHORT group;
-            USHORT reserved[3];
-        } thread_affinity;
-        if( get_thread_affinity( GetCurrentThread(), &thread_affinity ) )
-            process_cpus = thread_affinity.mask;
-    }
-#endif
-    if( !process_cpus )
-        GetProcessAffinityMask( GetCurrentProcess(), &process_cpus, &system_cpus );
-    for( DWORD_PTR bit = 1; bit; bit <<= 1 )
-        cpus += !!(process_cpus & bit);
-
-    return cpus ? cpus : 1;
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/win32thread.h b/android/src/main/libenc/jni/libx264/common/win32thread.h
deleted file mode 100755
index c16ad00..0000000
--- a/android/src/main/libenc/jni/libx264/common/win32thread.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*****************************************************************************
- * win32thread.h: windows threading
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_WIN32THREAD_H
-#define X264_WIN32THREAD_H
-
-#include <windows.h>
-/* the following macro is used within x264 */
-#undef ERROR
-
-typedef struct
-{
-    void *handle;
-    void *(*func)( void* arg );
-    void *arg;
-    void **p_ret;
-    void *ret;
-} x264_pthread_t;
-#define x264_pthread_attr_t int
-
-/* the conditional variable api for windows 6.0+ uses critical sections and not mutexes */
-typedef CRITICAL_SECTION x264_pthread_mutex_t;
-#define X264_PTHREAD_MUTEX_INITIALIZER {0}
-#define x264_pthread_mutexattr_t int
-
-#if HAVE_WINRT
-typedef CONDITION_VARIABLE x264_pthread_cond_t;
-#else
-typedef struct
-{
-    void *Ptr;
-} x264_pthread_cond_t;
-#endif
-#define x264_pthread_condattr_t int
-
-int x264_pthread_create( x264_pthread_t *thread, const x264_pthread_attr_t *attr,
-                         void *(*start_routine)( void* ), void *arg );
-int x264_pthread_join( x264_pthread_t thread, void **value_ptr );
-
-int x264_pthread_mutex_init( x264_pthread_mutex_t *mutex, const x264_pthread_mutexattr_t *attr );
-int x264_pthread_mutex_destroy( x264_pthread_mutex_t *mutex );
-int x264_pthread_mutex_lock( x264_pthread_mutex_t *mutex );
-int x264_pthread_mutex_unlock( x264_pthread_mutex_t *mutex );
-
-int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr );
-int x264_pthread_cond_destroy( x264_pthread_cond_t *cond );
-int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond );
-int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex );
-int x264_pthread_cond_signal( x264_pthread_cond_t *cond );
-
-#define x264_pthread_attr_init(a) 0
-#define x264_pthread_attr_destroy(a) 0
-
-int  x264_win32_threading_init( void );
-void x264_win32_threading_destroy( void );
-
-int x264_pthread_num_processors_np( void );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/bitstream-a.asm b/android/src/main/libenc/jni/libx264/common/x86/bitstream-a.asm
deleted file mode 100755
index da9ce68..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/bitstream-a.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-;*****************************************************************************
-;* bitstream-a.asm: x86 bitstream functions
-;*****************************************************************************
-;* Copyright (C) 2010-2016 x264 project
-;*
-;* Authors: Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <henrik@gramner.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
-;-----------------------------------------------------------------------------
-%macro NAL_LOOP 2
-%%escape:
-    ; Detect false positive to avoid unneccessary escape loop
-    xor      r3d, r3d
-    cmp byte [r0+r1-1], 0
-    setnz    r3b
-    xor       k3, k4
-    jnz .escape
-    jmp %%continue
-ALIGN 16
-%1:
-    mova [r0+r1+mmsize], m1
-    pcmpeqb   m1, m0
-    mova [r0+r1], m2
-    pcmpeqb   m2, m0
-    pmovmskb r3d, m1
-    %2        m1, [r1+r2+3*mmsize]
-    pmovmskb r4d, m2
-    %2        m2, [r1+r2+2*mmsize]
-    shl       k3, mmsize
-    or        k3, k4
-    lea       k4, [2*r3+1]
-    and       k4, k3
-    jnz %%escape
-%%continue:
-    add       r1, 2*mmsize
-    jl %1
-%endmacro
-
-%macro NAL_ESCAPE 0
-%if mmsize == 32
-    %xdefine k3 r3
-    %xdefine k4 r4
-%else
-    %xdefine k3 r3d
-    %xdefine k4 r4d
-%endif
-
-cglobal nal_escape, 3,5
-    movzx    r3d, byte [r1]
-    sub       r1, r2 ; r1 = offset of current src pointer from end of src
-    pxor      m0, m0
-    mov     [r0], r3b
-    sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
-    or       r3d, 0xffffff00 ; ignore data before src
-
-    ; Start off by jumping into the escape loop in case there's an escape at the start.
-    ; And do a few more in scalar until dst is aligned.
-    jmp .escape_loop
-
-%if mmsize == 16
-    NAL_LOOP .loop_aligned, mova
-    jmp .ret
-%endif
-    NAL_LOOP .loop_unaligned, movu
-.ret:
-    movifnidn rax, r0
-    RET
-
-.escape:
-    ; Skip bytes that are known to be valid
-    and       k4, k3
-    tzcnt     k4, k4
-    xor      r3d, r3d ; the last two bytes are known to be zero
-    add       r1, r4
-.escape_loop:
-    inc       r1
-    jge .ret
-    movzx    r4d, byte [r1+r2]
-    shl      r3d, 8
-    or       r3d, r4d
-    test     r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3
-    jz .add_escape_byte
-.escaped:
-    lea      r4d, [r0+r1]
-    mov  [r0+r1], r3b
-    test     r4d, mmsize-1 ; Do SIMD when dst is aligned
-    jnz .escape_loop
-    movu      m1, [r1+r2+mmsize]
-    movu      m2, [r1+r2]
-%if mmsize == 16
-    lea      r4d, [r1+r2]
-    test     r4d, mmsize-1
-    jz .loop_aligned
-%endif
-    jmp .loop_unaligned
-
-.add_escape_byte:
-    mov byte [r0+r1], 3
-    inc       r0
-    or       r3d, 0x0300
-    jmp .escaped
-%endmacro
-
-INIT_MMX mmx2
-NAL_ESCAPE
-INIT_XMM sse2
-NAL_ESCAPE
-%if ARCH_X86_64
-INIT_YMM avx2
-NAL_ESCAPE
-%endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/cabac-a.asm b/android/src/main/libenc/jni/libx264/common/x86/cabac-a.asm
deleted file mode 100755
index baa820a..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/cabac-a.asm
+++ /dev/null
@@ -1,756 +0,0 @@
-;*****************************************************************************
-;* cabac-a.asm: x86 cabac
-;*****************************************************************************
-;* Copyright (C) 2008-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Holger Lubitz <holger@lubitz.org>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA
-
-coeff_abs_level1_ctx:       db 1, 2, 3, 4, 0, 0, 0, 0
-coeff_abs_levelgt1_ctx:     db 5, 5, 5, 5, 6, 7, 8, 9
-coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
-                            db 4, 4, 4, 4, 5, 6, 7, 7
-
-%if ARCH_X86_64
-%macro COEFF_LAST_TABLE 17
-    %define funccpu1 %1
-    %define funccpu2 %2
-    %define funccpu3 %3
-    %rep 14
-        %ifidn %4, 4
-            dq mangle(x264_coeff_last%4_ %+ funccpu1)
-        %elifidn %4, 64
-            dq mangle(x264_coeff_last%4_ %+ funccpu2)
-        %else
-            dq mangle(x264_coeff_last%4_ %+ funccpu3)
-        %endif
-        %rotate 1
-    %endrep
-%endmacro
-
-cextern coeff_last4_mmx2
-cextern coeff_last4_mmx2_lzcnt
-cextern coeff_last15_sse2
-cextern coeff_last15_sse2_lzcnt
-cextern coeff_last16_sse2
-cextern coeff_last16_sse2_lzcnt
-cextern coeff_last64_sse2
-cextern coeff_last64_sse2_lzcnt
-cextern coeff_last64_avx2_lzcnt
-
-%ifdef PIC
-SECTION .data
-%endif
-coeff_last_sse2:       COEFF_LAST_TABLE       mmx2,       sse2,       sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-%endif
-
-SECTION .text
-
-cextern cabac_range_lps
-cextern cabac_transition
-cextern cabac_renorm_shift
-cextern cabac_entropy
-cextern cabac_size_unary
-cextern cabac_transition_unary
-cextern significant_coeff_flag_offset
-cextern significant_coeff_flag_offset_8x8
-cextern last_coeff_flag_offset
-cextern last_coeff_flag_offset_8x8
-cextern coeff_abs_level_m1_offset
-cextern count_cat_m1
-cextern cabac_encode_ue_bypass
-
-%if ARCH_X86_64
-    %define pointer resq
-%else
-    %define pointer resd
-%endif
-
-struc cb
-    .low: resd 1
-    .range: resd 1
-    .queue: resd 1
-    .bytes_outstanding: resd 1
-    .start: pointer 1
-    .p: pointer 1
-    .end: pointer 1
-    align 16, resb 1
-    .bits_encoded: resd 1
-    .state: resb 1024
-endstruc
-
-%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
-%ifdef PIC
-    %ifidn %4, 0
-        movzx %1, byte [%2+%3+r7-$$]
-    %else
-        lea   %5, [r7+%4]
-        movzx %1, byte [%2+%3+%5-$$]
-    %endif
-%else
-    movzx %1, byte [%2+%3+%4]
-%endif
-%endmacro
-
-%macro CABAC 1
-; t3 must be ecx, since it's used for shift.
-%if WIN64
-    DECLARE_REG_TMP 3,1,2,0,5,6,4,4
-%elif ARCH_X86_64
-    DECLARE_REG_TMP 0,1,2,3,4,5,6,6
-%else
-    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
-%endif
-
-cglobal cabac_encode_decision_%1, 1,7
-    movifnidn t1d, r1m
-    mov   t5d, [r0+cb.range]
-    movzx t6d, byte [r0+cb.state+t1]
-    movifnidn t0,  r0 ; WIN64
-    mov   t4d, ~1
-    mov   t3d, t5d
-    and   t4d, t6d
-    shr   t5d, 6
-    movifnidn t2d, r2m
-%if WIN64
-    PUSH r7
-%endif
-%ifdef PIC
-    lea    r7, [$$]
-%endif
-    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
-    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
-    and   t6d, 1
-    sub   t3d, t5d
-    cmp   t6d, t2d
-    mov   t6d, [t0+cb.low]
-    lea    t2, [t6+t3]
-    cmovne t3d, t5d
-    cmovne t6d, t2d
-    mov   [t0+cb.state+t1], t4b
-;cabac_encode_renorm
-    mov   t4d, t3d
-%ifidn %1, bmi2
-    lzcnt t3d, t3d
-    sub   t3d, 23
-    shlx  t4d, t4d, t3d
-    shlx  t6d, t6d, t3d
-%else
-    shr   t3d, 3
-    LOAD_GLOBAL t3d, cabac_renorm_shift, t3
-    shl   t4d, t3b
-    shl   t6d, t3b
-%endif
-%if WIN64
-    POP r7
-%endif
-    mov   [t0+cb.range], t4d
-    add   t3d, [t0+cb.queue]
-    jge cabac_putbyte_%1
-.update_queue_low:
-    mov   [t0+cb.low], t6d
-    mov   [t0+cb.queue], t3d
-    RET
-
-cglobal cabac_encode_bypass_%1, 2,3
-    mov       t7d, [r0+cb.low]
-    and       r1d, [r0+cb.range]
-    lea       t7d, [t7*2+r1]
-    movifnidn  t0, r0 ; WIN64
-    mov       t3d, [r0+cb.queue]
-    inc       t3d
-%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
-    jge cabac_putbyte_%1
-%else
-    jge .putbyte
-%endif
-    mov   [t0+cb.low], t7d
-    mov   [t0+cb.queue], t3d
-    RET
-%if ARCH_X86_64 == 0
-.putbyte:
-    PROLOGUE 0,7
-    movifnidn t6d, t7d
-    jmp cabac_putbyte_%1
-%endif
-
-%ifnidn %1,bmi2
-cglobal cabac_encode_terminal_%1, 1,3
-    sub  dword [r0+cb.range], 2
-; shortcut: the renormalization shift in terminal
-; can only be 0 or 1 and is zero over 99% of the time.
-    test dword [r0+cb.range], 0x100
-    je .renorm
-    RET
-.renorm:
-    shl  dword [r0+cb.low], 1
-    shl  dword [r0+cb.range], 1
-    inc  dword [r0+cb.queue]
-    jge .putbyte
-    RET
-.putbyte:
-    PROLOGUE 0,7
-    movifnidn t0, r0 ; WIN64
-    mov t3d, [r0+cb.queue]
-    mov t6d, [t0+cb.low]
-%endif
-
-cabac_putbyte_%1:
-    ; alive: t0=cb t3=queue t6=low
-%if WIN64
-    DECLARE_REG_TMP 3,6,1,0,2,5,4
-%endif
-%ifidn %1, bmi2
-    add   t3d, 10
-    shrx  t2d, t6d, t3d
-    bzhi  t6d, t6d, t3d
-    sub   t3d, 18
-%else
-    mov   t1d, -1
-    add   t3d, 10
-    mov   t2d, t6d
-    shl   t1d, t3b
-    shr   t2d, t3b ; out
-    not   t1d
-    sub   t3d, 18
-    and   t6d, t1d
-%endif
-    mov   t5d, [t0+cb.bytes_outstanding]
-    cmp   t2b, 0xff ; FIXME is a 32bit op faster?
-    jz    .postpone
-    mov    t1, [t0+cb.p]
-    add   [t1-1], t2h
-    dec   t2h
-.loop_outstanding:
-    mov   [t1], t2h
-    inc   t1
-    dec   t5d
-    jge .loop_outstanding
-    mov   [t1-1], t2b
-    mov   [t0+cb.p], t1
-.postpone:
-    inc   t5d
-    mov   [t0+cb.bytes_outstanding], t5d
-    jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
-%endmacro
-
-CABAC asm
-CABAC bmi2
-
-; %1 = label name
-; %2 = node_ctx init?
-%macro COEFF_ABS_LEVEL_GT1 2
-%if %2
-    %define ctx 1
-%else
-    movzx  r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
-    %define ctx r11
-%endif
-    movzx   r9d, byte [r8+ctx]
-; if( coeff_abs > 1 )
-    cmp     r1d, 1
-    jg .%1_gt1
-; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
-    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
-    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
-    lea     r0d, [r0+r9+256]
-    mov [r8+ctx], r10b
-%if %2
-    mov     r2d, 1
-%else
-    movzx   r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
-%endif
-    jmp .%1_end
-
-.%1_gt1:
-; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
-    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
-    xor     r9d, 1
-    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
-    mov [r8+ctx], r10b
-    add     r0d, r9d
-%if %2
-    %define ctx 5
-%else
-    movzx  r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
-    %define ctx r11
-%endif
-; if( coeff_abs < 15 )
-    cmp     r1d, 15
-    jge .%1_escape
-    shl     r1d, 7
-; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
-    movzx   r9d, byte [r8+ctx]
-    add     r9d, r1d
-    movzx  r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
-; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
-    movzx   r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
-    mov [r8+ctx], r10b
-    add     r0d, r9d
-    jmp .%1_gt1_end
-
-.%1_escape:
-; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
-    movzx   r9d, byte [r8+ctx]
-    movzx  r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
-; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
-    movzx   r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
-    add     r0d, r9d
-    mov [r8+ctx], r10b
-    sub     r1d, 14
-%if cpuflag(lzcnt)
-    lzcnt   r9d, r1d
-    xor     r9d, 0x1f
-%else
-    bsr     r9d, r1d
-%endif
-; bs_size_ue_big(coeff_abs-15)<<8
-    shl     r9d, 9
-; (ilog2(coeff_abs-14)+1) << 8
-    lea     r0d, [r0+r9+256]
-.%1_gt1_end:
-%if %2
-    mov     r2d, 4
-%else
-    movzx   r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
-%endif
-.%1_end:
-%endmacro
-
-%macro LOAD_DCTCOEF 1
-%if HIGH_BIT_DEPTH
-    mov     %1, [dct+r6*4]
-%else
-    movzx   %1, word [dct+r6*2]
-%endif
-%endmacro
-
-%macro ABS_DCTCOEFS 2
-%assign i 0
-%rep %2/16
-%if HIGH_BIT_DEPTH
-    ABSD   m0, [%1+ 0+i*64], m4
-    ABSD   m1, [%1+16+i*64], m5
-    ABSD   m2, [%1+32+i*64], m4
-    ABSD   m3, [%1+48+i*64], m5
-    mova [rsp+ 0+i*64], m0
-    mova [rsp+16+i*64], m1
-    mova [rsp+32+i*64], m2
-    mova [rsp+48+i*64], m3
-%else
-    ABSW   m0, [%1+ 0+i*32], m2
-    ABSW   m1, [%1+16+i*32], m3
-    mova [rsp+ 0+i*32], m0
-    mova [rsp+16+i*32], m1
-%endif
-%assign i i+1
-%endrep
-%endmacro
-
-%macro SIG_OFFSET 1
-%if %1
-    movzx  r11d, byte [r4+r6]
-%endif
-%endmacro
-
-%macro LAST_OFFSET 1
-%if %1
-    movzx  r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
-;                                                   int ctx_block_cat, x264_cabac_t *cb );
-;-----------------------------------------------------------------------------
-
-;%1 = 8x8 mode
-%macro CABAC_RESIDUAL_RD 2
-%if %1
-    %define func cabac_block_residual_8x8_rd_internal
-    %define maxcoeffs 64
-    %define dct rsp
-%else
-    %define func cabac_block_residual_rd_internal
-    %define maxcoeffs 16
-    %define dct r4
-%endif
-
-%ifdef PIC
-    cglobal func, 4,13
-    lea     r12, [$$]
-    %define GLOBAL +r12-$$
-%else
-    cglobal func, 4,12
-    %define GLOBAL
-%endif
-
-%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
-    SUB     rsp, pad
-    shl     r1d, 4                                            ; MB_INTERLACED*16
-%if %1
-    lea      r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]     ; r12 = sig offset 8x8
-%endif
-    add     r1d, r2d
-    movzx   r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL]    ; r5 = ctx_sig
-    movzx   r7d, word [last_coeff_flag_offset+r1*2 GLOBAL]           ; r7 = ctx_last
-    movzx   r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]        ; r8 = ctx_level
-
-; abs() all the coefficients; copy them to the stack to avoid
-; changing the originals.
-; overreading is okay; it's all valid aligned data anyways.
-%if %1
-    ABS_DCTCOEFS r0, 64
-%else
-    mov      r4, r0                                           ; r4 = dct
-    mov      r6, ~SIZEOF_DCTCOEF
-    and      r6, r4                                           ; handle AC coefficient case
-    ABS_DCTCOEFS r6, 16
-    sub      r4, r6                                           ; calculate our new dct pointer
-    add      r4, rsp                                          ; restore AC coefficient offset
-%endif
-    mov      r1, [%2+gprsize*r2 GLOBAL]
-; for improved OOE performance, run coeff_last on the original coefficients.
-    call     r1                                               ; coeff_last[ctx_block_cat]( dct )
-; we know on 64-bit that the SSE2 versions of this function only
-; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
-; don't need r2 in 8x8 mode.
-    mov     r0d, [r3+cb.bits_encoded]                         ; r0 = cabac.f8_bits_encoded
-; pre-add some values to simplify addressing
-    add      r3, cb.state
-    add      r5, r3
-    add      r7, r3
-    add      r8, r3                                           ; precalculate cabac state pointers
-
-; if( last != count_cat_m1[ctx_block_cat] )
-%if %1
-    cmp     r6b, 63
-%else
-    cmp     r6b, [count_cat_m1+r2 GLOBAL]
-%endif
-    je .skip_last_sigmap
-
-; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
-; so we'll use r11 for this.
-%if %1
-    %define siglast_ctx r11
-%else
-    %define siglast_ctx r6
-%endif
-
-; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
-; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
-    SIG_OFFSET %1
-    movzx   r1d, byte [r5+siglast_ctx]
-    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
-    xor     r1d, 1
-    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
-    mov [r5+siglast_ctx], r9b
-    add     r0d, r1d
-
-    LAST_OFFSET %1
-    movzx   r1d, byte [r7+siglast_ctx]
-    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
-    xor     r1d, 1
-    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
-    mov [r7+siglast_ctx], r9b
-    add     r0d, r1d
-.skip_last_sigmap:
-    LOAD_DCTCOEF r1d
-    COEFF_ABS_LEVEL_GT1 last, 1
-; for( int i = last-1 ; i >= 0; i-- )
-    dec     r6d
-    jl .end
-.coeff_loop:
-    LOAD_DCTCOEF r1d
-; if( l[i] )
-    SIG_OFFSET %1
-    movzx   r9d, byte [r5+siglast_ctx]
-    test    r1d, r1d
-    jnz .coeff_nonzero
-; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
-    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
-    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
-    mov [r5+siglast_ctx], r10b
-    add     r0d, r9d
-    dec     r6d
-    jge .coeff_loop
-    jmp .end
-.coeff_nonzero:
-; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
-    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
-    xor     r9d, 1
-    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
-    mov [r5+siglast_ctx], r10b
-    add     r0d, r9d
-; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
-    LAST_OFFSET %1
-    movzx   r9d, byte [r7+siglast_ctx]
-    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
-    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
-    mov [r7+siglast_ctx], r10b
-    add     r0d, r9d
-    COEFF_ABS_LEVEL_GT1 coeff, 0
-    dec     r6d
-    jge .coeff_loop
-.end:
-    mov [r3+cb.bits_encoded-cb.state], r0d
-    ADD     rsp, pad
-    RET
-%endmacro
-
-%if ARCH_X86_64
-INIT_XMM sse2
-CABAC_RESIDUAL_RD 0, coeff_last_sse2
-CABAC_RESIDUAL_RD 1, coeff_last_sse2
-INIT_XMM sse2,lzcnt
-CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
-CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
-INIT_XMM ssse3
-CABAC_RESIDUAL_RD 0, coeff_last_sse2
-CABAC_RESIDUAL_RD 1, coeff_last_sse2
-INIT_XMM ssse3,lzcnt
-CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
-CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
-%endif
-
-;-----------------------------------------------------------------------------
-; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
-;                                                int ctx_block_cat, x264_cabac_t *cb );
-;-----------------------------------------------------------------------------
-
-%macro CALL_CABAC 0
-%if cpuflag(bmi2)
-    call cabac_encode_decision_bmi2
-%else
-    call cabac_encode_decision_asm
-%endif
-%if WIN64 ; move cabac back
-    mov r0, r3
-%endif
-%endmacro
-
-; %1 = 8x8 mode
-; %2 = dct register
-; %3 = countcat
-; %4 = name
-%macro SIGMAP_LOOP 3-4
-.sigmap_%4loop:
-%if HIGH_BIT_DEPTH
-    mov      %2, [dct+r10*4]
-%else
-    movsx    %2, word [dct+r10*2]
-%endif
-%if %1
-    movzx   r1d, byte [sigoff_8x8 + r10]
-    add     r1d, sigoffd
-%else
-    lea     r1d, [sigoffd + r10d]
-%endif
-    test     %2, %2
-    jz .sigmap_%4zero               ; if( l[i] )
-    inc coeffidxd
-    mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i];
-    mov     r2d, 1
-    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
-%if %1
-    movzx   r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
-    add     r1d, lastoffd
-%else
-    lea     r1d, [lastoffd + r10d]
-%endif
-    cmp    r10d, lastm              ; if( i == last )
-    je .sigmap_%4last
-    xor     r2d, r2d
-    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
-    jmp .sigmap_%4loop_endcheck
-.sigmap_%4zero:
-    xor     r2d, r2d
-    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
-.sigmap_%4loop_endcheck:
-    inc    r10d
-    cmp    r10d, %3
-    jne .sigmap_%4loop              ; if( ++i == count_m1 )
-%if HIGH_BIT_DEPTH
-    mov      %2, [dct+r10*4]
-%else
-    movsx    %2, word [dct+r10*2]
-%endif
-    inc coeffidxd
-    mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i]
-    jmp .sigmap_%4end
-.sigmap_%4last:                     ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
-    mov     r2d, 1
-    CALL_CABAC
-.sigmap_%4end:
-%if %1==0
-    jmp .level_loop_start
-%endif
-%endmacro
-
-%macro CABAC_RESIDUAL 1
-cglobal cabac_block_residual_internal, 4,15
-%ifdef PIC
-; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
-    lea     r7, [$$]
-    %define lastm [rsp+4*1]
-    %define GLOBAL +r7-$$
-%else
-    %define lastm r7d
-    %define GLOBAL
-%endif
-%assign pad gprsize+4*2+4*64-(stack_offset&15)
-    SUB     rsp, pad
-    shl     r1d, 4
-
-    %define sigoffq r8
-    %define sigoffd r8d
-    %define lastoffq r9
-    %define lastoffd r9d
-    %define leveloffq r10
-    %define leveloffd r10d
-    %define leveloffm [rsp+4*0]
-    %define countcatd r11d
-    %define sigoff_8x8 r12
-    %define coeffidxq r13
-    %define coeffidxd r13d
-    %define dct r14
-    %define coeffs rsp+4*2
-
-    lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
-    add     r1d, r2d
-    movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
-    movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
-    movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
-    movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
-    mov coeffidxd, -1
-    mov     dct, r0
-    mov leveloffm, leveloffd
-
-    mov      r1, [%1+gprsize*r2 GLOBAL]
-    call     r1
-    mov   lastm, eax
-; put cabac in r0; needed for cabac_encode_decision
-    mov      r0, r3
-
-    xor    r10d, r10d
-    cmp countcatd, 63
-    je .sigmap_8x8
-    SIGMAP_LOOP 0, r12d, countcatd,
-.sigmap_8x8:
-    SIGMAP_LOOP 1, r11d, 63, _8x8
-.level_loop_start:
-; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
-    %define nodectxq r8
-    %define nodectxd r8d
-    mov leveloffd, leveloffm
-    xor nodectxd, nodectxd
-.level_loop:
-    mov     r9d, [coeffs+coeffidxq*4]
-    mov    r11d, r9d
-    sar    r11d, 31
-    add     r9d, r11d
-    movzx   r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
-    xor     r9d, r11d
-    add     r1d, leveloffd
-    cmp     r9d, 1
-    jg .level_gt1
-    xor     r2d, r2d
-    CALL_CABAC
-    movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
-    jmp .level_sign
-.level_gt1:
-    mov     r2d, 1
-    CALL_CABAC
-    movzx  r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
-    add    r14d, leveloffd
-    cmp     r9d, 15
-    mov    r12d, 15
-    cmovl  r12d, r9d
-    sub    r12d, 2
-    jz .level_eq2
-.level_gt1_loop:
-    mov     r1d, r14d
-    mov     r2d, 1
-    CALL_CABAC
-    dec    r12d
-    jg .level_gt1_loop
-    cmp     r9d, 15
-    jge .level_bypass
-.level_eq2:
-    mov     r1d, r14d
-    xor     r2d, r2d
-    CALL_CABAC
-    jmp .level_gt1_end
-.level_bypass:
-    lea     r2d, [r9d-15]
-    xor     r1d, r1d
-    push     r0
-; we could avoid this if we implemented it in asm, but I don't feel like that
-; right now.
-%if UNIX64
-    push     r7
-    push     r8
-%else
-    sub      rsp, 32 ; shadow space
-%endif
-    call cabac_encode_ue_bypass
-%if UNIX64
-    pop      r8
-    pop      r7
-%else
-    add      rsp, 32
-%endif
-    pop      r0
-.level_gt1_end:
-    movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
-.level_sign:
-    mov     r1d, r11d
-%if cpuflag(bmi2)
-    call cabac_encode_bypass_bmi2
-%else
-    call cabac_encode_bypass_asm
-%endif
-%if WIN64
-    mov      r0, r3
-%endif
-    dec coeffidxd
-    jge .level_loop
-    ADD     rsp, pad
-    RET
-%endmacro
-
-%if ARCH_X86_64
-INIT_XMM sse2
-CABAC_RESIDUAL coeff_last_sse2
-INIT_XMM sse2,lzcnt
-CABAC_RESIDUAL coeff_last_sse2_lzcnt
-INIT_XMM avx2,bmi2
-CABAC_RESIDUAL coeff_last_avx2_lzcnt
-%endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/const-a.asm b/android/src/main/libenc/jni/libx264/common/x86/const-a.asm
deleted file mode 100755
index ea61c81..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/const-a.asm
+++ /dev/null
@@ -1,82 +0,0 @@
-;*****************************************************************************
-;* const-a.asm: x86 global constants
-;*****************************************************************************
-;* Copyright (C) 2010-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-
-SECTION_RODATA 32
-
-const pb_1,        times 32 db 1
-const hsub_mul,    times 16 db 1, -1
-const pw_1,        times 16 dw 1
-const pw_16,       times 16 dw 16
-const pw_32,       times 16 dw 32
-const pw_512,      times 16 dw 512
-const pw_00ff,     times 16 dw 0x00ff
-const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
-const pw_0to15,    dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-const pd_1,        times 8 dd 1
-const pd_0123,     dd 0,1,2,3
-const pd_4567,     dd 4,5,6,7
-const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
-const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
-const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-
-const pb_01,       times  8 db 0,1
-const pb_0,        times 16 db 0
-const pb_a1,       times 16 db 0xa1
-const pb_3,        times 16 db 3
-const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
-
-const pw_2,        times 8 dw 2
-const pw_m2,       times 8 dw -2
-const pw_4,        times 8 dw 4
-const pw_8,        times 8 dw 8
-const pw_64,       times 8 dw 64
-const pw_256,      times 8 dw 256
-const pw_32_0,     times 4 dw 32,
-                   times 4 dw 0
-const pw_8000,     times 8 dw 0x8000
-const pw_3fff,     times 8 dw 0x3fff
-const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
-const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
-const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
-const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
-
-const pd_8,        times 4 dd 8
-const pd_32,       times 4 dd 32
-const pd_1024,     times 4 dd 1024
-const pd_ffff,     times 4 dd 0xffff
-const pw_ff00,     times 8 dw 0xff00
-
-const popcnt_table
-%assign x 0
-%rep 256
-; population count
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
-
-const sw_64,       dd 64
diff --git a/android/src/main/libenc/jni/libx264/common/x86/cpu-a.asm b/android/src/main/libenc/jni/libx264/common/x86/cpu-a.asm
deleted file mode 100755
index 8eccb4a..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/cpu-a.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-;*****************************************************************************
-;* cpu-a.asm: x86 cpu utilities
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;*          Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
-;-----------------------------------------------------------------------------
-cglobal cpu_cpuid, 5,7
-    push rbx
-    push  r4
-    push  r3
-    push  r2
-    push  r1
-    mov  eax, r0d
-    xor  ecx, ecx
-    cpuid
-    pop   r4
-    mov [r4], eax
-    pop   r4
-    mov [r4], ebx
-    pop   r4
-    mov [r4], ecx
-    pop   r4
-    mov [r4], edx
-    pop  rbx
-    RET
-
-;-----------------------------------------------------------------------------
-; void cpu_xgetbv( int op, int *eax, int *edx )
-;-----------------------------------------------------------------------------
-cglobal cpu_xgetbv, 3,7
-    push  r2
-    push  r1
-    mov  ecx, r0d
-    xgetbv
-    pop   r4
-    mov [r4], eax
-    pop   r4
-    mov [r4], edx
-    RET
-
-%if ARCH_X86_64
-
-;-----------------------------------------------------------------------------
-; void stack_align( void (*func)(void*), void *arg );
-;-----------------------------------------------------------------------------
-cglobal stack_align
-    push rbp
-    mov  rbp, rsp
-%if WIN64
-    sub  rsp, 32 ; shadow space
-%endif
-    and  rsp, ~31
-    mov  rax, r0
-    mov   r0, r1
-    mov   r1, r2
-    mov   r2, r3
-    call rax
-    leave
-    ret
-
-%else
-
-;-----------------------------------------------------------------------------
-; int cpu_cpuid_test( void )
-; return 0 if unsupported
-;-----------------------------------------------------------------------------
-cglobal cpu_cpuid_test
-    pushfd
-    push    ebx
-    push    ebp
-    push    esi
-    push    edi
-    pushfd
-    pop     eax
-    mov     ebx, eax
-    xor     eax, 0x200000
-    push    eax
-    popfd
-    pushfd
-    pop     eax
-    xor     eax, ebx
-    pop     edi
-    pop     esi
-    pop     ebp
-    pop     ebx
-    popfd
-    ret
-
-cglobal stack_align
-    push ebp
-    mov  ebp, esp
-    sub  esp, 12
-    and  esp, ~31
-    mov  ecx, [ebp+8]
-    mov  edx, [ebp+12]
-    mov  [esp], edx
-    mov  edx, [ebp+16]
-    mov  [esp+4], edx
-    mov  edx, [ebp+20]
-    mov  [esp+8], edx
-    call ecx
-    leave
-    ret
-
-%endif
-
-;-----------------------------------------------------------------------------
-; void cpu_emms( void )
-;-----------------------------------------------------------------------------
-cglobal cpu_emms
-    emms
-    ret
-
-;-----------------------------------------------------------------------------
-; void cpu_sfence( void )
-;-----------------------------------------------------------------------------
-cglobal cpu_sfence
-    sfence
-    ret
diff --git a/android/src/main/libenc/jni/libx264/common/x86/dct-32.asm b/android/src/main/libenc/jni/libx264/common/x86/dct-32.asm
deleted file mode 100755
index ebc8ebf..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/dct-32.asm
+++ /dev/null
@@ -1,596 +0,0 @@
-;*****************************************************************************
-;* dct-32.asm: x86_32 transform and zigzag
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Holger Lubitz <holger@lubitz.org>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*          Min Chen <chenm001.163.com>
-;*          Christian Heine <sennindemokrit@gmx.net>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION .text
-
-cextern pd_32
-cextern pw_pixel_max
-cextern pw_2
-cextern pw_m2
-cextern pw_32
-cextern hsub_mul
-
-%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
-    %xdefine %%base %1
-    %rep %0/2
-    %xdefine %%tmp m%2
-    %rotate %0/2
-    mova [%%base + %2*16], %%tmp
-    %rotate 1-%0/2
-    %endrep
-%endmacro
-
-%macro UNSPILL_SHUFFLE 3-*
-    %xdefine %%base %1
-    %rep %0/2
-    %xdefine %%tmp m%2
-    %rotate %0/2
-    mova %%tmp, [%%base + %2*16]
-    %rotate 1-%0/2
-    %endrep
-%endmacro
-
-%macro SPILL 2+ ; assume offsets are the same as reg numbers
-    SPILL_SHUFFLE %1, %2, %2
-%endmacro
-
-%macro UNSPILL 2+
-    UNSPILL_SHUFFLE %1, %2, %2
-%endmacro
-
-; in: size, m0..m7
-; out: 0,4,6 in memory at %10,%11,%12, rest in regs
-%macro DCT8_1D 12
-    SUMSUB_BA %1, %9, %2      ; %9 = s07,  %2 = d07
-    SUMSUB_BA %1, %8, %3      ; %8 = s16,  %3 = d16
-    SUMSUB_BA %1, %7, %4      ; %7 = s25,  %4 = d25
-    SUMSUB_BA %1, %6, %5      ; %6 = s34,  %5 = d34
-    SUMSUB_BA %1, %6, %9      ; %6 = a0,   %9 = a2
-    SUMSUB_BA %1, %7, %8      ; %7 = a1,   %8 = a3
-    SUMSUB_BA %1, %7, %6      ; %7 = dst0, %6 = dst4
-    mova     %10, m%7
-    mova     %11, m%6
-    psra%1   m%7, m%8, 1      ; a3>>1
-    padd%1   m%7, m%9         ; a2 + (a3>>1)
-    psra%1   m%9, 1           ; a2>>1
-    psub%1   m%9, m%8         ; (a2>>1) - a3
-    mova     %12, m%9
-    psra%1   m%6, m%4, 1
-    padd%1   m%6, m%4         ; d25+(d25>>1)
-    psub%1   m%8, m%2, m%5    ; a5 = d07-d34-(d25+(d25>>1))
-    psub%1   m%8, m%6
-    psra%1   m%6, m%3, 1
-    padd%1   m%6, m%3         ; d16+(d16>>1)
-    padd%1   m%9, m%2, m%5
-    psub%1   m%9, m%6         ; a6 = d07+d34-(d16+(d16>>1))
-    psra%1   m%6, m%2, 1
-    padd%1   m%6, m%2         ; d07+(d07>>1)
-    padd%1   m%6, m%3
-    padd%1   m%6, m%4         ; a4 = d16+d25+(d07+(d07>>1))
-    psra%1   m%2, m%5, 1
-    padd%1   m%2, m%5         ; d34+(d34>>1)
-    padd%1   m%2, m%3
-    psub%1   m%2, m%4         ; a7 = d16-d25+(d34+(d34>>1))
-    psra%1   m%5, m%2, 2
-    padd%1   m%5, m%6         ; a4 + (a7>>2)
-    psra%1   m%4, m%9, 2
-    padd%1   m%4, m%8         ; a5 + (a6>>2)
-    psra%1   m%6, 2
-    psra%1   m%8, 2
-    psub%1   m%6, m%2         ; (a4>>2) - a7
-    psub%1   m%9, m%8         ; a6 - (a5>>2)
-    SWAP %3, %5, %4, %7, %9, %6
-%endmacro
-
-; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11
-; out: m0..m7
-%macro IDCT8_1D 11
-    psra%1   m%2, m%4, 1
-    psra%1   m%6, m%8, 1
-    psub%1   m%2, m%8
-    padd%1   m%6, m%4
-    psra%1   m%8, m%3, 1
-    padd%1   m%8, m%3
-    padd%1   m%8, m%5
-    padd%1   m%8, m%7
-    psra%1   m%4, m%7, 1
-    padd%1   m%4, m%7
-    padd%1   m%4, m%9
-    psub%1   m%4, m%3
-    psub%1   m%3, m%5
-    psub%1   m%7, m%5
-    padd%1   m%3, m%9
-    psub%1   m%7, m%9
-    psra%1   m%5, 1
-    psra%1   m%9, 1
-    psub%1   m%3, m%5
-    psub%1   m%7, m%9
-    psra%1   m%5, m%8, 2
-    psra%1   m%9, m%4, 2
-    padd%1   m%5, m%7
-    padd%1   m%9, m%3
-    psra%1   m%7, 2
-    psra%1   m%3, 2
-    psub%1   m%8, m%7
-    psub%1   m%3, m%4
-    mova     m%4, %10
-    mova     m%7, %11
-    SUMSUB_BA %1, %7, %4
-    SUMSUB_BA %1, %6, %7
-    SUMSUB_BA %1, %2, %4
-    SUMSUB_BA %1, %8, %6
-    SUMSUB_BA %1, %3, %2
-    SUMSUB_BA %1, %9, %4
-    SUMSUB_BA %1, %5, %7
-    SWAP %2, %4
-    SWAP %6, %8
-    SWAP %2, %6, %7
-    SWAP %4, %9, %8
-%endmacro
-
-%if HIGH_BIT_DEPTH
-
-%macro SUB8x8_DCT8 0
-cglobal sub8x8_dct8, 3,3,8
-global current_function %+ .skip_prologue
-.skip_prologue:
-    LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
-    LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
-
-    DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50]
-    mova  m0, [r0]
-
-    mova  [r0+0x30], m5
-    mova  [r0+0x70], m7
-    TRANSPOSE4x4W 0,1,2,3,4
-    WIDEN_SXWD 0,4
-    WIDEN_SXWD 1,5
-    WIDEN_SXWD 2,6
-    WIDEN_SXWD 3,7
-    DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0]
-    mova  [r0+0x20], m4
-    mova  [r0+0x40], m1
-    mova  [r0+0x60], m5
-    mova  [r0+0xA0], m6
-    mova  [r0+0xE0], m7
-    mova  m4, [r0+0x10]
-    mova  m5, [r0+0x30]
-    mova  m6, [r0+0x50]
-    mova  m7, [r0+0x70]
-
-    TRANSPOSE4x4W 4,5,6,7,0
-    WIDEN_SXWD 4,0
-    WIDEN_SXWD 5,1
-    WIDEN_SXWD 6,2
-    WIDEN_SXWD 7,3
-    DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0]
-    mova  [r0+0x30], m0
-    mova  [r0+0x50], m5
-    mova  [r0+0x70], m1
-    mova  [r0+0xB0], m2
-    mova  [r0+0xF0], m3
-    ret
-%endmacro ; SUB8x8_DCT8
-
-INIT_XMM sse2
-SUB8x8_DCT8
-INIT_XMM sse4
-SUB8x8_DCT8
-INIT_XMM avx
-SUB8x8_DCT8
-
-%macro ADD8x8_IDCT8 0
-cglobal add8x8_idct8, 2,2
-    add r1, 128
-global current_function %+ .skip_prologue
-.skip_prologue:
-    UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6
-    IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0]
-    mova   [r1+0], m4
-    TRANSPOSE4x4D 0,1,2,3,4
-    paddd      m0, [pd_32]
-    mova       m4, [r1+0]
-    SPILL_SHUFFLE   r1, 0,1,2,3, -8,-6,-4,-2
-    TRANSPOSE4x4D 4,5,6,7,3
-    paddd      m4, [pd_32]
-    SPILL_SHUFFLE   r1, 4,5,6,7, 0,2,4,6
-    UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7
-    IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16]
-    mova  [r1+16], m4
-    TRANSPOSE4x4D 0,1,2,3,4
-    mova       m4, [r1+16]
-    mova [r1-112], m0
-    TRANSPOSE4x4D 4,5,6,7,0
-    SPILL_SHUFFLE   r1, 4,5,6,7, 1,3,5,7
-    UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2
-    IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112]
-    SPILL_SHUFFLE   r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1
-    UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7
-    IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16]
-    SPILL_SHUFFLE   r1, 7,6,5, 7,6,5
-    mova       m7, [pw_pixel_max]
-    pxor       m6, m6
-    mova       m5, [r1-128]
-    STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB]
-    mova       m0, [r1-112]
-    STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB]
-    mova       m0, [r1-96]
-    STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB]
-    mova       m0, [r1-80]
-    STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB]
-    mova       m0, [r1-64]
-    STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB]
-    mova       m0, [r1-48]
-    mova       m1, [r1+80]
-    STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB]
-    mova       m0, [r1-32]
-    mova       m1, [r1+96]
-    STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB]
-    mova       m0, [r1-16]
-    mova       m1, [r1+112]
-    STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB]
-    RET
-%endmacro ; ADD8x8_IDCT8
-
-INIT_XMM sse2
-ADD8x8_IDCT8
-INIT_XMM avx
-ADD8x8_IDCT8
-
-%else ; !HIGH_BIT_DEPTH
-
-INIT_MMX
-ALIGN 16
-load_diff_4x8_mmx:
-    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
-    movq  [r0], m0
-    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
-    movq  m0, [r0]
-    ret
-
-cglobal dct8_mmx
-    DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
-    SAVE_MM_PERMUTATION
-    ret
-
-;-----------------------------------------------------------------------------
-; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal sub8x8_dct8_mmx, 3,3
-global sub8x8_dct8_mmx.skip_prologue
-.skip_prologue:
-    RESET_MM_PERMUTATION
-    call load_diff_4x8_mmx
-    call dct8_mmx
-    UNSPILL r0, 0
-    TRANSPOSE4x4W 0,1,2,3,4
-    SPILL r0, 0,1,2,3
-    UNSPILL r0, 4,6
-    TRANSPOSE4x4W 4,5,6,7,0
-    SPILL r0, 4,5,6,7
-    RESET_MM_PERMUTATION
-    add   r1, 4
-    add   r2, 4
-    add   r0, 8
-    call load_diff_4x8_mmx
-    sub   r1, 4
-    sub   r2, 4
-    call dct8_mmx
-    sub   r0, 8
-    UNSPILL r0+8, 4,6
-    TRANSPOSE4x4W 4,5,6,7,0
-    SPILL r0+8, 4,5,6,7
-    UNSPILL r0+8, 0
-    TRANSPOSE4x4W 0,1,2,3,5
-    UNSPILL r0, 4,5,6,7
-    SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
-    movq  mm4, m6 ; depends on the permutation to not produce conflicts
-    movq  mm0, m4
-    movq  mm1, m5
-    movq  mm2, mm4
-    movq  mm3, m7
-    RESET_MM_PERMUTATION
-    UNSPILL r0+8, 4,5,6,7
-    add   r0, 8
-    call dct8_mmx
-    sub   r0, 8
-    SPILL r0+8, 1,2,3,5,7
-    RESET_MM_PERMUTATION
-    UNSPILL r0, 0,1,2,3,4,5,6,7
-    call dct8_mmx
-    SPILL r0, 1,2,3,5,7
-    ret
-
-cglobal idct8_mmx
-    IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
-    SAVE_MM_PERMUTATION
-    ret
-
-%macro ADD_STORE_ROW 3
-    movq  m1, [r0+%1*FDEC_STRIDE]
-    punpckhbw m2, m1, m0
-    punpcklbw m1, m0
-    paddw m1, %2
-    paddw m2, %3
-    packuswb m1, m2
-    movq  [r0+%1*FDEC_STRIDE], m1
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-cglobal add8x8_idct8_mmx, 2,2
-global add8x8_idct8_mmx.skip_prologue
-.skip_prologue:
-    INIT_MMX
-    add word [r1], 32
-    UNSPILL r1, 1,2,3,5,6,7
-    call idct8_mmx
-    SPILL r1, 7
-    TRANSPOSE4x4W 0,1,2,3,7
-    SPILL r1, 0,1,2,3
-    UNSPILL r1, 7
-    TRANSPOSE4x4W 4,5,6,7,0
-    SPILL r1, 4,5,6,7
-    INIT_MMX
-    UNSPILL r1+8, 1,2,3,5,6,7
-    add r1, 8
-    call idct8_mmx
-    sub r1, 8
-    SPILL r1+8, 7
-    TRANSPOSE4x4W 0,1,2,3,7
-    SPILL r1+8, 0,1,2,3
-    UNSPILL r1+8, 7
-    TRANSPOSE4x4W 4,5,6,7,0
-    SPILL r1+8, 4,5,6,7
-    INIT_MMX
-    movq  m3, [r1+0x08]
-    movq  m0, [r1+0x40]
-    movq  [r1+0x40], m3
-    movq  [r1+0x08], m0
-    ; memory layout at this time:
-    ; A0------ A1------
-    ; B0------ F0------
-    ; C0------ G0------
-    ; D0------ H0------
-    ; E0------ E1------
-    ; B1------ F1------
-    ; C1------ G1------
-    ; D1------ H1------
-    UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
-    UNSPILL r1+8, 5,6,7
-    add r1, 8
-    call idct8_mmx
-    sub r1, 8
-    psraw m0, 6
-    psraw m1, 6
-    psraw m2, 6
-    psraw m3, 6
-    psraw m4, 6
-    psraw m5, 6
-    psraw m6, 6
-    psraw m7, 6
-    movq  [r1+0x08], m0 ; mm4
-    movq  [r1+0x48], m4 ; mm5
-    movq  [r1+0x58], m5 ; mm0
-    movq  [r1+0x68], m6 ; mm2
-    movq  [r1+0x78], m7 ; mm6
-    movq  mm5, [r1+0x18]
-    movq  mm6, [r1+0x28]
-    movq  [r1+0x18], m1 ; mm1
-    movq  [r1+0x28], m2 ; mm7
-    movq  mm7, [r1+0x38]
-    movq  [r1+0x38], m3 ; mm3
-    movq  mm1, [r1+0x10]
-    movq  mm2, [r1+0x20]
-    movq  mm3, [r1+0x30]
-    call idct8_mmx
-    psraw m0, 6
-    psraw m1, 6
-    psraw m2, 6
-    psraw m3, 6
-    psraw m4, 6
-    psraw m5, 6
-    psraw m6, 6
-    psraw m7, 6
-    SPILL r1, 0,1,2
-    pxor  m0, m0
-    ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
-    ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
-    ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
-    ADD_STORE_ROW 3, m3, [r1+0x38]
-    ADD_STORE_ROW 4, m4, [r1+0x48]
-    ADD_STORE_ROW 5, m5, [r1+0x58]
-    ADD_STORE_ROW 6, m6, [r1+0x68]
-    ADD_STORE_ROW 7, m7, [r1+0x78]
-    ret
-
-%macro DCT_SUB8 0
-cglobal sub8x8_dct, 3,3
-    add r2, 4*FDEC_STRIDE
-global current_function %+ .skip_prologue
-.skip_prologue:
-%if cpuflag(ssse3)
-    mova m7, [hsub_mul]
-%endif
-    LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
-    SPILL r0, 1,2
-    SWAP 2, 7
-    LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
-    UNSPILL r0, 1
-    SPILL r0, 7
-    SWAP 2, 7
-    UNSPILL r0, 2
-    DCT4_1D 0, 1, 2, 3, 7
-    TRANSPOSE2x4x4W 0, 1, 2, 3, 7
-    UNSPILL r0, 7
-    SPILL r0, 2
-    DCT4_1D 4, 5, 6, 7, 2
-    TRANSPOSE2x4x4W 4, 5, 6, 7, 2
-    UNSPILL r0, 2
-    SPILL r0, 6
-    DCT4_1D 0, 1, 2, 3, 6
-    UNSPILL r0, 6
-    STORE_DCT 0, 1, 2, 3, r0, 0
-    DCT4_1D 4, 5, 6, 7, 3
-    STORE_DCT 4, 5, 6, 7, r0, 64
-    ret
-
-;-----------------------------------------------------------------------------
-; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal sub8x8_dct8, 3,3
-    add r2, 4*FDEC_STRIDE
-global current_function %+ .skip_prologue
-.skip_prologue:
-%if cpuflag(ssse3)
-    mova m7, [hsub_mul]
-    LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
-    SPILL r0, 0,1
-    SWAP 1, 7
-    LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
-    UNSPILL r0, 0,1
-%else
-    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
-    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
-    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
-    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
-    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
-    SPILL r0, 0
-    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
-    UNSPILL r0, 0
-%endif
-    DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
-    UNSPILL r0, 0,4
-    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
-    UNSPILL r0, 4
-    DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
-    SPILL r0, 1,2,3,5,7
-    ret
-%endmacro
-
-INIT_XMM sse2
-%define movdqa movaps
-%define punpcklqdq movlhps
-DCT_SUB8
-%undef movdqa
-%undef punpcklqdq
-INIT_XMM ssse3
-DCT_SUB8
-INIT_XMM avx
-DCT_SUB8
-INIT_XMM xop
-DCT_SUB8
-
-;-----------------------------------------------------------------------------
-; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
-;-----------------------------------------------------------------------------
-%macro ADD8x8 0
-cglobal add8x8_idct, 2,2
-    add r0, 4*FDEC_STRIDE
-global current_function %+ .skip_prologue
-.skip_prologue:
-    UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
-    SBUTTERFLY qdq, 0, 1, 4
-    SBUTTERFLY qdq, 2, 3, 4
-    UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
-    SPILL r1, 0
-    SBUTTERFLY qdq, 4, 5, 0
-    SBUTTERFLY qdq, 6, 7, 0
-    UNSPILL r1,0
-    IDCT4_1D w,0,1,2,3,r1
-    SPILL r1, 4
-    TRANSPOSE2x4x4W 0,1,2,3,4
-    UNSPILL r1, 4
-    IDCT4_1D w,4,5,6,7,r1
-    SPILL r1, 0
-    TRANSPOSE2x4x4W 4,5,6,7,0
-    UNSPILL r1, 0
-    paddw m0, [pw_32]
-    IDCT4_1D w,0,1,2,3,r1
-    paddw m4, [pw_32]
-    IDCT4_1D w,4,5,6,7,r1
-    SPILL r1, 6,7
-    pxor m7, m7
-    DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
-    DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
-    UNSPILL_SHUFFLE r1, 0,2, 6,7
-    DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
-    DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
-    STORE_IDCT m1, m3, m5, m2
-    ret
-%endmacro ; ADD8x8
-
-INIT_XMM sse2
-ADD8x8
-INIT_XMM avx
-ADD8x8
-
-;-----------------------------------------------------------------------------
-; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-%macro ADD8x8_IDCT8 0
-cglobal add8x8_idct8, 2,2
-    add r0, 4*FDEC_STRIDE
-global current_function %+ .skip_prologue
-.skip_prologue:
-    UNSPILL r1, 1,2,3,5,6,7
-    IDCT8_1D   w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
-    SPILL r1, 6
-    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
-    paddw      m0, [pw_32]
-    SPILL r1, 0
-    IDCT8_1D   w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
-    SPILL r1, 6,7
-    pxor       m7, m7
-    DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
-    DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
-    UNSPILL_SHUFFLE r1, 0,2, 6,7
-    DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
-    DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
-    STORE_IDCT m1, m3, m5, m2
-    ret
-%endmacro ; ADD8x8_IDCT8
-
-INIT_XMM sse2
-ADD8x8_IDCT8
-INIT_XMM avx
-ADD8x8_IDCT8
-%endif ; !HIGH_BIT_DEPTH
diff --git a/android/src/main/libenc/jni/libx264/common/x86/dct-64.asm b/android/src/main/libenc/jni/libx264/common/x86/dct-64.asm
deleted file mode 100755
index 7249944..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/dct-64.asm
+++ /dev/null
@@ -1,430 +0,0 @@
-;*****************************************************************************
-;* dct-64.asm: x86_64 transform and zigzag
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Holger Lubitz <holger@lubitz.org>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*          Min Chen <chenm001.163.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION .text
-
-cextern pd_32
-cextern pw_pixel_max
-cextern pw_2
-cextern pw_m2
-cextern pw_32
-cextern hsub_mul
-
-; in: size, m0..m7, temp, temp
-; out: m0..m7
-%macro DCT8_1D 11
-    SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
-    SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
-    SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
-    SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
-
-    SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
-    SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
-
-    psra%1   m%10, m%2, 1
-    padd%1   m%10, m%2
-    padd%1   m%10, m%3
-    padd%1   m%10, m%4 ; %10=a4
-
-    psra%1   m%11, m%5, 1
-    padd%1   m%11, m%5
-    padd%1   m%11, m%3
-    psub%1   m%11, m%4 ; %11=a7
-
-    SUMSUB_BA %1, %5, %2
-    psub%1   m%2, m%4
-    psub%1   m%5, m%3
-    psra%1   m%4, 1
-    psra%1   m%3, 1
-    psub%1   m%2, m%4 ; %2=a5
-    psub%1   m%5, m%3 ; %5=a6
-
-    psra%1   m%3, m%11, 2
-    padd%1   m%3, m%10 ; %3=b1
-    psra%1   m%10, 2
-    psub%1   m%10, m%11 ; %10=b7
-
-    SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
-
-    psra%1   m%4, m%8, 1
-    padd%1   m%4, m%9 ; %4=b2
-    psra%1   m%9, 1
-    psub%1   m%9, m%8 ; %9=b6
-
-    psra%1   m%8, m%5, 2
-    padd%1   m%8, m%2 ; %8=b3
-    psra%1   m%2, 2
-    psub%1   m%5, m%2 ; %5=b5
-
-    SWAP %2, %7, %5, %8, %9, %10
-%endmacro
-
-%macro IDCT8_1D 11
-    SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2
-
-    psra%1   m%10, m%3, 1
-    padd%1   m%10, m%3
-    padd%1   m%10, m%5
-    padd%1   m%10, m%7  ; %9=a7
-
-    psra%1   m%11, m%4, 1
-    psub%1   m%11, m%8 ; %10=a4
-    psra%1   m%8, 1
-    padd%1   m%8, m%4  ; %7=a6
-
-    psra%1   m%4, m%7, 1
-    padd%1   m%4, m%7
-    padd%1   m%4, m%9
-    psub%1   m%4, m%3  ; %3=a5
-
-    psub%1   m%3, m%5
-    psub%1   m%7, m%5
-    padd%1   m%3, m%9
-    psub%1   m%7, m%9
-    psra%1   m%5, 1
-    psra%1   m%9, 1
-    psub%1   m%3, m%5  ; %2=a3
-    psub%1   m%7, m%9  ; %6=a1
-
-    psra%1   m%5, m%10, 2
-    padd%1   m%5, m%7  ; %4=b1
-    psra%1   m%7, 2
-    psub%1   m%10, m%7  ; %9=b7
-
-    SUMSUB_BA %1, %8, %6, %7  ;  %7=b0, %5=b6
-    SUMSUB_BA %1, %11, %2, %7 ; %10=b2, %1=b4
-
-    psra%1   m%9, m%4, 2
-    padd%1   m%9, m%3 ; %8=b3
-    psra%1   m%3, 2
-    psub%1   m%3, m%4 ; %2=b5
-
-    SUMSUB_BA %1, %10, %8, %7  ; %9=c0,  %7=c7
-    SUMSUB_BA %1, %3, %11, %7 ; %2=c1, %10=c6
-    SUMSUB_BA %1, %9, %2, %7  ; %8=c2,  %1=c5
-    SUMSUB_BA %1, %5, %6, %7  ; %4=c3,  %5=c4
-
-    SWAP %11, %4
-    SWAP  %2, %10, %7
-    SWAP  %4, %9, %8
-%endmacro
-
-%if HIGH_BIT_DEPTH
-
-%macro SUB8x8_DCT8 0
-cglobal sub8x8_dct8, 3,3,14
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
-    LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
-
-    DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
-
-    TRANSPOSE4x4W 0,1,2,3,8
-    WIDEN_SXWD 0,8
-    WIDEN_SXWD 1,9
-    WIDEN_SXWD 2,10
-    WIDEN_SXWD 3,11
-    DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
-    mova  [r0+0x00], m0
-    mova  [r0+0x20], m8
-    mova  [r0+0x40], m1
-    mova  [r0+0x60], m9
-    mova  [r0+0x80], m2
-    mova  [r0+0xA0], m10
-    mova  [r0+0xC0], m3
-    mova  [r0+0xE0], m11
-
-    TRANSPOSE4x4W 4,5,6,7,0
-    WIDEN_SXWD 4,0
-    WIDEN_SXWD 5,1
-    WIDEN_SXWD 6,2
-    WIDEN_SXWD 7,3
-    DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
-    mova  [r0+0x10], m4
-    mova  [r0+0x30], m0
-    mova  [r0+0x50], m5
-    mova  [r0+0x70], m1
-    mova  [r0+0x90], m6
-    mova  [r0+0xB0], m2
-    mova  [r0+0xD0], m7
-    mova  [r0+0xF0], m3
-    ret
-%endmacro ; SUB8x8_DCT8
-
-INIT_XMM sse2
-SUB8x8_DCT8
-INIT_XMM sse4
-SUB8x8_DCT8
-INIT_XMM avx
-SUB8x8_DCT8
-
-%macro ADD8x8_IDCT8 0
-cglobal add8x8_idct8, 2,2,16
-    add r1, 128
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    mova     m0, [r1-128]
-    mova     m1, [r1-96]
-    mova     m2, [r1-64]
-    mova     m3, [r1-32]
-    mova     m4, [r1+ 0]
-    mova     m5, [r1+32]
-    mova     m6, [r1+64]
-    mova     m7, [r1+96]
-    IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
-    TRANSPOSE4x4D 0,1,2,3,8
-    TRANSPOSE4x4D 4,5,6,7,8
-    paddd     m0, [pd_32]
-    paddd     m4, [pd_32]
-    mova [r1+64], m6
-    mova [r1+96], m7
-    mova      m8, [r1-112]
-    mova      m9, [r1-80]
-    mova     m10, [r1-48]
-    mova     m11, [r1-16]
-    mova     m12, [r1+16]
-    mova     m13, [r1+48]
-    mova     m14, [r1+80]
-    mova     m15, [r1+112]
-    IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
-    TRANSPOSE4x4D 8,9,10,11,6
-    TRANSPOSE4x4D 12,13,14,15,6
-    IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
-    mova [r1-112], m8
-    mova  [r1-80], m9
-    mova       m6, [r1+64]
-    mova       m7, [r1+96]
-    IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
-    pxor       m8, m8
-    mova       m9, [pw_pixel_max]
-    STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
-    STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
-    STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
-    STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
-    mova       m0, [r1-112]
-    mova       m1, [r1-80]
-    STORE_DIFF  m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
-    STORE_DIFF  m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
-    STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
-    STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
-    ret
-%endmacro ; ADD8x8_IDCT8
-
-INIT_XMM sse2
-ADD8x8_IDCT8
-INIT_XMM avx
-ADD8x8_IDCT8
-
-%else ; !HIGH_BIT_DEPTH
-
-%macro DCT_SUB8 0
-cglobal sub8x8_dct, 3,3,10
-    add r2, 4*FDEC_STRIDE
-%if cpuflag(ssse3)
-    mova m7, [hsub_mul]
-%endif
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    SWAP 7, 9
-    LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
-    LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
-    DCT4_1D 0, 1, 2, 3, 8
-    TRANSPOSE2x4x4W 0, 1, 2, 3, 8
-    DCT4_1D 4, 5, 6, 7, 8
-    TRANSPOSE2x4x4W 4, 5, 6, 7, 8
-    DCT4_1D 0, 1, 2, 3, 8
-    STORE_DCT 0, 1, 2, 3, r0, 0
-    DCT4_1D 4, 5, 6, 7, 8
-    STORE_DCT 4, 5, 6, 7, r0, 64
-    ret
-
-;-----------------------------------------------------------------------------
-; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal sub8x8_dct8, 3,3,11
-    add r2, 4*FDEC_STRIDE
-%if cpuflag(ssse3)
-    mova m7, [hsub_mul]
-%endif
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    SWAP 7, 10
-    LOAD_DIFF8x4  0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
-    LOAD_DIFF8x4  4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
-    DCT8_1D    w, 0,1,2,3,4,5,6,7,8,9
-    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
-    DCT8_1D    w, 0,1,2,3,4,5,6,7,8,9
-    movdqa  [r0+0x00], m0
-    movdqa  [r0+0x10], m1
-    movdqa  [r0+0x20], m2
-    movdqa  [r0+0x30], m3
-    movdqa  [r0+0x40], m4
-    movdqa  [r0+0x50], m5
-    movdqa  [r0+0x60], m6
-    movdqa  [r0+0x70], m7
-    ret
-%endmacro
-
-INIT_XMM sse2
-%define movdqa movaps
-%define punpcklqdq movlhps
-DCT_SUB8
-%undef movdqa
-%undef punpcklqdq
-INIT_XMM ssse3
-DCT_SUB8
-INIT_XMM avx
-DCT_SUB8
-INIT_XMM xop
-DCT_SUB8
-
-INIT_YMM avx2
-cglobal sub16x16_dct8, 3,3,10
-    add  r0, 128
-    add  r2, 4*FDEC_STRIDE
-    call .sub16x8_dct8
-    add  r0, 256
-    add  r1, FENC_STRIDE*8
-    add  r2, FDEC_STRIDE*8
-    call .sub16x8_dct8
-    RET
-.sub16x8_dct8:
-    LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
-    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
-    LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
-    LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
-    DCT8_1D    w, 0,1,2,3,4,5,6,7,8,9
-    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
-    DCT8_1D    w, 0,1,2,3,4,5,6,7,8,9
-    mova    [r0-0x80+0x00], xm0
-    vextracti128 [r0+0x00], m0, 1
-    mova    [r0-0x80+0x10], xm1
-    vextracti128 [r0+0x10], m1, 1
-    mova    [r0-0x80+0x20], xm2
-    vextracti128 [r0+0x20], m2, 1
-    mova    [r0-0x80+0x30], xm3
-    vextracti128 [r0+0x30], m3, 1
-    mova    [r0-0x80+0x40], xm4
-    vextracti128 [r0+0x40], m4, 1
-    mova    [r0-0x80+0x50], xm5
-    vextracti128 [r0+0x50], m5, 1
-    mova    [r0-0x80+0x60], xm6
-    vextracti128 [r0+0x60], m6, 1
-    mova    [r0-0x80+0x70], xm7
-    vextracti128 [r0+0x70], m7, 1
-    ret
-
-;-----------------------------------------------------------------------------
-; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-%macro ADD8x8_IDCT8 0
-cglobal add8x8_idct8, 2,2,11
-    add r0, 4*FDEC_STRIDE
-    pxor m7, m7
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    SWAP 7, 9
-    movdqa  m0, [r1+0x00]
-    movdqa  m1, [r1+0x10]
-    movdqa  m2, [r1+0x20]
-    movdqa  m3, [r1+0x30]
-    movdqa  m4, [r1+0x40]
-    movdqa  m5, [r1+0x50]
-    movdqa  m6, [r1+0x60]
-    movdqa  m7, [r1+0x70]
-    IDCT8_1D      w,0,1,2,3,4,5,6,7,8,10
-    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
-    paddw         m0, [pw_32] ; rounding for the >>6 at the end
-    IDCT8_1D      w,0,1,2,3,4,5,6,7,8,10
-    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
-    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
-    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
-    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
-    STORE_IDCT m1, m3, m5, m7
-    ret
-%endmacro ; ADD8x8_IDCT8
-
-INIT_XMM sse2
-ADD8x8_IDCT8
-INIT_XMM avx
-ADD8x8_IDCT8
-
-;-----------------------------------------------------------------------------
-; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
-;-----------------------------------------------------------------------------
-%macro ADD8x8 0
-cglobal add8x8_idct, 2,2,11
-    add  r0, 4*FDEC_STRIDE
-    pxor m7, m7
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    SWAP 7, 9
-    mova   m0, [r1+ 0]
-    mova   m2, [r1+16]
-    mova   m1, [r1+32]
-    mova   m3, [r1+48]
-    SBUTTERFLY qdq, 0, 1, 4
-    SBUTTERFLY qdq, 2, 3, 4
-    mova   m4, [r1+64]
-    mova   m6, [r1+80]
-    mova   m5, [r1+96]
-    mova   m7, [r1+112]
-    SBUTTERFLY qdq, 4, 5, 8
-    SBUTTERFLY qdq, 6, 7, 8
-    IDCT4_1D w,0,1,2,3,8,10
-    TRANSPOSE2x4x4W 0,1,2,3,8
-    IDCT4_1D w,4,5,6,7,8,10
-    TRANSPOSE2x4x4W 4,5,6,7,8
-    paddw m0, [pw_32]
-    IDCT4_1D w,0,1,2,3,8,10
-    paddw m4, [pw_32]
-    IDCT4_1D w,4,5,6,7,8,10
-    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
-    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
-    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
-    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
-    STORE_IDCT m1, m3, m5, m7
-    ret
-%endmacro ; ADD8x8
-
-INIT_XMM sse2
-ADD8x8
-INIT_XMM avx
-ADD8x8
-
-%endif ; !HIGH_BIT_DEPTH
diff --git a/android/src/main/libenc/jni/libx264/common/x86/dct-a.asm b/android/src/main/libenc/jni/libx264/common/x86/dct-a.asm
deleted file mode 100755
index 150a6ed..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/dct-a.asm
+++ /dev/null
@@ -1,1885 +0,0 @@
-;*****************************************************************************
-;* dct-a.asm: x86 transform and zigzag
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Holger Lubitz <holger@lubitz.org>
-;*          Loren Merritt <lorenm@u.washington.edu>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*          Min Chen <chenm001.163.com>
-;*          Fiona Glaser <fiona@x264.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-pw_ppmmmmpp:    dw 1,1,-1,-1,-1,-1,1,1
-pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
-pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
-pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
-pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
-pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
-pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
-pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
-
-pb_scan8framet1: SHUFFLE_MASK_W 0,  1,  6,  7,  8,  9, 13, 14
-pb_scan8framet2: SHUFFLE_MASK_W 2 , 3,  4,  7,  9, 15, 10, 14
-pb_scan8framet3: SHUFFLE_MASK_W 0,  1,  5,  6,  8, 11, 12, 13
-pb_scan8framet4: SHUFFLE_MASK_W 0,  3,  4,  5,  8, 11, 12, 15
-pb_scan8framet5: SHUFFLE_MASK_W 1,  2,  6,  7,  9, 10, 13, 14
-pb_scan8framet6: SHUFFLE_MASK_W 0,  3,  4,  5, 10, 11, 12, 15
-pb_scan8framet7: SHUFFLE_MASK_W 1,  2,  6,  7,  8,  9, 14, 15
-pb_scan8framet8: SHUFFLE_MASK_W 0,  1,  2,  7,  8, 10, 11, 14
-pb_scan8framet9: SHUFFLE_MASK_W 1,  4,  5,  7,  8, 13, 14, 15
-
-pb_scan8frame1: SHUFFLE_MASK_W  0,  8,  1,  2,  9, 12,  4, 13
-pb_scan8frame2: SHUFFLE_MASK_W  4,  0,  1,  5,  8, 10, 12, 14
-pb_scan8frame3: SHUFFLE_MASK_W 12, 10,  8,  6,  2,  3,  7,  9
-pb_scan8frame4: SHUFFLE_MASK_W  0,  1,  8, 12,  4, 13,  9,  2
-pb_scan8frame5: SHUFFLE_MASK_W  5, 14, 10,  3, 11, 15,  6,  7
-pb_scan8frame6: SHUFFLE_MASK_W  6,  8, 12, 13,  9,  7,  5,  3
-pb_scan8frame7: SHUFFLE_MASK_W  1,  3,  5,  7, 10, 14, 15, 11
-pb_scan8frame8: SHUFFLE_MASK_W  10, 3, 11, 14,  5,  6, 15,  7
-
-pb_scan8field1 : SHUFFLE_MASK_W    0,   1,   2,   8,   9,   3,   4,  10
-pb_scan8field2a: SHUFFLE_MASK_W 0x80,  11,   5,   6,   7,  12,0x80,0x80
-pb_scan8field2b: SHUFFLE_MASK_W    0,0x80,0x80,0x80,0x80,0x80,   1,   8
-pb_scan8field3a: SHUFFLE_MASK_W   10,   5,   6,   7,  11,0x80,0x80,0x80
-pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80,   1,   8,   2
-pb_scan8field4a: SHUFFLE_MASK_W    4,   5,   6,   7,  11,0x80,0x80,0x80
-pb_scan8field6 : SHUFFLE_MASK_W    4,   5,   6,   7,  11,0x80,0x80,  12
-pb_scan8field7 : SHUFFLE_MASK_W    5,   6,   7,  11,0x80,0x80,  12,  13
-
-SECTION .text
-
-cextern pw_32_0
-cextern pw_32
-cextern pw_512
-cextern pw_8000
-cextern pw_pixel_max
-cextern hsub_mul
-cextern pb_1
-cextern pw_1
-cextern pd_1
-cextern pd_32
-cextern pw_ppppmmmm
-cextern pw_pmpmpmpm
-cextern deinterleave_shufd
-cextern pb_unpackbd1
-cextern pb_unpackbd2
-
-%macro WALSH4_1D 6
-    SUMSUB_BADC %1, %5, %4, %3, %2, %6
-    SUMSUB_BADC %1, %5, %3, %4, %2, %6
-    SWAP %2, %5, %4
-%endmacro
-
-%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
-    movq  m%3, m%4
-    pxor  m%1, m%4
-    psubw m%3, m%2
-    pxor  m%2, m%4
-    pavgw m%3, m%1
-    pavgw m%2, m%1
-    pxor  m%3, m%4
-    pxor  m%2, m%4
-    SWAP %1, %2, %3
-%endmacro
-
-%macro DCT_UNPACK 3
-    punpcklwd %3, %1
-    punpckhwd %2, %1
-    psrad     %3, 16
-    psrad     %2, 16
-    SWAP      %1, %3
-%endmacro
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void dct4x4dc( dctcoef d[4][4] )
-;-----------------------------------------------------------------------------
-%macro DCT4x4_DC 0
-cglobal dct4x4dc, 1,1,5
-    mova   m0, [r0+ 0]
-    mova   m1, [r0+16]
-    mova   m2, [r0+32]
-    mova   m3, [r0+48]
-    WALSH4_1D  d, 0,1,2,3,4
-    TRANSPOSE4x4D 0,1,2,3,4
-    paddd  m0, [pd_1]
-    WALSH4_1D  d, 0,1,2,3,4
-    psrad  m0, 1
-    psrad  m1, 1
-    psrad  m2, 1
-    psrad  m3, 1
-    mova [r0+ 0], m0
-    mova [r0+16], m1
-    mova [r0+32], m2
-    mova [r0+48], m3
-    RET
-%endmacro ; DCT4x4_DC
-
-INIT_XMM sse2
-DCT4x4_DC
-INIT_XMM avx
-DCT4x4_DC
-%else
-
-INIT_MMX mmx2
-cglobal dct4x4dc, 1,1
-    movq   m3, [r0+24]
-    movq   m2, [r0+16]
-    movq   m1, [r0+ 8]
-    movq   m0, [r0+ 0]
-    movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
-    WALSH4_1D  w, 0,1,2,3,4
-    TRANSPOSE4x4W 0,1,2,3,4
-    SUMSUB_BADC w, 1, 0, 3, 2, 4
-    SWAP 0, 1
-    SWAP 2, 3
-    SUMSUB_17BIT 0,2,4,7
-    SUMSUB_17BIT 1,3,5,7
-    movq  [r0+0], m0
-    movq  [r0+8], m2
-    movq [r0+16], m3
-    movq [r0+24], m1
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void idct4x4dc( int32_t d[4][4] )
-;-----------------------------------------------------------------------------
-%macro IDCT4x4DC 0
-cglobal idct4x4dc, 1,1
-    mova   m3, [r0+48]
-    mova   m2, [r0+32]
-    mova   m1, [r0+16]
-    mova   m0, [r0+ 0]
-    WALSH4_1D  d,0,1,2,3,4
-    TRANSPOSE4x4D 0,1,2,3,4
-    WALSH4_1D  d,0,1,2,3,4
-    mova  [r0+ 0], m0
-    mova  [r0+16], m1
-    mova  [r0+32], m2
-    mova  [r0+48], m3
-    RET
-%endmacro ; IDCT4x4DC
-
-INIT_XMM sse2
-IDCT4x4DC
-INIT_XMM avx
-IDCT4x4DC
-%else
-
-;-----------------------------------------------------------------------------
-; void idct4x4dc( int16_t d[4][4] )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx
-cglobal idct4x4dc, 1,1
-    movq   m3, [r0+24]
-    movq   m2, [r0+16]
-    movq   m1, [r0+ 8]
-    movq   m0, [r0+ 0]
-    WALSH4_1D  w,0,1,2,3,4
-    TRANSPOSE4x4W 0,1,2,3,4
-    WALSH4_1D  w,0,1,2,3,4
-    movq  [r0+ 0], m0
-    movq  [r0+ 8], m1
-    movq  [r0+16], m2
-    movq  [r0+24], m3
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
-;-----------------------------------------------------------------------------
-%if WIN64
-    DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
-%else
-    DECLARE_REG_TMP 2
-%endif
-
-%macro INSERT_COEFF 3 ; dst, src, imm
-    %if %3
-        %if HIGH_BIT_DEPTH
-            %if cpuflag(sse4)
-                pinsrd %1, %2, %3
-            %elif %3 == 2
-                movd       m2, %2
-            %elif %3 == 1
-                punpckldq  %1, %2
-            %else
-                punpckldq  m2, %2
-                punpcklqdq %1, m2
-            %endif
-        %else
-            %if %3 == 2
-                punpckldq  %1, %2
-            %else
-                pinsrw %1, %2, %3
-            %endif
-        %endif
-    %else
-        movd %1, %2
-    %endif
-    %if HIGH_BIT_DEPTH
-        mov %2, t0d
-    %else
-        mov %2, t0w
-    %endif
-%endmacro
-
-%macro DCT2x4DC 2
-cglobal dct2x4dc, 2,3
-    xor          t0d, t0d
-    INSERT_COEFF  m0, [r1+0*16*SIZEOF_DCTCOEF], 0
-    INSERT_COEFF  m0, [r1+1*16*SIZEOF_DCTCOEF], 2
-    add           r1, 4*16*SIZEOF_DCTCOEF
-    INSERT_COEFF  m0, [r1-2*16*SIZEOF_DCTCOEF], 1
-    INSERT_COEFF  m0, [r1-1*16*SIZEOF_DCTCOEF], 3
-    INSERT_COEFF  m1, [r1+0*16*SIZEOF_DCTCOEF], 0
-    INSERT_COEFF  m1, [r1+1*16*SIZEOF_DCTCOEF], 2
-    INSERT_COEFF  m1, [r1+2*16*SIZEOF_DCTCOEF], 1
-    INSERT_COEFF  m1, [r1+3*16*SIZEOF_DCTCOEF], 3
-    SUMSUB_BA     %1, 1, 0, 2
-    SBUTTERFLY    %2, 1, 0, 2
-    SUMSUB_BA     %1, 0, 1, 2
-    SBUTTERFLY    %2, 0, 1, 2
-    SUMSUB_BA     %1, 1, 0, 2
-    pshuf%1       m0, m0, q1032
-    mova        [r0], m1
-    mova [r0+mmsize], m0
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-DCT2x4DC d, dq
-INIT_XMM avx
-DCT2x4DC d, dq
-%else
-INIT_MMX mmx2
-DCT2x4DC w, wd
-%endif
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx
-cglobal sub4x4_dct, 3,3
-.skip_prologue:
-    LOAD_DIFF  m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
-    LOAD_DIFF  m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
-    LOAD_DIFF  m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
-    LOAD_DIFF  m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
-    DCT4_1D 0,1,2,3,4
-    TRANSPOSE4x4W 0,1,2,3,4
-
-    SUMSUB_BADC w, 3, 0, 2, 1
-    SUMSUB_BA   w, 2, 3, 4
-    DCT_UNPACK m2, m4, m5
-    DCT_UNPACK m3, m6, m7
-    mova  [r0+ 0], m2 ; s03 + s12
-    mova  [r0+ 8], m4
-    mova  [r0+32], m3 ; s03 - s12
-    mova  [r0+40], m6
-
-    DCT_UNPACK m0, m2, m4
-    DCT_UNPACK m1, m3, m5
-    SUMSUB2_AB  d, 0, 1, 4
-    SUMSUB2_AB  d, 2, 3, 5
-    mova  [r0+16], m0 ; d03*2 + d12
-    mova  [r0+24], m2
-    mova  [r0+48], m4 ; d03 - 2*d12
-    mova  [r0+56], m5
-    RET
-%else
-
-%macro SUB_DCT4 0
-cglobal sub4x4_dct, 3,3
-.skip_prologue:
-%if cpuflag(ssse3)
-    mova m5, [hsub_mul]
-%endif
-    LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
-    DCT4_1D 0,1,2,3,4
-    TRANSPOSE4x4W 0,1,2,3,4
-    DCT4_1D 0,1,2,3,4
-    movq  [r0+ 0], m0
-    movq  [r0+ 8], m1
-    movq  [r0+16], m2
-    movq  [r0+24], m3
-    RET
-%endmacro
-
-INIT_MMX mmx
-SUB_DCT4
-INIT_MMX ssse3
-SUB_DCT4
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
-;-----------------------------------------------------------------------------
-%macro STORE_DIFFx2 6
-    psrad     %1, 6
-    psrad     %2, 6
-    packssdw  %1, %2
-    movq      %3, %5
-    movhps    %3, %6
-    paddsw    %1, %3
-    CLIPW     %1, %4, [pw_pixel_max]
-    movq      %5, %1
-    movhps    %6, %1
-%endmacro
-
-%macro ADD4x4_IDCT 0
-cglobal add4x4_idct, 2,2,6
-    add   r0, 2*FDEC_STRIDEB
-.skip_prologue:
-    mova  m1, [r1+16]
-    mova  m3, [r1+48]
-    mova  m2, [r1+32]
-    mova  m0, [r1+ 0]
-    IDCT4_1D d,0,1,2,3,4,5
-    TRANSPOSE4x4D 0,1,2,3,4
-    paddd m0, [pd_32]
-    IDCT4_1D d,0,1,2,3,4,5
-    pxor  m5, m5
-    STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
-    STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
-    RET
-%endmacro
-
-INIT_XMM sse2
-ADD4x4_IDCT
-INIT_XMM avx
-ADD4x4_IDCT
-
-%else ; !HIGH_BIT_DEPTH
-
-INIT_MMX mmx
-cglobal add4x4_idct, 2,2
-    pxor m7, m7
-.skip_prologue:
-    movq  m1, [r1+ 8]
-    movq  m3, [r1+24]
-    movq  m2, [r1+16]
-    movq  m0, [r1+ 0]
-    IDCT4_1D w,0,1,2,3,4,5
-    TRANSPOSE4x4W 0,1,2,3,4
-    paddw m0, [pw_32]
-    IDCT4_1D w,0,1,2,3,4,5
-    STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
-    STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
-    STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
-    STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
-    RET
-
-%macro ADD4x4 0
-cglobal add4x4_idct, 2,2,6
-    mova      m1, [r1+0x00]     ; row1/row0
-    mova      m3, [r1+0x10]     ; row3/row2
-    psraw     m0, m1, 1         ; row1>>1/...
-    psraw     m2, m3, 1         ; row3>>1/...
-    movsd     m0, m1            ; row1>>1/row0
-    movsd     m2, m3            ; row3>>1/row2
-    psubw     m0, m3            ; row1>>1-row3/row0-2
-    paddw     m2, m1            ; row3>>1+row1/row0+2
-    SBUTTERFLY2 wd, 0, 2, 1
-    SUMSUB_BA w, 2, 0, 1
-    pshuflw   m1, m2, q2301
-    pshufhw   m2, m2, q2301
-    punpckldq m1, m0
-    punpckhdq m2, m0
-    SWAP       0, 1
-
-    mova      m1, [pw_32_0]
-    paddw     m1, m0            ; row1/row0 corrected
-    psraw     m0, 1             ; row1>>1/...
-    psraw     m3, m2, 1         ; row3>>1/...
-    movsd     m0, m1            ; row1>>1/row0
-    movsd     m3, m2            ; row3>>1/row2
-    psubw     m0, m2            ; row1>>1-row3/row0-2
-    paddw     m3, m1            ; row3>>1+row1/row0+2
-    SBUTTERFLY2 qdq, 0, 3, 1
-    SUMSUB_BA w, 3, 0, 1
-
-    movd      m4, [r0+FDEC_STRIDE*0]
-    movd      m1, [r0+FDEC_STRIDE*1]
-    movd      m2, [r0+FDEC_STRIDE*2]
-    movd      m5, [r0+FDEC_STRIDE*3]
-    punpckldq m1, m4            ; row0/row1
-    pxor      m4, m4
-    punpckldq m2, m5            ; row3/row2
-    punpcklbw m1, m4
-    psraw     m3, 6
-    punpcklbw m2, m4
-    psraw     m0, 6
-    paddsw    m3, m1
-    paddsw    m0, m2
-    packuswb  m0, m3            ; row0/row1/row3/row2
-    pextrd   [r0+FDEC_STRIDE*0], m0, 3
-    pextrd   [r0+FDEC_STRIDE*1], m0, 2
-    movd     [r0+FDEC_STRIDE*2], m0
-    pextrd   [r0+FDEC_STRIDE*3], m0, 1
-    RET
-%endmacro ; ADD4x4
-
-INIT_XMM sse4
-ADD4x4
-INIT_XMM avx
-ADD4x4
-
-%macro STOREx2_AVX2 9
-    movq      xm%3, [r0+%5*FDEC_STRIDE]
-    vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
-    movq      xm%4, [r0+%7*FDEC_STRIDE]
-    vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
-    punpcklbw  m%3, m%9
-    punpcklbw  m%4, m%9
-    psraw      m%1, 6
-    psraw      m%2, 6
-    paddsw     m%1, m%3
-    paddsw     m%2, m%4
-    packuswb   m%1, m%2
-    vextracti128 xm%2, m%1, 1
-    movq   [r0+%5*FDEC_STRIDE], xm%1
-    movq   [r0+%6*FDEC_STRIDE], xm%2
-    movhps [r0+%7*FDEC_STRIDE], xm%1
-    movhps [r0+%8*FDEC_STRIDE], xm%2
-%endmacro
-
-INIT_YMM avx2
-cglobal add8x8_idct, 2,3,8
-    add    r0, 4*FDEC_STRIDE
-    pxor   m7, m7
-    TAIL_CALL .skip_prologue, 0
-global current_function %+ .skip_prologue
-.skip_prologue:
-    ; TRANSPOSE4x4Q
-    mova       xm0, [r1+ 0]
-    mova       xm1, [r1+32]
-    mova       xm2, [r1+16]
-    mova       xm3, [r1+48]
-    vinserti128 m0, m0, [r1+ 64], 1
-    vinserti128 m1, m1, [r1+ 96], 1
-    vinserti128 m2, m2, [r1+ 80], 1
-    vinserti128 m3, m3, [r1+112], 1
-    SBUTTERFLY qdq, 0, 1, 4
-    SBUTTERFLY qdq, 2, 3, 4
-    IDCT4_1D w,0,1,2,3,4,5
-    TRANSPOSE2x4x4W 0,1,2,3,4
-    paddw m0, [pw_32]
-    IDCT4_1D w,0,1,2,3,4,5
-    STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
-    STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
-    ret
-
-; 2xdst, 2xtmp, 4xsrcrow, 1xzero
-%macro LOAD_DIFF8x2_AVX2 9
-    movq    xm%1, [r1+%5*FENC_STRIDE]
-    movq    xm%2, [r1+%6*FENC_STRIDE]
-    vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
-    vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
-    punpcklbw m%1, m%9
-    punpcklbw m%2, m%9
-    movq    xm%3, [r2+(%5-4)*FDEC_STRIDE]
-    movq    xm%4, [r2+(%6-4)*FDEC_STRIDE]
-    vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
-    vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
-    punpcklbw m%3, m%9
-    punpcklbw m%4, m%9
-    psubw    m%1, m%3
-    psubw    m%2, m%4
-%endmacro
-
-; 4x src, 1x tmp
-%macro STORE8_DCT_AVX2 5
-    SBUTTERFLY qdq, %1, %2, %5
-    SBUTTERFLY qdq, %3, %4, %5
-    mova [r0+  0], xm%1
-    mova [r0+ 16], xm%3
-    mova [r0+ 32], xm%2
-    mova [r0+ 48], xm%4
-    vextracti128 [r0+ 64], m%1, 1
-    vextracti128 [r0+ 80], m%3, 1
-    vextracti128 [r0+ 96], m%2, 1
-    vextracti128 [r0+112], m%4, 1
-%endmacro
-
-%macro STORE16_DCT_AVX2 5
-    SBUTTERFLY qdq, %1, %2, %5
-    SBUTTERFLY qdq, %3, %4, %5
-    mova [r0+ 0-128], xm%1
-    mova [r0+16-128], xm%3
-    mova [r0+32-128], xm%2
-    mova [r0+48-128], xm%4
-    vextracti128 [r0+ 0], m%1, 1
-    vextracti128 [r0+16], m%3, 1
-    vextracti128 [r0+32], m%2, 1
-    vextracti128 [r0+48], m%4, 1
-%endmacro
-
-INIT_YMM avx2
-cglobal sub8x8_dct, 3,3,7
-    pxor m6, m6
-    add r2, 4*FDEC_STRIDE
-    LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
-    LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
-    DCT4_1D 0, 1, 2, 3, 4
-    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
-    DCT4_1D 0, 1, 2, 3, 4
-    STORE8_DCT_AVX2 0, 1, 2, 3, 4
-    RET
-
-INIT_YMM avx2
-cglobal sub16x16_dct, 3,3,6
-    add r0, 128
-    add r2, 4*FDEC_STRIDE
-    call .sub16x4_dct
-    add r0, 64
-    add r1, 4*FENC_STRIDE
-    add r2, 4*FDEC_STRIDE
-    call .sub16x4_dct
-    add r0, 256-64
-    add r1, 4*FENC_STRIDE
-    add r2, 4*FDEC_STRIDE
-    call .sub16x4_dct
-    add r0, 64
-    add r1, 4*FENC_STRIDE
-    add r2, 4*FDEC_STRIDE
-    call .sub16x4_dct
-    RET
-.sub16x4_dct:
-    LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
-    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
-    DCT4_1D 0, 1, 2, 3, 4
-    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
-    DCT4_1D 0, 1, 2, 3, 4
-    STORE16_DCT_AVX2 0, 1, 2, 3, 4
-    ret
-%endif ; HIGH_BIT_DEPTH
-
-INIT_MMX
-;-----------------------------------------------------------------------------
-; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-%macro SUB_NxN_DCT 7
-cglobal %1, 3,3,%7
-%if HIGH_BIT_DEPTH == 0
-%if mmsize == 8
-    pxor m7, m7
-%else
-    add r2, 4*FDEC_STRIDE
-    mova m7, [hsub_mul]
-%endif
-%endif ; !HIGH_BIT_DEPTH
-.skip_prologue:
-    call %2.skip_prologue
-    add  r0, %3
-    add  r1, %4-%5-%6*FENC_STRIDE
-    add  r2, %4-%5-%6*FDEC_STRIDE
-    call %2.skip_prologue
-    add  r0, %3
-    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
-    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
-    call %2.skip_prologue
-    add  r0, %3
-    add  r1, %4-%5-%6*FENC_STRIDE
-    add  r2, %4-%5-%6*FDEC_STRIDE
-    TAIL_CALL %2.skip_prologue, 1
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
-;-----------------------------------------------------------------------------
-%macro ADD_NxN_IDCT 6-7
-%if HIGH_BIT_DEPTH
-cglobal %1, 2,2,%7
-%if %3==256
-    add r1, 128
-%endif
-%else
-cglobal %1, 2,2,11
-    pxor m7, m7
-%endif
-%if mmsize>=16 && %3!=256
-    add  r0, 4*FDEC_STRIDE
-%endif
-.skip_prologue:
-    call %2.skip_prologue
-    add  r0, %4-%5-%6*FDEC_STRIDE
-    add  r1, %3
-    call %2.skip_prologue
-    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
-    add  r1, %3
-    call %2.skip_prologue
-    add  r0, %4-%5-%6*FDEC_STRIDE
-    add  r1, %3
-    TAIL_CALL %2.skip_prologue, 1
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_MMX
-SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   64,  8, 0, 0, 0
-SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   64, 16, 8, 8, 0
-INIT_XMM
-ADD_NxN_IDCT add8x8_idct_sse2,   add4x4_idct_sse2, 64,  8, 0, 0, 6
-ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
-ADD_NxN_IDCT add8x8_idct_avx,    add4x4_idct_avx,  64,  8, 0, 0, 6
-ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  64, 16, 8, 8, 6
-cextern add8x8_idct8_sse2.skip_prologue
-cextern add8x8_idct8_avx.skip_prologue
-ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
-ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  256, 16, 0, 0, 16
-cextern sub8x8_dct8_sse2.skip_prologue
-cextern sub8x8_dct8_sse4.skip_prologue
-cextern sub8x8_dct8_avx.skip_prologue
-SUB_NxN_DCT  sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
-SUB_NxN_DCT  sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
-SUB_NxN_DCT  sub16x16_dct8_avx,  sub8x8_dct8_avx,  256, 16, 0, 0, 14
-%else ; !HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 0
-INIT_MMX
-SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   32, 4, 0, 0, 0
-ADD_NxN_IDCT add8x8_idct_mmx,    add4x4_idct_mmx,  32, 4, 0, 0
-SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   32, 8, 4, 4, 0
-ADD_NxN_IDCT add16x16_idct_mmx,  add8x8_idct_mmx,  32, 8, 4, 4
-
-cextern sub8x8_dct8_mmx.skip_prologue
-cextern add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT  sub16x16_dct8_mmx,  sub8x8_dct8_mmx,  128, 8, 0, 0, 0
-ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
-%endif
-
-INIT_XMM
-cextern sub8x8_dct_sse2.skip_prologue
-cextern sub8x8_dct_ssse3.skip_prologue
-cextern sub8x8_dct_avx.skip_prologue
-cextern sub8x8_dct_xop.skip_prologue
-SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2,  128, 8, 0, 0, 10
-SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
-SUB_NxN_DCT  sub16x16_dct_avx,   sub8x8_dct_avx,   128, 8, 0, 0, 10
-SUB_NxN_DCT  sub16x16_dct_xop,   sub8x8_dct_xop,   128, 8, 0, 0, 10
-
-cextern add8x8_idct_sse2.skip_prologue
-cextern add8x8_idct_avx.skip_prologue
-ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  128, 8, 0, 0
-
-cextern add8x8_idct8_sse2.skip_prologue
-cextern add8x8_idct8_avx.skip_prologue
-ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
-ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  128, 8, 0, 0
-
-cextern sub8x8_dct8_sse2.skip_prologue
-cextern sub8x8_dct8_ssse3.skip_prologue
-cextern sub8x8_dct8_avx.skip_prologue
-SUB_NxN_DCT  sub16x16_dct8_sse2,  sub8x8_dct8_sse2,  128, 8, 0, 0, 11
-SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
-SUB_NxN_DCT  sub16x16_dct8_avx,   sub8x8_dct8_avx,   128, 8, 0, 0, 11
-
-INIT_YMM
-ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
-;-----------------------------------------------------------------------------
-%macro ADD_DC 2
-    mova    m0, [%1+FDEC_STRIDEB*0] ; 8pixels
-    mova    m1, [%1+FDEC_STRIDEB*1]
-    mova    m2, [%1+FDEC_STRIDEB*2]
-    paddsw  m0, %2
-    paddsw  m1, %2
-    paddsw  m2, %2
-    paddsw  %2, [%1+FDEC_STRIDEB*3]
-    CLIPW   m0, m5, m6
-    CLIPW   m1, m5, m6
-    CLIPW   m2, m5, m6
-    CLIPW   %2, m5, m6
-    mova    [%1+FDEC_STRIDEB*0], m0
-    mova    [%1+FDEC_STRIDEB*1], m1
-    mova    [%1+FDEC_STRIDEB*2], m2
-    mova    [%1+FDEC_STRIDEB*3], %2
-%endmacro
-
-%macro ADD_IDCT_DC 0
-cglobal add8x8_idct_dc, 2,2,7
-    mova        m6, [pw_pixel_max]
-    pxor        m5, m5
-    mova        m3, [r1]
-    paddd       m3, [pd_32]
-    psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
-    pshuflw     m4, m3, q2200 ; dc0 dc0 dc1 dc1   _   _   _   _
-    pshufhw     m3, m3, q2200 ;   _   _   _   _ dc2 dc2 dc3 dc3
-    pshufd      m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
-    pshufd      m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
-    ADD_DC r0+FDEC_STRIDEB*0, m4
-    ADD_DC r0+FDEC_STRIDEB*4, m3
-    RET
-
-cglobal add16x16_idct_dc, 2,3,8
-    mov         r2, 4
-    mova        m6, [pw_pixel_max]
-    mova        m7, [pd_32]
-    pxor        m5, m5
-.loop:
-    mova        m3, [r1]
-    paddd       m3, m7
-    psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
-    pshuflw     m4, m3, q2200 ; dc0 dc0 dc1 dc1   _   _   _   _
-    pshufhw     m3, m3, q2200 ;   _   _   _   _ dc2 dc2 dc3 dc3
-    pshufd      m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
-    pshufd      m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
-    ADD_DC r0+FDEC_STRIDEB*0, m4
-    ADD_DC r0+SIZEOF_PIXEL*8, m3
-    add         r1, 16
-    add         r0, 4*FDEC_STRIDEB
-    dec         r2
-    jg .loop
-    RET
-%endmacro ; ADD_IDCT_DC
-
-INIT_XMM sse2
-ADD_IDCT_DC
-INIT_XMM avx
-ADD_IDCT_DC
-
-%else ;!HIGH_BIT_DEPTH
-%macro ADD_DC 3
-    mova    m4, [%3+FDEC_STRIDE*0]
-    mova    m5, [%3+FDEC_STRIDE*1]
-    mova    m6, [%3+FDEC_STRIDE*2]
-    paddusb m4, %1
-    paddusb m5, %1
-    paddusb m6, %1
-    paddusb %1, [%3+FDEC_STRIDE*3]
-    psubusb m4, %2
-    psubusb m5, %2
-    psubusb m6, %2
-    psubusb %1, %2
-    mova [%3+FDEC_STRIDE*0], m4
-    mova [%3+FDEC_STRIDE*1], m5
-    mova [%3+FDEC_STRIDE*2], m6
-    mova [%3+FDEC_STRIDE*3], %1
-%endmacro
-
-INIT_MMX mmx2
-cglobal add8x8_idct_dc, 2,2
-    mova      m0, [r1]
-    pxor      m1, m1
-    add       r0, FDEC_STRIDE*4
-    paddw     m0, [pw_32]
-    psraw     m0, 6
-    psubw     m1, m0
-    packuswb  m0, m0
-    packuswb  m1, m1
-    punpcklbw m0, m0
-    punpcklbw m1, m1
-    pshufw    m2, m0, q3322
-    pshufw    m3, m1, q3322
-    punpcklbw m0, m0
-    punpcklbw m1, m1
-    ADD_DC    m0, m1, r0-FDEC_STRIDE*4
-    ADD_DC    m2, m3, r0
-    RET
-
-INIT_XMM ssse3
-cglobal add8x8_idct_dc, 2,2
-    movh     m0, [r1]
-    pxor     m1, m1
-    add      r0, FDEC_STRIDE*4
-    pmulhrsw m0, [pw_512]
-    psubw    m1, m0
-    mova     m5, [pb_unpackbd1]
-    packuswb m0, m0
-    packuswb m1, m1
-    pshufb   m0, m5
-    pshufb   m1, m5
-    movh     m2, [r0+FDEC_STRIDE*-4]
-    movh     m3, [r0+FDEC_STRIDE*-3]
-    movh     m4, [r0+FDEC_STRIDE*-2]
-    movh     m5, [r0+FDEC_STRIDE*-1]
-    movhps   m2, [r0+FDEC_STRIDE* 0]
-    movhps   m3, [r0+FDEC_STRIDE* 1]
-    movhps   m4, [r0+FDEC_STRIDE* 2]
-    movhps   m5, [r0+FDEC_STRIDE* 3]
-    paddusb  m2, m0
-    paddusb  m3, m0
-    paddusb  m4, m0
-    paddusb  m5, m0
-    psubusb  m2, m1
-    psubusb  m3, m1
-    psubusb  m4, m1
-    psubusb  m5, m1
-    movh   [r0+FDEC_STRIDE*-4], m2
-    movh   [r0+FDEC_STRIDE*-3], m3
-    movh   [r0+FDEC_STRIDE*-2], m4
-    movh   [r0+FDEC_STRIDE*-1], m5
-    movhps [r0+FDEC_STRIDE* 0], m2
-    movhps [r0+FDEC_STRIDE* 1], m3
-    movhps [r0+FDEC_STRIDE* 2], m4
-    movhps [r0+FDEC_STRIDE* 3], m5
-    RET
-
-INIT_MMX mmx2
-cglobal add16x16_idct_dc, 2,3
-    mov       r2, 4
-.loop:
-    mova      m0, [r1]
-    pxor      m1, m1
-    paddw     m0, [pw_32]
-    psraw     m0, 6
-    psubw     m1, m0
-    packuswb  m0, m0
-    packuswb  m1, m1
-    punpcklbw m0, m0
-    punpcklbw m1, m1
-    pshufw    m2, m0, q3322
-    pshufw    m3, m1, q3322
-    punpcklbw m0, m0
-    punpcklbw m1, m1
-    ADD_DC    m0, m1, r0
-    ADD_DC    m2, m3, r0+8
-    add       r1, 8
-    add       r0, FDEC_STRIDE*4
-    dec       r2
-    jg .loop
-    RET
-
-INIT_XMM sse2
-cglobal add16x16_idct_dc, 2,2,8
-    call .loop
-    add       r0, FDEC_STRIDE*4
-    TAIL_CALL .loop, 0
-.loop:
-    add       r0, FDEC_STRIDE*4
-    movq      m0, [r1+0]
-    movq      m2, [r1+8]
-    add       r1, 16
-    punpcklwd m0, m0
-    punpcklwd m2, m2
-    pxor      m3, m3
-    paddw     m0, [pw_32]
-    paddw     m2, [pw_32]
-    psraw     m0, 6
-    psraw     m2, 6
-    psubw     m1, m3, m0
-    packuswb  m0, m1
-    psubw     m3, m2
-    punpckhbw m1, m0, m0
-    packuswb  m2, m3
-    punpckhbw m3, m2, m2
-    punpcklbw m0, m0
-    punpcklbw m2, m2
-    ADD_DC    m0, m1, r0+FDEC_STRIDE*-4
-    ADD_DC    m2, m3, r0
-    ret
-
-%macro ADD16x16 0
-cglobal add16x16_idct_dc, 2,2,8
-    call .loop
-    add      r0, FDEC_STRIDE*4
-    TAIL_CALL .loop, 0
-.loop:
-    add      r0, FDEC_STRIDE*4
-    mova     m0, [r1]
-    add      r1, 16
-    pxor     m1, m1
-    pmulhrsw m0, [pw_512]
-    psubw    m1, m0
-    mova     m5, [pb_unpackbd1]
-    mova     m6, [pb_unpackbd2]
-    packuswb m0, m0
-    packuswb m1, m1
-    pshufb   m2, m0, m6
-    pshufb   m0, m5
-    pshufb   m3, m1, m6
-    pshufb   m1, m5
-    ADD_DC   m0, m1, r0+FDEC_STRIDE*-4
-    ADD_DC   m2, m3, r0
-    ret
-%endmacro ; ADD16x16
-
-INIT_XMM ssse3
-ADD16x16
-INIT_XMM avx
-ADD16x16
-
-%macro ADD_DC_AVX2 3
-    mova   xm4, [r0+FDEC_STRIDE*0+%3]
-    mova   xm5, [r0+FDEC_STRIDE*1+%3]
-    vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
-    vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
-    paddusb m4, %1
-    paddusb m5, %1
-    psubusb m4, %2
-    psubusb m5, %2
-    mova [r0+FDEC_STRIDE*0+%3], xm4
-    mova [r0+FDEC_STRIDE*1+%3], xm5
-    vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
-    vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
-%endmacro
-
-INIT_YMM avx2
-cglobal add16x16_idct_dc, 2,3,6
-    add      r0, FDEC_STRIDE*4
-    mova     m0, [r1]
-    pxor     m1, m1
-    pmulhrsw m0, [pw_512]
-    psubw    m1, m0
-    mova     m4, [pb_unpackbd1]
-    mova     m5, [pb_unpackbd2]
-    packuswb m0, m0
-    packuswb m1, m1
-    pshufb   m2, m0, m4      ; row0, row2
-    pshufb   m3, m1, m4      ; row0, row2
-    pshufb   m0, m5          ; row1, row3
-    pshufb   m1, m5          ; row1, row3
-    lea      r2, [r0+FDEC_STRIDE*8]
-    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4
-    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2
-    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0
-    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2
-    RET
-
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-
-%macro DCTDC_2ROW_MMX 4
-    mova      %1, [r1+FENC_STRIDE*(0+%3)]
-    mova      m1, [r1+FENC_STRIDE*(1+%3)]
-    mova      m2, [r2+FDEC_STRIDE*(0+%4)]
-    mova      m3, [r2+FDEC_STRIDE*(1+%4)]
-    mova      %2, %1
-    punpckldq %1, m1
-    punpckhdq %2, m1
-    mova      m1, m2
-    punpckldq m2, m3
-    punpckhdq m1, m3
-    pxor      m3, m3
-    psadbw    %1, m3
-    psadbw    %2, m3
-    psadbw    m2, m3
-    psadbw    m1, m3
-    psubw     %1, m2
-    psubw     %2, m1
-%endmacro
-
-%macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
-    PSHUFLW   m1, %1, q2200  ;  s1  s1  s0  s0
-    PSHUFLW   m0, %2, q2301  ;  s3  __  s2  __
-    paddw     m1, %2         ;  s1 s13  s0 s02
-    psubw     m1, m0         ; d13 s13 d02 s02
-    PSHUFLW   m0, m1, q1010  ; d02 s02 d02 s02
-    psrlq     m1, 32         ;  __  __ d13 s13
-    paddw     m0, m1         ; d02 s02 d02+d13 s02+s13
-    psllq     m1, 32         ; d13 s13
-    psubw     m0, m1         ; d02-d13 s02-s13 d02+d13 s02+s13
-%endmacro
-
-%if HIGH_BIT_DEPTH == 0
-INIT_MMX mmx2
-cglobal sub8x8_dct_dc, 3,3
-    DCTDC_2ROW_MMX m0, m4, 0, 0
-    DCTDC_2ROW_MMX m5, m6, 2, 2
-    paddw     m0, m5
-    paddw     m4, m6
-    punpckldq m0, m4
-    add       r2, FDEC_STRIDE*4
-    DCTDC_2ROW_MMX m7, m4, 4, 0
-    DCTDC_2ROW_MMX m5, m6, 6, 2
-    paddw     m7, m5
-    paddw     m4, m6
-    punpckldq m7, m4
-    DCT2x2    m0, m7
-    mova    [r0], m0
-    ret
-
-%macro DCTDC_2ROW_SSE2 4
-    movh      m1, [r1+FENC_STRIDE*(0+%1)]
-    movh      m2, [r1+FENC_STRIDE*(1+%1)]
-    punpckldq m1, m2
-    movh      m2, [r2+FDEC_STRIDE*(0+%2)]
-    punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
-    psadbw    m1, m0
-    psadbw    m2, m0
-    ACCUM  paddd, %4, 1, %3
-    psubd    m%4, m2
-%endmacro
-
-INIT_XMM sse2
-cglobal sub8x8_dct_dc, 3,3
-    pxor     m0, m0
-    DCTDC_2ROW_SSE2 0, 0, 0, 3
-    DCTDC_2ROW_SSE2 2, 2, 1, 3
-    add      r2, FDEC_STRIDE*4
-    DCTDC_2ROW_SSE2 4, 0, 0, 4
-    DCTDC_2ROW_SSE2 6, 2, 1, 4
-    packssdw m3, m3
-    packssdw m4, m4
-    DCT2x2   m3, m4
-    movq   [r0], m0
-    RET
-
-%macro SUB8x16_DCT_DC 0
-cglobal sub8x16_dct_dc, 3,3
-    pxor       m0, m0
-    DCTDC_2ROW_SSE2 0, 0, 0, 3
-    DCTDC_2ROW_SSE2 2, 2, 1, 3
-    add        r1, FENC_STRIDE*8
-    add        r2, FDEC_STRIDE*8
-    DCTDC_2ROW_SSE2 -4, -4, 0, 4
-    DCTDC_2ROW_SSE2 -2, -2, 1, 4
-    shufps     m3, m4, q2020
-    DCTDC_2ROW_SSE2 0, 0, 0, 5
-    DCTDC_2ROW_SSE2 2, 2, 1, 5
-    add        r2, FDEC_STRIDE*4
-    DCTDC_2ROW_SSE2 4, 0, 0, 4
-    DCTDC_2ROW_SSE2 6, 2, 1, 4
-    shufps     m5, m4, q2020
-%if cpuflag(ssse3)
-    %define %%sign psignw
-%else
-    %define %%sign pmullw
-%endif
-    SUMSUB_BA d, 5, 3, 0
-    packssdw   m5, m3
-    pshuflw    m0, m5, q2301
-    pshufhw    m0, m0, q2301
-    %%sign     m5, [pw_pmpmpmpm]
-    paddw      m0, m5
-    pshufd     m1, m0, q1320
-    pshufd     m0, m0, q0231
-    %%sign     m1, [pw_ppppmmmm]
-    paddw      m0, m1
-    mova     [r0], m0
-    RET
-%endmacro ; SUB8x16_DCT_DC
-
-INIT_XMM sse2
-SUB8x16_DCT_DC
-INIT_XMM ssse3
-SUB8x16_DCT_DC
-
-%endif ; !HIGH_BIT_DEPTH
-
-%macro DCTDC_4ROW_SSE2 2
-    mova       %1, [r1+FENC_STRIDEB*%2]
-    mova       m0, [r2+FDEC_STRIDEB*%2]
-%assign Y (%2+1)
-%rep 3
-    paddw      %1, [r1+FENC_STRIDEB*Y]
-    paddw      m0, [r2+FDEC_STRIDEB*Y]
-%assign Y (Y+1)
-%endrep
-    psubw      %1, m0
-    pshufd     m0, %1, q2301
-    paddw      %1, m0
-%endmacro
-
-%if HIGH_BIT_DEPTH
-%macro SUB8x8_DCT_DC_10 0
-cglobal sub8x8_dct_dc, 3,3,3
-    DCTDC_4ROW_SSE2 m1, 0
-    DCTDC_4ROW_SSE2 m2, 4
-    mova       m0, [pw_ppmmmmpp]
-    pmaddwd    m1, m0
-    pmaddwd    m2, m0
-    pshufd     m0, m1, q2200      ; -1 -1 +0 +0
-    pshufd     m1, m1, q0033      ; +0 +0 +1 +1
-    paddd      m1, m0
-    pshufd     m0, m2, q1023      ; -2 +2 -3 +3
-    paddd      m1, m2
-    paddd      m1, m0
-    mova     [r0], m1
-    RET
-%endmacro
-INIT_XMM sse2
-SUB8x8_DCT_DC_10
-
-%macro SUB8x16_DCT_DC_10 0
-cglobal sub8x16_dct_dc, 3,3,6
-    DCTDC_4ROW_SSE2 m1, 0
-    DCTDC_4ROW_SSE2 m2, 4
-    DCTDC_4ROW_SSE2 m3, 8
-    DCTDC_4ROW_SSE2 m4, 12
-    mova       m0, [pw_ppmmmmpp]
-    pmaddwd    m1, m0
-    pmaddwd    m2, m0
-    pshufd     m5, m1, q2200      ; -1 -1 +0 +0
-    pshufd     m1, m1, q0033      ; +0 +0 +1 +1
-    paddd      m1, m5
-    pshufd     m5, m2, q1023      ; -2 +2 -3 +3
-    paddd      m1, m2
-    paddd      m1, m5             ; a6 a2 a4 a0
-    pmaddwd    m3, m0
-    pmaddwd    m4, m0
-    pshufd     m5, m3, q2200
-    pshufd     m3, m3, q0033
-    paddd      m3, m5
-    pshufd     m5, m4, q1023
-    paddd      m3, m4
-    paddd      m3, m5             ; a7 a3 a5 a1
-    paddd      m0, m1, m3
-    psubd      m1, m3
-    pshufd     m0, m0, q3120
-    pshufd     m1, m1, q3120
-    punpcklqdq m2, m0, m1
-    punpckhqdq m1, m0
-    mova  [r0+ 0], m2
-    mova  [r0+16], m1
-    RET
-%endmacro
-INIT_XMM sse2
-SUB8x16_DCT_DC_10
-INIT_XMM avx
-SUB8x16_DCT_DC_10
-%endif
-
-;-----------------------------------------------------------------------------
-; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-%macro SCAN_8x8 0
-cglobal zigzag_scan_8x8_frame, 2,2,8
-    movdqa    xmm0, [r1]
-    movdqa    xmm1, [r1+16]
-    movdq2q    mm0, xmm0
-    PALIGNR   xmm1, xmm1, 14, xmm2
-    movdq2q    mm1, xmm1
-
-    movdqa    xmm2, [r1+32]
-    movdqa    xmm3, [r1+48]
-    PALIGNR   xmm2, xmm2, 12, xmm4
-    movdq2q    mm2, xmm2
-    PALIGNR   xmm3, xmm3, 10, xmm4
-    movdq2q    mm3, xmm3
-
-    punpckhwd xmm0, xmm1
-    punpckhwd xmm2, xmm3
-
-    movq       mm4, mm1
-    movq       mm5, mm1
-    movq       mm6, mm2
-    movq       mm7, mm3
-    punpckhwd  mm1, mm0
-    psllq      mm0, 16
-    psrlq      mm3, 16
-    punpckhdq  mm1, mm1
-    punpckhdq  mm2, mm0
-    punpcklwd  mm0, mm4
-    punpckhwd  mm4, mm3
-    punpcklwd  mm4, mm2
-    punpckhdq  mm0, mm2
-    punpcklwd  mm6, mm3
-    punpcklwd  mm5, mm7
-    punpcklwd  mm5, mm6
-
-    movdqa    xmm4, [r1+64]
-    movdqa    xmm5, [r1+80]
-    movdqa    xmm6, [r1+96]
-    movdqa    xmm7, [r1+112]
-
-    movq [r0+2*00], mm0
-    movq [r0+2*04], mm4
-    movd [r0+2*08], mm1
-    movq [r0+2*36], mm5
-    movq [r0+2*46], mm6
-
-    PALIGNR   xmm4, xmm4, 14, xmm3
-    movdq2q    mm4, xmm4
-    PALIGNR   xmm5, xmm5, 12, xmm3
-    movdq2q    mm5, xmm5
-    PALIGNR   xmm6, xmm6, 10, xmm3
-    movdq2q    mm6, xmm6
-%if cpuflag(ssse3)
-    PALIGNR   xmm7, xmm7, 8, xmm3
-    movdq2q    mm7, xmm7
-%else
-    movhlps   xmm3, xmm7
-    punpcklqdq xmm7, xmm7
-    movdq2q    mm7, xmm3
-%endif
-
-    punpckhwd xmm4, xmm5
-    punpckhwd xmm6, xmm7
-
-    movq       mm0, mm4
-    movq       mm1, mm5
-    movq       mm3, mm7
-    punpcklwd  mm7, mm6
-    psrlq      mm6, 16
-    punpcklwd  mm4, mm6
-    punpcklwd  mm5, mm4
-    punpckhdq  mm4, mm3
-    punpcklwd  mm3, mm6
-    punpckhwd  mm3, mm4
-    punpckhwd  mm0, mm1
-    punpckldq  mm4, mm0
-    punpckhdq  mm0, mm6
-    pshufw     mm4, mm4, q1230
-
-    movq [r0+2*14], mm4
-    movq [r0+2*25], mm0
-    movd [r0+2*54], mm7
-    movq [r0+2*56], mm5
-    movq [r0+2*60], mm3
-
-    punpckhdq xmm3, xmm0, xmm2
-    punpckldq xmm0, xmm2
-    punpckhdq xmm7, xmm4, xmm6
-    punpckldq xmm4, xmm6
-    pshufhw   xmm0, xmm0, q0123
-    pshuflw   xmm4, xmm4, q0123
-    pshufhw   xmm3, xmm3, q0123
-    pshuflw   xmm7, xmm7, q0123
-
-    movlps [r0+2*10], xmm0
-    movhps [r0+2*17], xmm0
-    movlps [r0+2*21], xmm3
-    movlps [r0+2*28], xmm4
-    movhps [r0+2*32], xmm3
-    movhps [r0+2*39], xmm4
-    movlps [r0+2*43], xmm7
-    movhps [r0+2*50], xmm7
-
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM sse2
-SCAN_8x8
-INIT_XMM ssse3
-SCAN_8x8
-%endif
-
-;-----------------------------------------------------------------------------
-; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
-;-----------------------------------------------------------------------------
-; Output order:
-;  0  8  1  2  9 16 24 17
-; 10  3  4 11 18 25 32 40
-; 33 26 19 12  5  6 13 20
-; 27 34 41 48 56 49 42 35
-; 28 21 14  7 15 22 29 36
-; 43 50 57 58 51 44 37 30
-; 23 31 38 45 52 59 60 53
-; 46 39 47 54 61 62 55 63
-%macro SCAN_8x8_FRAME 5
-cglobal zigzag_scan_8x8_frame, 2,2,8
-    mova        m0, [r1]
-    mova        m1, [r1+ 8*SIZEOF_DCTCOEF]
-    movu        m2, [r1+14*SIZEOF_DCTCOEF]
-    movu        m3, [r1+21*SIZEOF_DCTCOEF]
-    mova        m4, [r1+28*SIZEOF_DCTCOEF]
-    punpckl%4   m5, m0, m1
-    psrl%2      m0, %1
-    punpckh%4   m6, m1, m0
-    punpckl%3   m5, m0
-    punpckl%3   m1, m1
-    punpckh%4   m1, m3
-    mova        m7, [r1+52*SIZEOF_DCTCOEF]
-    mova        m0, [r1+60*SIZEOF_DCTCOEF]
-    punpckh%4   m1, m2
-    punpckl%4   m2, m4
-    punpckh%4   m4, m3
-    punpckl%3   m3, m3
-    punpckh%4   m3, m2
-    mova      [r0], m5
-    mova  [r0+ 4*SIZEOF_DCTCOEF], m1
-    mova  [r0+ 8*SIZEOF_DCTCOEF], m6
-    punpckl%4   m6, m0
-    punpckl%4   m6, m7
-    mova        m1, [r1+32*SIZEOF_DCTCOEF]
-    movu        m5, [r1+39*SIZEOF_DCTCOEF]
-    movu        m2, [r1+46*SIZEOF_DCTCOEF]
-    movu [r0+35*SIZEOF_DCTCOEF], m3
-    movu [r0+47*SIZEOF_DCTCOEF], m4
-    punpckh%4   m7, m0
-    psll%2      m0, %1
-    punpckh%3   m3, m5, m5
-    punpckl%4   m5, m1
-    punpckh%4   m1, m2
-    mova [r0+52*SIZEOF_DCTCOEF], m6
-    movu [r0+13*SIZEOF_DCTCOEF], m5
-    movu        m4, [r1+11*SIZEOF_DCTCOEF]
-    movu        m6, [r1+25*SIZEOF_DCTCOEF]
-    punpckl%4   m5, m7
-    punpckl%4   m1, m3
-    punpckh%3   m0, m7
-    mova        m3, [r1+ 4*SIZEOF_DCTCOEF]
-    movu        m7, [r1+18*SIZEOF_DCTCOEF]
-    punpckl%4   m2, m5
-    movu [r0+25*SIZEOF_DCTCOEF], m1
-    mova        m1, m4
-    mova        m5, m6
-    punpckl%4   m4, m3
-    punpckl%4   m6, m7
-    punpckh%4   m1, m3
-    punpckh%4   m5, m7
-    punpckh%3   m3, m6, m4
-    punpckh%3   m7, m5, m1
-    punpckl%3   m6, m4
-    punpckl%3   m5, m1
-    movu        m4, [r1+35*SIZEOF_DCTCOEF]
-    movu        m1, [r1+49*SIZEOF_DCTCOEF]
-    pshuf%5     m6, m6, q0123
-    pshuf%5     m5, m5, q0123
-    mova [r0+60*SIZEOF_DCTCOEF], m0
-    mova [r0+56*SIZEOF_DCTCOEF], m2
-    movu        m0, [r1+42*SIZEOF_DCTCOEF]
-    mova        m2, [r1+56*SIZEOF_DCTCOEF]
-    movu [r0+17*SIZEOF_DCTCOEF], m3
-    mova [r0+32*SIZEOF_DCTCOEF], m7
-    movu [r0+10*SIZEOF_DCTCOEF], m6
-    movu [r0+21*SIZEOF_DCTCOEF], m5
-    punpckh%4   m3, m0, m4
-    punpckh%4   m7, m2, m1
-    punpckl%4   m0, m4
-    punpckl%4   m2, m1
-    punpckl%3   m4, m2, m0
-    punpckl%3   m1, m7, m3
-    punpckh%3   m2, m0
-    punpckh%3   m7, m3
-    pshuf%5     m2, m2, q0123
-    pshuf%5     m7, m7, q0123
-    mova [r0+28*SIZEOF_DCTCOEF], m4
-    movu [r0+43*SIZEOF_DCTCOEF], m1
-    movu [r0+39*SIZEOF_DCTCOEF], m2
-    movu [r0+50*SIZEOF_DCTCOEF], m7
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-SCAN_8x8_FRAME 4 , dq, qdq, dq, d
-INIT_XMM avx
-SCAN_8x8_FRAME 4 , dq, qdq, dq, d
-%else
-INIT_MMX mmx2
-SCAN_8x8_FRAME 16, q , dq , wd, w
-%endif
-
-;-----------------------------------------------------------------------------
-; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
-;-----------------------------------------------------------------------------
-%macro SCAN_4x4 4
-cglobal zigzag_scan_4x4_frame, 2,2,6
-    mova      m0, [r1+ 0*SIZEOF_DCTCOEF]
-    mova      m1, [r1+ 4*SIZEOF_DCTCOEF]
-    mova      m2, [r1+ 8*SIZEOF_DCTCOEF]
-    mova      m3, [r1+12*SIZEOF_DCTCOEF]
-    punpckl%4 m4, m0, m1
-    psrl%2    m0, %1
-    punpckl%3 m4, m0
-    mova  [r0+ 0*SIZEOF_DCTCOEF], m4
-    punpckh%4 m0, m2
-    punpckh%4 m4, m2, m3
-    psll%2    m3, %1
-    punpckl%3 m2, m2
-    punpckl%4 m5, m1, m3
-    punpckh%3 m1, m1
-    punpckh%4 m5, m2
-    punpckl%4 m1, m0
-    punpckh%3 m3, m4
-    mova [r0+ 4*SIZEOF_DCTCOEF], m5
-    mova [r0+ 8*SIZEOF_DCTCOEF], m1
-    mova [r0+12*SIZEOF_DCTCOEF], m3
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-SCAN_4x4  4, dq, qdq, dq
-INIT_XMM avx
-SCAN_4x4  4, dq, qdq, dq
-%else
-INIT_MMX mmx
-SCAN_4x4 16, q , dq , wd
-
-;-----------------------------------------------------------------------------
-; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-%macro SCAN_4x4_FRAME 0
-cglobal zigzag_scan_4x4_frame, 2,2
-    mova    m1, [r1+16]
-    mova    m0, [r1+ 0]
-    pshufb  m1, [pb_scan4frameb]
-    pshufb  m0, [pb_scan4framea]
-    psrldq  m2, m1, 6
-    palignr m1, m0, 6
-    pslldq  m0, 10
-    palignr m2, m0, 10
-    mova [r0+ 0], m1
-    mova [r0+16], m2
-    RET
-%endmacro
-
-INIT_XMM ssse3
-SCAN_4x4_FRAME
-INIT_XMM avx
-SCAN_4x4_FRAME
-
-INIT_XMM xop
-cglobal zigzag_scan_4x4_frame, 2,2
-    mova   m0, [r1+ 0]
-    mova   m1, [r1+16]
-    vpperm m2, m0, m1, [pb_scan4frame2a]
-    vpperm m1, m0, m1, [pb_scan4frame2b]
-    mova [r0+ 0], m2
-    mova [r0+16], m1
-    RET
-%endif ; !HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal zigzag_scan_4x4_field, 2,2
-    movu       m0, [r1+ 8]
-    pshufd     m0, m0, q3102
-    mova       m1, [r1+32]
-    mova       m2, [r1+48]
-    movu  [r0+ 8], m0
-    mova  [r0+32], m1
-    mova  [r0+48], m2
-    movq      mm0, [r1]
-    movq     [r0], mm0
-    movq      mm0, [r1+24]
-    movq  [r0+24], mm0
-    RET
-%else
-;-----------------------------------------------------------------------------
-; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-INIT_XMM sse
-cglobal zigzag_scan_4x4_field, 2,2
-    mova       m0, [r1]
-    mova       m1, [r1+16]
-    pshufw    mm0, [r1+4], q3102
-    mova     [r0], m0
-    mova  [r0+16], m1
-    movq   [r0+4], mm0
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-; Output order:
-;  0  1  2  8  9  3  4 10
-; 16 11  5  6  7 12 17 24
-; 18 13 14 15 19 25 32 26
-; 20 21 22 23 27 33 40 34
-; 28 29 30 31 35 41 48 42
-; 36 37 38 39 43 49 50 44
-; 45 46 47 51 56 57 52 53
-; 54 55 58 59 60 61 62 63
-%undef SCAN_8x8
-%macro SCAN_8x8 5
-cglobal zigzag_scan_8x8_field, 2,3,8
-    mova       m0, [r1+ 0*SIZEOF_DCTCOEF]       ; 03 02 01 00
-    mova       m1, [r1+ 4*SIZEOF_DCTCOEF]       ; 07 06 05 04
-    mova       m2, [r1+ 8*SIZEOF_DCTCOEF]       ; 11 10 09 08
-    pshuf%1    m3, m0, q3333                    ; 03 03 03 03
-    movd      r2d, m2                           ; 09 08
-    pshuf%1    m2, m2, q0321                    ; 08 11 10 09
-    punpckl%2  m3, m1                           ; 05 03 04 03
-    pinsr%1    m0, r2d, 3                       ; 08 02 01 00
-    punpckl%2  m4, m2, m3                       ; 04 10 03 09
-    pshuf%1    m4, m4, q2310                    ; 10 04 03 09
-    mova  [r0+ 0*SIZEOF_DCTCOEF], m0            ; 08 02 01 00
-    mova  [r0+ 4*SIZEOF_DCTCOEF], m4            ; 10 04 03 09
-    mova       m3, [r1+12*SIZEOF_DCTCOEF]       ; 15 14 13 12
-    mova       m5, [r1+16*SIZEOF_DCTCOEF]       ; 19 18 17 16
-    punpckl%3  m6, m5                           ; 17 16 XX XX
-    psrl%4     m1, %5                           ; XX 07 06 05
-    punpckh%2  m6, m2                           ; 08 17 11 16
-    punpckl%3  m6, m1                           ; 06 05 11 16
-    mova  [r0+ 8*SIZEOF_DCTCOEF], m6            ; 06 05 11 16
-    psrl%4     m1, %5                           ; XX XX 07 06
-    punpckl%2  m1, m5                           ; 17 07 16 06
-    mova       m0, [r1+20*SIZEOF_DCTCOEF]       ; 23 22 21 20
-    mova       m2, [r1+24*SIZEOF_DCTCOEF]       ; 27 26 25 24
-    punpckh%3  m1, m1                           ; 17 07 17 07
-    punpckl%2  m6, m3, m2                       ; 25 13 24 12
-    pextr%1    r2d, m5, 2
-    mova [r0+24*SIZEOF_DCTCOEF], m0             ; 23 22 21 20
-    punpckl%2  m1, m6                           ; 24 17 12 07
-    mova [r0+12*SIZEOF_DCTCOEF], m1
-    pinsr%1    m3, r2d, 0                       ; 15 14 13 18
-    mova [r0+16*SIZEOF_DCTCOEF], m3             ; 15 14 13 18
-    mova       m7, [r1+28*SIZEOF_DCTCOEF]
-    mova       m0, [r1+32*SIZEOF_DCTCOEF]       ; 35 34 33 32
-    psrl%4     m5, %5*3                         ; XX XX XX 19
-    pshuf%1    m1, m2, q3321                    ; 27 27 26 25
-    punpckl%2  m5, m0                           ; 33 XX 32 19
-    psrl%4     m2, %5*3                         ; XX XX XX 27
-    punpckl%2  m5, m1                           ; 26 32 25 19
-    mova [r0+32*SIZEOF_DCTCOEF], m7
-    mova [r0+20*SIZEOF_DCTCOEF], m5             ; 26 32 25 19
-    mova       m7, [r1+36*SIZEOF_DCTCOEF]
-    mova       m1, [r1+40*SIZEOF_DCTCOEF]       ; 43 42 41 40
-    pshuf%1    m3, m0, q3321                    ; 35 35 34 33
-    punpckl%2  m2, m1                           ; 41 XX 40 27
-    mova [r0+40*SIZEOF_DCTCOEF], m7
-    punpckl%2  m2, m3                           ; 34 40 33 27
-    mova [r0+28*SIZEOF_DCTCOEF], m2
-    mova       m7, [r1+44*SIZEOF_DCTCOEF]       ; 47 46 45 44
-    mova       m2, [r1+48*SIZEOF_DCTCOEF]       ; 51 50 49 48
-    psrl%4     m0, %5*3                         ; XX XX XX 35
-    punpckl%2  m0, m2                           ; 49 XX 48 35
-    pshuf%1    m3, m1, q3321                    ; 43 43 42 41
-    punpckl%2  m0, m3                           ; 42 48 41 35
-    mova [r0+36*SIZEOF_DCTCOEF], m0
-    pextr%1     r2d, m2, 3                      ; 51
-    psrl%4      m1, %5*3                        ; XX XX XX 43
-    punpckl%2   m1, m7                          ; 45 XX 44 43
-    psrl%4      m2, %5                          ; XX 51 50 49
-    punpckl%2   m1, m2                          ; 50 44 49 43
-    pshuf%1     m1, m1, q2310                   ; 44 50 49 43
-    mova [r0+44*SIZEOF_DCTCOEF], m1
-    psrl%4      m7, %5                          ; XX 47 46 45
-    pinsr%1     m7, r2d, 3                      ; 51 47 46 45
-    mova [r0+48*SIZEOF_DCTCOEF], m7
-    mova        m0, [r1+56*SIZEOF_DCTCOEF]      ; 59 58 57 56
-    mova        m1, [r1+52*SIZEOF_DCTCOEF]      ; 55 54 53 52
-    mova        m7, [r1+60*SIZEOF_DCTCOEF]
-    punpckl%3   m2, m0, m1                      ; 53 52 57 56
-    punpckh%3   m1, m0                          ; 59 58 55 54
-    mova [r0+52*SIZEOF_DCTCOEF], m2
-    mova [r0+56*SIZEOF_DCTCOEF], m1
-    mova [r0+60*SIZEOF_DCTCOEF], m7
-    RET
-%endmacro
-%if HIGH_BIT_DEPTH
-INIT_XMM sse4
-SCAN_8x8 d, dq, qdq, dq, 4
-INIT_XMM avx
-SCAN_8x8 d, dq, qdq, dq, 4
-%else
-INIT_MMX mmx2
-SCAN_8x8 w, wd, dq , q , 16
-%endif
-
-;-----------------------------------------------------------------------------
-; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
-;-----------------------------------------------------------------------------
-%macro ZIGZAG_SUB_4x4 2
-%ifidn %1, ac
-cglobal zigzag_sub_4x4%1_%2, 4,4,8
-%else
-cglobal zigzag_sub_4x4%1_%2, 3,3,8
-%endif
-    movd      m0, [r1+0*FENC_STRIDE]
-    movd      m1, [r1+1*FENC_STRIDE]
-    movd      m2, [r1+2*FENC_STRIDE]
-    movd      m3, [r1+3*FENC_STRIDE]
-    movd      m4, [r2+0*FDEC_STRIDE]
-    movd      m5, [r2+1*FDEC_STRIDE]
-    movd      m6, [r2+2*FDEC_STRIDE]
-    movd      m7, [r2+3*FDEC_STRIDE]
-    movd [r2+0*FDEC_STRIDE], m0
-    movd [r2+1*FDEC_STRIDE], m1
-    movd [r2+2*FDEC_STRIDE], m2
-    movd [r2+3*FDEC_STRIDE], m3
-    punpckldq  m0, m1
-    punpckldq  m2, m3
-    punpckldq  m4, m5
-    punpckldq  m6, m7
-    punpcklqdq m0, m2
-    punpcklqdq m4, m6
-    mova      m7, [pb_sub4%2]
-    pshufb    m0, m7
-    pshufb    m4, m7
-    mova      m7, [hsub_mul]
-    punpckhbw m1, m0, m4
-    punpcklbw m0, m4
-    pmaddubsw m1, m7
-    pmaddubsw m0, m7
-%ifidn %1, ac
-    movd     r2d, m0
-    pand      m0, [pb_subacmask]
-%endif
-    mova [r0+ 0], m0
-    por       m0, m1
-    pxor      m2, m2
-    mova [r0+16], m1
-    pcmpeqb   m0, m2
-    pmovmskb eax, m0
-%ifidn %1, ac
-    mov     [r3], r2w
-%endif
-    sub      eax, 0xffff
-    shr      eax, 31
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM ssse3
-ZIGZAG_SUB_4x4   , frame
-ZIGZAG_SUB_4x4 ac, frame
-ZIGZAG_SUB_4x4   , field
-ZIGZAG_SUB_4x4 ac, field
-INIT_XMM avx
-ZIGZAG_SUB_4x4   , frame
-ZIGZAG_SUB_4x4 ac, frame
-ZIGZAG_SUB_4x4   , field
-ZIGZAG_SUB_4x4 ac, field
-%endif ; !HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM xop
-cglobal zigzag_scan_8x8_field, 2,3,7
-    lea        r2, [pb_scan8field1]
-    %define off(m) (r2+m-pb_scan8field1)
-    mova       m0, [r1+  0]
-    mova       m1, [r1+ 16]
-    vpperm     m5, m0, m1, [off(pb_scan8field1)]
-    mova [r0+  0], m5
-    vpperm     m0, m0, m1, [off(pb_scan8field2a)]
-    mova       m2, [r1+ 32]
-    mova       m3, [r1+ 48]
-    vpperm     m5, m2, m3, [off(pb_scan8field2b)]
-    por        m5, m0
-    mova [r0+ 16], m5
-    mova       m4, [off(pb_scan8field3b)]
-    vpperm     m1, m1, m2, [off(pb_scan8field3a)]
-    mova       m0, [r1+ 64]
-    vpperm     m5, m3, m0, m4
-    por        m5, m1
-    mova [r0+ 32], m5
-    ; 4b, 5b are the same as pb_scan8field3b.
-    ; 5a is the same as pb_scan8field4a.
-    mova       m5, [off(pb_scan8field4a)]
-    vpperm     m2, m2, m3, m5
-    mova       m1, [r1+ 80]
-    vpperm     m6, m0, m1, m4
-    por        m6, m2
-    mova [r0+ 48], m6
-    vpperm     m3, m3, m0, m5
-    mova       m2, [r1+ 96]
-    vpperm     m5, m1, m2, m4
-    por        m5, m3
-    mova [r0+ 64], m5
-    vpperm     m5, m0, m1, [off(pb_scan8field6)]
-    mova [r0+ 80], m5
-    vpperm     m5, m1, m2, [off(pb_scan8field7)]
-    mov       r2d, [r1+ 98]
-    mov  [r0+ 90], r2d
-    mova [r0+ 96], m5
-    mova       m3, [r1+112]
-    movd [r0+104], m3
-    mov       r2d, [r1+108]
-    mova [r0+112], m3
-    mov  [r0+112], r2d
-    %undef off
-    RET
-
-cglobal zigzag_scan_8x8_frame, 2,3,8
-    lea        r2, [pb_scan8frame1]
-    %define off(m) (r2+m-pb_scan8frame1)
-    mova       m7, [r1+ 16]
-    mova       m3, [r1+ 32]
-    vpperm     m7, m7, m3, [off(pb_scan8framet1)] ;  8  9 14 15 16 17 21 22
-    mova       m2, [r1+ 48]
-    vpperm     m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
-    mova       m1, [r1+ 80]
-    mova       m4, [r1+ 64]
-    vpperm     m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
-    vpperm     m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
-    vpperm     m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
-    vpperm     m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
-    mova       m4, [r1+ 96]
-    vpperm     m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
-    mova       m1, [r1+  0]
-    vpperm     m2, m1, m3, [off(pb_scan8framet8)] ;  0  1  2  7 24 28 29 36
-    vpperm     m1, m2, m7, [off(pb_scan8frame1)]  ;  0  8  1  2  9 16 24 17
-    mova [r0+  0], m1
-    movh       m0, [r1+  6]
-    movhps     m0, [r1+ 20]                       ;  3  4  5  6 10 11 12 13
-    vpperm     m1, m0, m6, [off(pb_scan8frame2)]  ; 10  3  4 11 18 25 32 40
-    mova [r0+ 16], m1
-    vpperm     m1, m0, m5, [off(pb_scan8frame3)]  ; 33 26 19 12  5  6 13 20
-    mova [r0+ 32], m1
-    vpperm     m1, m2, m7, [off(pb_scan8frame5)]  ; 28 21 14  7 15 22 29 36
-    mova [r0+ 64], m1
-    movh       m0, [r1+100]
-    movhps     m0, [r1+114]                       ; 50 51 52 53 57 58 59 60
-    vpperm     m1, m5, m0, [off(pb_scan8frame6)]  ; 43 50 57 58 51 44 37 30
-    mova [r0+ 80], m1
-    vpperm     m1, m6, m0, [off(pb_scan8frame7)]  ; 23 31 38 45 52 59 60 53
-    mova [r0+ 96], m1
-    mova       m1, [r1+112]
-    vpperm     m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
-    vpperm     m1, m0, m4, [off(pb_scan8frame4)]  ; 27 34 41 48 56 49 42 35
-    mova [r0+ 48], m1
-    vpperm     m1, m0, m4, [off(pb_scan8frame8)]  ; 46 39 47 54 61 62 55 63
-    mova [r0+112], m1
-    %undef off
-    RET
-%endif
-
-;-----------------------------------------------------------------------------
-; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
-;-----------------------------------------------------------------------------
-%macro INTERLEAVE 2
-    mova     m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
-    mova     m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
-    mova     m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
-    mova     m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
-    TRANSPOSE4x4%2 0,1,2,3,4
-    mova     [r0+(%1+ 0)*SIZEOF_PIXEL], m0
-    mova     [r0+(%1+32)*SIZEOF_PIXEL], m1
-    mova     [r0+(%1+64)*SIZEOF_PIXEL], m2
-    mova     [r0+(%1+96)*SIZEOF_PIXEL], m3
-    packsswb m0, m1
-    ACCUM   por, 6, 2, %1
-    ACCUM   por, 7, 3, %1
-    ACCUM   por, 5, 0, %1
-%endmacro
-
-%macro ZIGZAG_8x8_CAVLC 1
-cglobal zigzag_interleave_8x8_cavlc, 3,3,8
-    INTERLEAVE  0, %1
-    INTERLEAVE  8, %1
-    INTERLEAVE 16, %1
-    INTERLEAVE 24, %1
-    packsswb   m6, m7
-    packsswb   m5, m6
-    packsswb   m5, m5
-    pxor       m0, m0
-%if HIGH_BIT_DEPTH
-    packsswb   m5, m5
-%endif
-    pcmpeqb    m5, m0
-    paddb      m5, [pb_1]
-    movd      r0d, m5
-    mov    [r2+0], r0w
-    shr       r0d, 16
-    mov    [r2+8], r0w
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-ZIGZAG_8x8_CAVLC D
-INIT_XMM avx
-ZIGZAG_8x8_CAVLC D
-%else
-INIT_MMX mmx
-ZIGZAG_8x8_CAVLC W
-%endif
-
-%macro INTERLEAVE_XMM 1
-    mova   m0, [r1+%1*4+ 0]
-    mova   m1, [r1+%1*4+16]
-    mova   m4, [r1+%1*4+32]
-    mova   m5, [r1+%1*4+48]
-    SBUTTERFLY wd, 0, 1, 6
-    SBUTTERFLY wd, 4, 5, 7
-    SBUTTERFLY wd, 0, 1, 6
-    SBUTTERFLY wd, 4, 5, 7
-    movh   [r0+%1+  0], m0
-    movhps [r0+%1+ 32], m0
-    movh   [r0+%1+ 64], m1
-    movhps [r0+%1+ 96], m1
-    movh   [r0+%1+  8], m4
-    movhps [r0+%1+ 40], m4
-    movh   [r0+%1+ 72], m5
-    movhps [r0+%1+104], m5
-    ACCUM por, 2, 0, %1
-    ACCUM por, 3, 1, %1
-    por    m2, m4
-    por    m3, m5
-%endmacro
-
-%if HIGH_BIT_DEPTH == 0
-%macro ZIGZAG_8x8_CAVLC 0
-cglobal zigzag_interleave_8x8_cavlc, 3,3,8
-    INTERLEAVE_XMM  0
-    INTERLEAVE_XMM 16
-    packsswb m2, m3
-    pxor     m5, m5
-    packsswb m2, m2
-    packsswb m2, m2
-    pcmpeqb  m5, m2
-    paddb    m5, [pb_1]
-    movd    r0d, m5
-    mov  [r2+0], r0w
-    shr     r0d, 16
-    mov  [r2+8], r0w
-    RET
-%endmacro
-
-INIT_XMM sse2
-ZIGZAG_8x8_CAVLC
-INIT_XMM avx
-ZIGZAG_8x8_CAVLC
-
-INIT_YMM avx2
-cglobal zigzag_interleave_8x8_cavlc, 3,3,6
-    mova   m0, [r1+ 0]
-    mova   m1, [r1+32]
-    mova   m2, [r1+64]
-    mova   m3, [r1+96]
-    mova   m5, [deinterleave_shufd]
-    SBUTTERFLY wd, 0, 1, 4
-    SBUTTERFLY wd, 2, 3, 4
-    SBUTTERFLY wd, 0, 1, 4
-    SBUTTERFLY wd, 2, 3, 4
-    vpermd m0, m5, m0
-    vpermd m1, m5, m1
-    vpermd m2, m5, m2
-    vpermd m3, m5, m3
-    mova [r0+  0], xm0
-    mova [r0+ 16], xm2
-    vextracti128 [r0+ 32], m0, 1
-    vextracti128 [r0+ 48], m2, 1
-    mova [r0+ 64], xm1
-    mova [r0+ 80], xm3
-    vextracti128 [r0+ 96], m1, 1
-    vextracti128 [r0+112], m3, 1
-
-    packsswb m0, m2          ; nnz0, nnz1
-    packsswb m1, m3          ; nnz2, nnz3
-    packsswb m0, m1          ; {nnz0,nnz2}, {nnz1,nnz3}
-    vpermq   m0, m0, q3120   ; {nnz0,nnz1}, {nnz2,nnz3}
-    pxor     m5, m5
-    pcmpeqq  m0, m5
-    pmovmskb r0d, m0
-    not     r0d
-    and     r0d, 0x01010101
-    mov  [r2+0], r0w
-    shr     r0d, 16
-    mov  [r2+8], r0w
-    RET
-%endif ; !HIGH_BIT_DEPTH
diff --git a/android/src/main/libenc/jni/libx264/common/x86/dct.h b/android/src/main/libenc/jni/libx264/common/x86/dct.h
deleted file mode 100755
index a851ce9..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/dct.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*****************************************************************************
- * dct.h: x86 transform and zigzag
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_I386_DCT_H
-#define X264_I386_DCT_H
-
-void x264_sub4x4_dct_mmx    ( dctcoef dct    [16], pixel   *pix1, pixel   *pix2 );
-void x264_sub8x8_dct_mmx    ( dctcoef dct[ 4][16], pixel   *pix1, pixel   *pix2 );
-void x264_sub16x16_dct_mmx  ( dctcoef dct[16][16], pixel   *pix1, pixel   *pix2 );
-void x264_sub8x8_dct_sse2   ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub4x4_dct_ssse3  ( int16_t dct    [16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_ssse3  ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_avx    ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_avx  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_xop    ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_xop  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_avx2   ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmx2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( dctcoef dct    [ 4], pixel   *pix1, pixel   *pix2 );
-void x264_sub8x16_dct_dc_sse2 ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
-void x264_sub8x16_dct_dc_ssse3( int16_t dct  [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_avx  ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
-
-void x264_add4x4_idct_mmx       ( uint8_t *p_dst, int16_t dct    [16] );
-void x264_add4x4_idct_sse2     ( uint16_t *p_dst, int32_t dct    [16] );
-void x264_add4x4_idct_sse4      ( uint8_t *p_dst, int16_t dct    [16] );
-void x264_add4x4_idct_avx       ( pixel   *p_dst, dctcoef dct    [16] );
-void x264_add8x8_idct_mmx       ( uint8_t *p_dst, int16_t dct[ 4][16] );
-void x264_add8x8_idct_dc_mmx2   ( uint8_t *p_dst, int16_t dct    [ 4] );
-void x264_add16x16_idct_mmx     ( uint8_t *p_dst, int16_t dct[16][16] );
-void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct    [16] );
-void x264_add8x8_idct_sse2      ( pixel   *p_dst, dctcoef dct[ 4][16] );
-void x264_add8x8_idct_avx       ( pixel   *p_dst, dctcoef dct[ 4][16] );
-void x264_add8x8_idct_avx2      ( pixel   *p_dst, dctcoef dct[ 4][16] );
-void x264_add16x16_idct_sse2    ( pixel   *p_dst, dctcoef dct[16][16] );
-void x264_add16x16_idct_avx     ( pixel   *p_dst, dctcoef dct[16][16] );
-void x264_add16x16_idct_avx2    ( pixel   *p_dst, dctcoef dct[16][16] );
-void x264_add8x8_idct_dc_sse2   ( pixel   *p_dst, dctcoef dct    [ 4] );
-void x264_add16x16_idct_dc_sse2 ( pixel   *p_dst, dctcoef dct    [16] );
-void x264_add8x8_idct_dc_ssse3  ( uint8_t *p_dst, int16_t dct    [ 4] );
-void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct    [16] );
-void x264_add8x8_idct_dc_avx    ( pixel   *p_dst, dctcoef dct    [ 4] );
-void x264_add16x16_idct_dc_avx  ( pixel   *p_dst, dctcoef dct    [16] );
-void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct    [16] );
-
-void x264_dct4x4dc_mmx2      ( int16_t d[16] );
-void x264_dct4x4dc_sse2      ( int32_t d[16] );
-void x264_dct4x4dc_avx       ( int32_t d[16] );
-void x264_idct4x4dc_mmx      ( int16_t d[16] );
-void x264_idct4x4dc_sse2     ( int32_t d[16] );
-void x264_idct4x4dc_avx      ( int32_t d[16] );
-
-void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] );
-void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] );
-void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] );
-
-void x264_sub8x8_dct8_mmx    ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_mmx  ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2   ( dctcoef dct   [64], pixel *pix1, pixel *pix2 );
-void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
-void x264_sub8x8_dct8_ssse3  ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse4   ( int32_t dct   [64], uint16_t *pix1, uint16_t *pix2 );
-void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
-void x264_sub8x8_dct8_avx    ( dctcoef dct   [64], pixel *pix1, pixel *pix2 );
-void x264_sub16x16_dct8_avx  ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
-void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
-
-
-void x264_add8x8_idct8_mmx   ( uint8_t *dst, int16_t dct   [64] );
-void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
-void x264_add8x8_idct8_sse2  ( pixel *dst, dctcoef dct   [64] );
-void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
-void x264_add8x8_idct8_avx   ( pixel *dst, dctcoef dct   [64] );
-void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
-
-void x264_zigzag_scan_8x8_frame_xop  ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_avx  ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_4x4_frame_xop  ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_avx  ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_frame_mmx  ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_sse  ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_8x8_field_xop  ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_field_avx  ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
-int  x264_zigzag_sub_4x4_frame_avx    ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-int  x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-int  x264_zigzag_sub_4x4ac_frame_avx  ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-int  x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-int  x264_zigzag_sub_4x4_field_avx    ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-int  x264_zigzag_sub_4x4_field_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-int  x264_zigzag_sub_4x4ac_field_avx  ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-int  x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/deblock-a.asm b/android/src/main/libenc/jni/libx264/common/x86/deblock-a.asm
deleted file mode 100755
index e7a8240..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/deblock-a.asm
+++ /dev/null
@@ -1,2587 +0,0 @@
-;*****************************************************************************
-;* deblock-a.asm: x86 deblocking
-;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Oskar Arvidsson <oskar@irock.se>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
-insert_top_shuf: dd 0,1,4,5,7,2,3,6
-transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
-
-SECTION .text
-
-cextern pb_0
-cextern pb_1
-cextern pb_3
-cextern pb_a1
-cextern pw_2
-cextern pw_4
-cextern pw_00ff
-cextern pw_pixel_max
-cextern pb_unpackbd1
-
-%if HIGH_BIT_DEPTH
-; out: %4 = |%1-%2|-%3
-; clobbers: %5
-%macro ABS_SUB 5
-    psubusw %5, %2, %1
-    psubusw %4, %1, %2
-    por     %4, %5
-    psubw   %4, %3
-%endmacro
-
-; out: %4 = |%1-%2|<%3
-%macro DIFF_LT   5
-    psubusw %4, %2, %1
-    psubusw %5, %1, %2
-    por     %5, %4 ; |%1-%2|
-    pxor    %4, %4
-    psubw   %5, %3 ; |%1-%2|-%3
-    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
-%endmacro
-
-%macro LOAD_AB 4
-    movd       %1, %3
-    movd       %2, %4
-    SPLATW     %1, %1
-    SPLATW     %2, %2
-%endmacro
-
-; in:  %2=tc reg
-; out: %1=splatted tc
-%macro LOAD_TC 2
-%if mmsize == 8
-    pshufw      %1, [%2-1], 0
-%else
-    movd        %1, [%2]
-    punpcklbw   %1, %1
-    pshuflw     %1, %1, q1100
-    pshufd      %1, %1, q1100
-%endif
-    psraw       %1, 8
-%endmacro
-
-; in: %1=p1, %2=p0, %3=q0, %4=q1
-;     %5=alpha, %6=beta, %7-%9=tmp
-; out: %7=mask
-%macro LOAD_MASK 9
-    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
-    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
-    pand        %8, %9
-    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
-    pxor        %7, %7
-    pand        %8, %9
-    pcmpgtw     %7, %8
-%endmacro
-
-; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
-; out: %1=p0', m2=q0'
-%macro DEBLOCK_P0_Q0 7
-    psubw   %3, %4
-    pxor    %7, %7
-    paddw   %3, [pw_4]
-    psubw   %7, %5
-    psubw   %6, %2, %1
-    psllw   %6, 2
-    paddw   %3, %6
-    psraw   %3, 3
-    mova    %6, [pw_pixel_max]
-    CLIPW   %3, %7, %5
-    pxor    %7, %7
-    paddw   %1, %3
-    psubw   %2, %3
-    CLIPW   %1, %7, %6
-    CLIPW   %2, %7, %6
-%endmacro
-
-; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
-%macro LUMA_Q1 6
-    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
-    paddw       %1, %6
-    pxor        %6, %6
-    psraw       %1, 1
-    psubw       %6, %5
-    psubw       %1, %2
-    CLIPW       %1, %6, %5
-    paddw       %1, %2
-%endmacro
-
-%macro LUMA_DEBLOCK_ONE 3
-    DIFF_LT     m5, %1, bm, m4, m6
-    pxor        m6, m6
-    mova        %3, m4
-    pcmpgtw     m6, tcm
-    pand        m4, tcm
-    pandn       m6, m7
-    pand        m4, m6
-    LUMA_Q1 m5, %2, m1, m2, m4, m6
-%endmacro
-
-%macro LUMA_H_STORE 2
-%if mmsize == 8
-    movq        [r0-4], m0
-    movq        [r0+r1-4], m1
-    movq        [r0+r1*2-4], m2
-    movq        [r0+%2-4], m3
-%else
-    movq        [r0-4], m0
-    movhps      [r0+r1-4], m0
-    movq        [r0+r1*2-4], m1
-    movhps      [%1-4], m1
-    movq        [%1+r1-4], m2
-    movhps      [%1+r1*2-4], m2
-    movq        [%1+%2-4], m3
-    movhps      [%1+r1*4-4], m3
-%endif
-%endmacro
-
-%macro DEBLOCK_LUMA 0
-;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_luma, 5,5,8,0-5*mmsize
-    %define tcm [rsp]
-    %define ms1 [rsp+mmsize]
-    %define ms2 [rsp+mmsize*2]
-    %define am  [rsp+mmsize*3]
-    %define bm  [rsp+mmsize*4]
-    add         r1, r1
-    LOAD_AB     m4, m5, r2d, r3d
-    mov         r3, 32/mmsize
-    mov         r2, r0
-    sub         r0, r1
-    mova        am, m4
-    sub         r0, r1
-    mova        bm, m5
-    sub         r0, r1
-.loop:
-    mova        m0, [r0+r1]
-    mova        m1, [r0+r1*2]
-    mova        m2, [r2]
-    mova        m3, [r2+r1]
-
-    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
-    LOAD_TC     m6, r4
-    mova       tcm, m6
-
-    mova        m5, [r0]
-    LUMA_DEBLOCK_ONE m1, m0, ms1
-    mova   [r0+r1], m5
-
-    mova        m5, [r2+r1*2]
-    LUMA_DEBLOCK_ONE m2, m3, ms2
-    mova   [r2+r1], m5
-
-    pxor        m5, m5
-    mova        m6, tcm
-    pcmpgtw     m5, tcm
-    psubw       m6, ms1
-    pandn       m5, m7
-    psubw       m6, ms2
-    pand        m5, m6
-    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
-    mova [r0+r1*2], m1
-    mova      [r2], m2
-
-    add         r0, mmsize
-    add         r2, mmsize
-    add         r4, mmsize/8
-    dec         r3
-    jg .loop
-    RET
-
-cglobal deblock_h_luma, 5,6,8,0-7*mmsize
-    %define tcm [rsp]
-    %define ms1 [rsp+mmsize]
-    %define ms2 [rsp+mmsize*2]
-    %define p1m [rsp+mmsize*3]
-    %define p2m [rsp+mmsize*4]
-    %define am  [rsp+mmsize*5]
-    %define bm  [rsp+mmsize*6]
-    add         r1, r1
-    LOAD_AB     m4, m5, r2d, r3d
-    mov         r3, r1
-    mova        am, m4
-    add         r3, r1
-    mov         r5, 32/mmsize
-    mova        bm, m5
-    add         r3, r1
-%if mmsize == 16
-    mov         r2, r0
-    add         r2, r3
-%endif
-.loop:
-%if mmsize == 8
-    movq        m2, [r0-8]     ; y q2 q1 q0
-    movq        m7, [r0+0]
-    movq        m5, [r0+r1-8]
-    movq        m3, [r0+r1+0]
-    movq        m0, [r0+r1*2-8]
-    movq        m6, [r0+r1*2+0]
-    movq        m1, [r0+r3-8]
-    TRANSPOSE4x4W 2, 5, 0, 1, 4
-    SWAP         2, 7
-    movq        m7, [r0+r3]
-    TRANSPOSE4x4W 2, 3, 6, 7, 4
-%else
-    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
-    movu        m0, [r0+r1-8]
-    movu        m2, [r0+r1*2-8]
-    movu        m3, [r2-8]
-    TRANSPOSE4x4W 5, 0, 2, 3, 6
-    mova       tcm, m3
-
-    movu        m4, [r2+r1-8]
-    movu        m1, [r2+r1*2-8]
-    movu        m3, [r2+r3-8]
-    movu        m7, [r2+r1*4-8]
-    TRANSPOSE4x4W 4, 1, 3, 7, 6
-
-    mova        m6, tcm
-    punpcklqdq  m6, m7
-    punpckhqdq  m5, m4
-    SBUTTERFLY qdq, 0, 1, 7
-    SBUTTERFLY qdq, 2, 3, 7
-%endif
-
-    mova       p2m, m6
-    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
-    LOAD_TC     m6, r4
-    mova       tcm, m6
-
-    LUMA_DEBLOCK_ONE m1, m0, ms1
-    mova       p1m, m5
-
-    mova        m5, p2m
-    LUMA_DEBLOCK_ONE m2, m3, ms2
-    mova       p2m, m5
-
-    pxor        m5, m5
-    mova        m6, tcm
-    pcmpgtw     m5, tcm
-    psubw       m6, ms1
-    pandn       m5, m7
-    psubw       m6, ms2
-    pand        m5, m6
-    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
-    mova        m0, p1m
-    mova        m3, p2m
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-    LUMA_H_STORE r2, r3
-
-    add         r4, mmsize/8
-    lea         r0, [r0+r1*(mmsize/2)]
-    lea         r2, [r2+r1*(mmsize/2)]
-    dec         r5
-    jg .loop
-    RET
-%endmacro
-
-%if ARCH_X86_64
-; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
-;      m12=alpha, m13=beta
-; out: m0=p1', m3=q1', m1=p0', m2=q0'
-; clobbers: m4, m5, m6, m7, m10, m11, m14
-%macro DEBLOCK_LUMA_INTER_SSE2 0
-    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
-    LOAD_TC     m6, r4
-    DIFF_LT     m8, m1, m13, m10, m4
-    DIFF_LT     m9, m2, m13, m11, m4
-    pand        m6, m7
-
-    mova       m14, m6
-    pxor        m4, m4
-    pcmpgtw     m6, m4
-    pand        m6, m14
-
-    mova        m5, m10
-    pand        m5, m6
-    LUMA_Q1 m8, m0, m1, m2, m5, m4
-
-    mova        m5, m11
-    pand        m5, m6
-    LUMA_Q1 m9, m3, m1, m2, m5, m4
-
-    pxor        m4, m4
-    psubw       m6, m10
-    pcmpgtw     m4, m14
-    pandn       m4, m7
-    psubw       m6, m11
-    pand        m4, m6
-    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
-
-    SWAP         0, 8
-    SWAP         3, 9
-%endmacro
-
-%macro DEBLOCK_LUMA_64 0
-cglobal deblock_v_luma, 5,5,15
-    %define p2 m8
-    %define p1 m0
-    %define p0 m1
-    %define q0 m2
-    %define q1 m3
-    %define q2 m9
-    %define mask0 m7
-    %define mask1 m10
-    %define mask2 m11
-    add         r1, r1
-    LOAD_AB    m12, m13, r2d, r3d
-    mov         r2, r0
-    sub         r0, r1
-    sub         r0, r1
-    sub         r0, r1
-    mov         r3, 2
-.loop:
-    mova        p2, [r0]
-    mova        p1, [r0+r1]
-    mova        p0, [r0+r1*2]
-    mova        q0, [r2]
-    mova        q1, [r2+r1]
-    mova        q2, [r2+r1*2]
-    DEBLOCK_LUMA_INTER_SSE2
-    mova   [r0+r1], p1
-    mova [r0+r1*2], p0
-    mova      [r2], q0
-    mova   [r2+r1], q1
-    add         r0, mmsize
-    add         r2, mmsize
-    add         r4, 2
-    dec         r3
-    jg .loop
-    RET
-
-cglobal deblock_h_luma, 5,7,15
-    add         r1, r1
-    LOAD_AB    m12, m13, r2d, r3d
-    mov         r2, r1
-    add         r2, r1
-    add         r2, r1
-    mov         r5, r0
-    add         r5, r2
-    mov         r6, 2
-.loop:
-    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
-    movu        m0, [r0+r1-8]
-    movu        m2, [r0+r1*2-8]
-    movu        m9, [r5-8]
-    movu        m5, [r5+r1-8]
-    movu        m1, [r5+r1*2-8]
-    movu        m3, [r5+r2-8]
-    movu        m7, [r5+r1*4-8]
-
-    TRANSPOSE4x4W 8, 0, 2, 9, 10
-    TRANSPOSE4x4W 5, 1, 3, 7, 10
-
-    punpckhqdq  m8, m5
-    SBUTTERFLY qdq, 0, 1, 10
-    SBUTTERFLY qdq, 2, 3, 10
-    punpcklqdq  m9, m7
-
-    DEBLOCK_LUMA_INTER_SSE2
-
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-    LUMA_H_STORE r5, r2
-    add         r4, 2
-    lea         r0, [r0+r1*8]
-    lea         r5, [r5+r1*8]
-    dec         r6
-    jg .loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_LUMA_64
-INIT_XMM avx
-DEBLOCK_LUMA_64
-%endif
-
-%macro SWAPMOVA 2
-%ifnum sizeof%1
-    SWAP %1, %2
-%else
-    mova %1, %2
-%endif
-%endmacro
-
-; in: t0-t2: tmp registers
-;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
-;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
-%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
-%if ARCH_X86_64
-    paddw     t0, %3, %2
-    mova      t2, %4
-    paddw     t2, %3
-%else
-    mova      t0, %3
-    mova      t2, %4
-    paddw     t0, %2
-    paddw     t2, %3
-%endif
-    paddw     t0, %1
-    paddw     t2, t2
-    paddw     t0, %5
-    paddw     t2, %9
-    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
-    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
-
-    psrlw     t2, 3
-    psrlw     t1, t0, 2
-    psubw     t2, %3
-    psubw     t1, %2
-    pand      t2, %8
-    pand      t1, %8
-    paddw     t2, %3
-    paddw     t1, %2
-    SWAPMOVA %11, t1
-
-    psubw     t1, t0, %3
-    paddw     t0, t0
-    psubw     t1, %5
-    psubw     t0, %3
-    paddw     t1, %6
-    paddw     t1, %2
-    paddw     t0, %6
-    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
-    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
-
-    pxor      t0, t1
-    pxor      t1, %1
-    pand      t0, %8
-    pand      t1, %7
-    pxor      t0, t1
-    pxor      t0, %1
-    SWAPMOVA %10, t0
-    SWAPMOVA %12, t2
-%endmacro
-
-%macro LUMA_INTRA_INIT 1
-    %define t0 m4
-    %define t1 m5
-    %define t2 m6
-    %define t3 m7
-    %assign i 4
-%rep %1
-    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
-    %assign i i+1
-%endrep
-    add     r1, r1
-%endmacro
-
-; in: %1-%3=tmp, %4=p2, %5=q2
-%macro LUMA_INTRA_INTER 5
-    LOAD_AB t0, t1, r2d, r3d
-    mova    %1, t0
-    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
-%if ARCH_X86_64
-    mova    %2, t0        ; mask0
-    psrlw   t3, %1, 2
-%else
-    mova    t3, %1
-    mova    %2, t0        ; mask0
-    psrlw   t3, 2
-%endif
-    paddw   t3, [pw_2]    ; alpha/4+2
-    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
-    pand    t2, %2
-    mova    t3, %5        ; q2
-    mova    %1, t2        ; mask1
-    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
-    pand    t2, %1
-    mova    t3, %4        ; p2
-    mova    %3, t2        ; mask1q
-    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
-    pand    t2, %1
-    mova    %1, t2        ; mask1p
-%endmacro
-
-%macro LUMA_H_INTRA_LOAD 0
-%if mmsize == 8
-    movu    t0, [r0-8]
-    movu    t1, [r0+r1-8]
-    movu    m0, [r0+r1*2-8]
-    movu    m1, [r0+r4-8]
-    TRANSPOSE4x4W 4, 5, 0, 1, 2
-    mova    t4, t0        ; p3
-    mova    t5, t1        ; p2
-
-    movu    m2, [r0]
-    movu    m3, [r0+r1]
-    movu    t0, [r0+r1*2]
-    movu    t1, [r0+r4]
-    TRANSPOSE4x4W 2, 3, 4, 5, 6
-    mova    t6, t0        ; q2
-    mova    t7, t1        ; q3
-%else
-    movu    t0, [r0-8]
-    movu    t1, [r0+r1-8]
-    movu    m0, [r0+r1*2-8]
-    movu    m1, [r0+r5-8]
-    movu    m2, [r4-8]
-    movu    m3, [r4+r1-8]
-    movu    t2, [r4+r1*2-8]
-    movu    t3, [r4+r5-8]
-    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
-    mova    t4, t0        ; p3
-    mova    t5, t1        ; p2
-    mova    t6, t2        ; q2
-    mova    t7, t3        ; q3
-%endif
-%endmacro
-
-; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
-%macro LUMA_H_INTRA_STORE 9
-%if mmsize == 8
-    TRANSPOSE4x4W %1, %2, %3, %4, %9
-    movq       [r0-8], m%1
-    movq       [r0+r1-8], m%2
-    movq       [r0+r1*2-8], m%3
-    movq       [r0+r4-8], m%4
-    movq       m%1, %8
-    TRANSPOSE4x4W %5, %6, %7, %1, %9
-    movq       [r0], m%5
-    movq       [r0+r1], m%6
-    movq       [r0+r1*2], m%7
-    movq       [r0+r4], m%1
-%else
-    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
-    movq       [r0-8], m%1
-    movq       [r0+r1-8], m%2
-    movq       [r0+r1*2-8], m%3
-    movq       [r0+r5-8], m%4
-    movhps     [r4-8], m%1
-    movhps     [r4+r1-8], m%2
-    movhps     [r4+r1*2-8], m%3
-    movhps     [r4+r5-8], m%4
-%ifnum %8
-    SWAP       %1, %8
-%else
-    mova       m%1, %8
-%endif
-    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
-    movq       [r0], m%5
-    movq       [r0+r1], m%6
-    movq       [r0+r1*2], m%7
-    movq       [r0+r5], m%1
-    movhps     [r4], m%5
-    movhps     [r4+r1], m%6
-    movhps     [r4+r1*2], m%7
-    movhps     [r4+r5], m%1
-%endif
-%endmacro
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA_INTRA_64 0
-cglobal deblock_v_luma_intra, 4,7,16
-    %define t0 m1
-    %define t1 m2
-    %define t2 m4
-    %define p2 m8
-    %define p1 m9
-    %define p0 m10
-    %define q0 m11
-    %define q1 m12
-    %define q2 m13
-    %define aa m5
-    %define bb m14
-    add     r1, r1
-    lea     r4, [r1*4]
-    lea     r5, [r1*3] ; 3*stride
-    neg     r4
-    add     r4, r0     ; pix-4*stride
-    mov     r6, 2
-    mova    m0, [pw_2]
-    LOAD_AB aa, bb, r2d, r3d
-.loop:
-    mova    p2, [r4+r1]
-    mova    p1, [r4+2*r1]
-    mova    p0, [r4+r5]
-    mova    q0, [r0]
-    mova    q1, [r0+r1]
-    mova    q2, [r0+2*r1]
-
-    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
-    mova    t2, aa
-    psrlw   t2, 2
-    paddw   t2, m0 ; alpha/4+2
-    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
-    DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta
-    DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta
-    pand    m6, m3
-    pand    m7, m6
-    pand    m6, t1
-    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
-    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
-    add     r0, mmsize
-    add     r4, mmsize
-    dec     r6
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7,16
-    %define t0 m15
-    %define t1 m14
-    %define t2 m2
-    %define q3 m5
-    %define q2 m8
-    %define q1 m9
-    %define q0 m10
-    %define p0 m11
-    %define p1 m12
-    %define p2 m13
-    %define p3 m4
-    %define spill [rsp]
-    %assign pad 24-(stack_offset&15)
-    SUB     rsp, pad
-    add     r1, r1
-    lea     r4, [r1*4]
-    lea     r5, [r1*3] ; 3*stride
-    add     r4, r0     ; pix+4*stride
-    mov     r6, 2
-    mova    m0, [pw_2]
-.loop:
-    movu    q3, [r0-8]
-    movu    q2, [r0+r1-8]
-    movu    q1, [r0+r1*2-8]
-    movu    q0, [r0+r5-8]
-    movu    p0, [r4-8]
-    movu    p1, [r4+r1-8]
-    movu    p2, [r4+r1*2-8]
-    movu    p3, [r4+r5-8]
-    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
-
-    LOAD_AB m1, m2, r2d, r3d
-    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
-    psrlw   m1, 2
-    paddw   m1, m0 ; alpha/4+2
-    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
-    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
-    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
-    pand    m6, m3
-    pand    m7, m6
-    pand    m6, t1
-
-    mova spill, q3
-    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
-    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
-    mova    m7, spill
-
-    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
-
-    lea     r0, [r0+r1*8]
-    lea     r4, [r4+r1*8]
-    dec     r6
-    jg .loop
-    ADD    rsp, pad
-    RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_LUMA_INTRA_64
-INIT_XMM avx
-DEBLOCK_LUMA_INTRA_64
-
-%endif
-
-%macro DEBLOCK_LUMA_INTRA 0
-;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize
-    LUMA_INTRA_INIT 3
-    lea     r4, [r1*4]
-    lea     r5, [r1*3]
-    neg     r4
-    add     r4, r0
-    mov     r6, 32/mmsize
-.loop:
-    mova    m0, [r4+r1*2] ; p1
-    mova    m1, [r4+r5]   ; p0
-    mova    m2, [r0]      ; q0
-    mova    m3, [r0+r1]   ; q1
-    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
-    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
-    mova    t3, [r0+r1*2] ; q2
-    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
-    add     r0, mmsize
-    add     r4, mmsize
-    dec     r6
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize
-    LUMA_INTRA_INIT 8
-%if mmsize == 8
-    lea     r4, [r1*3]
-    mov     r5, 32/mmsize
-%else
-    lea     r4, [r1*4]
-    lea     r5, [r1*3] ; 3*stride
-    add     r4, r0     ; pix+4*stride
-    mov     r6, 32/mmsize
-%endif
-.loop:
-    LUMA_H_INTRA_LOAD
-    LUMA_INTRA_INTER t8, t9, t10, t5, t6
-
-    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
-    mova    t3, t6     ; q2
-    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
-
-    mova    m2, t4
-    mova    m0, t11
-    mova    m1, t5
-    mova    m3, t8
-    mova    m6, t6
-
-    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
-
-    lea     r0, [r0+r1*(mmsize/2)]
-%if mmsize == 8
-    dec     r5
-%else
-    lea     r4, [r4+r1*(mmsize/2)]
-    dec     r6
-%endif
-    jg .loop
-    RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DEBLOCK_LUMA
-DEBLOCK_LUMA_INTRA
-INIT_XMM sse2
-DEBLOCK_LUMA
-DEBLOCK_LUMA_INTRA
-INIT_XMM avx
-DEBLOCK_LUMA
-DEBLOCK_LUMA_INTRA
-%endif
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
-    [base], [base+stride], [base+stride*2], [base3], \
-    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
-%define PASS8ROWS(base, base3, stride, stride3, offset) \
-    PASS8ROWS(base+offset, base3+offset, stride, stride3)
-
-; in: 4 rows of 8 bytes in m0..m3
-; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4B_STORE 8
-    punpckhdq  m4, m0, m0
-    punpckhdq  m5, m1, m1
-    punpckhdq  m6, m2, m2
-
-    punpcklbw  m0, m1
-    punpcklbw  m2, m3
-    punpcklwd  m1, m0, m2
-    punpckhwd  m0, m2
-    movd       %1, m1
-    punpckhdq  m1, m1
-    movd       %2, m1
-    movd       %3, m0
-    punpckhdq  m0, m0
-    movd       %4, m0
-
-    punpckhdq  m3, m3
-    punpcklbw  m4, m5
-    punpcklbw  m6, m3
-    punpcklwd  m5, m4, m6
-    punpckhwd  m4, m6
-    movd       %5, m5
-    punpckhdq  m5, m5
-    movd       %6, m5
-    movd       %7, m4
-    punpckhdq  m4, m4
-    movd       %8, m4
-%endmacro
-
-; in: 8 rows of 4 bytes in %9..%10
-; out: 8 rows of 4 bytes in %1..%8
-%macro STORE_8x4B 10
-    movd   %1, %9
-    pextrd %2, %9, 1
-    pextrd %3, %9, 2
-    pextrd %4, %9, 3
-    movd   %5, %10
-    pextrd %6, %10, 1
-    pextrd %7, %10, 2
-    pextrd %8, %10, 3
-%endmacro
-
-; in: 4 rows of 4 words in %1..%4
-; out: 4 rows of 4 word in m0..m3
-; clobbers: m4
-%macro TRANSPOSE4x4W_LOAD 4-8
-%if mmsize==8
-    SWAP  1, 4, 2, 3
-    movq  m0, %1
-    movq  m1, %2
-    movq  m2, %3
-    movq  m3, %4
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-%else
-    movq       m0, %1
-    movq       m2, %2
-    movq       m1, %3
-    movq       m3, %4
-    punpcklwd  m0, m2
-    punpcklwd  m1, m3
-    mova       m2, m0
-    punpckldq  m0, m1
-    punpckhdq  m2, m1
-    MOVHL      m1, m0
-    MOVHL      m3, m2
-%endif
-%endmacro
-
-; in: 2 rows of 4 words in m1..m2
-; out: 4 rows of 2 words in %1..%4
-; clobbers: m0, m1
-%macro TRANSPOSE4x2W_STORE 4-8
-%if mmsize==8
-    punpckhwd  m0, m1, m2
-    punpcklwd  m1, m2
-%else
-    punpcklwd  m1, m2
-    MOVHL      m0, m1
-%endif
-    movd       %3, m0
-    movd       %1, m1
-    psrlq      m1, 32
-    psrlq      m0, 32
-    movd       %2, m1
-    movd       %4, m0
-%endmacro
-
-; in: 4/8 rows of 4 words in %1..%8
-; out: 4 rows of 4/8 word in m0..m3
-; clobbers: m4, m5, m6, m7
-%macro TRANSPOSE4x8W_LOAD 8
-%if mmsize==8
-    TRANSPOSE4x4W_LOAD %1, %2, %3, %4
-%else
-    movq       m0, %1
-    movq       m2, %2
-    movq       m1, %3
-    movq       m3, %4
-    punpcklwd  m0, m2
-    punpcklwd  m1, m3
-    mova       m2, m0
-    punpckldq  m0, m1
-    punpckhdq  m2, m1
-
-    movq       m4, %5
-    movq       m6, %6
-    movq       m5, %7
-    movq       m7, %8
-    punpcklwd  m4, m6
-    punpcklwd  m5, m7
-    mova       m6, m4
-    punpckldq  m4, m5
-    punpckhdq  m6, m5
-
-    punpckhqdq m1, m0, m4
-    punpckhqdq m3, m2, m6
-    punpcklqdq m0, m4
-    punpcklqdq m2, m6
-%endif
-%endmacro
-
-; in: 2 rows of 4/8 words in m1..m2
-; out: 4/8 rows of 2 words in %1..%8
-; clobbers: m0, m1
-%macro TRANSPOSE8x2W_STORE 8
-%if mmsize==8
-    TRANSPOSE4x2W_STORE %1, %2, %3, %4
-%else
-    punpckhwd  m0, m1, m2
-    punpcklwd  m1, m2
-    movd       %5, m0
-    movd       %1, m1
-    psrldq     m1, 4
-    psrldq     m0, 4
-    movd       %2, m1
-    movd       %6, m0
-    psrldq     m1, 4
-    psrldq     m0, 4
-    movd       %3, m1
-    movd       %7, m0
-    psrldq     m1, 4
-    psrldq     m0, 4
-    movd       %4, m1
-    movd       %8, m0
-%endif
-%endmacro
-
-%macro SBUTTERFLY3 4
-    punpckh%1  %4, %2, %3
-    punpckl%1  %2, %3
-%endmacro
-
-; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
-; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
-%macro TRANSPOSE6x8_MEM 9
-    RESET_MM_PERMUTATION
-%if cpuflag(avx)
-    ; input:
-    ; _ABCDEF_
-    ; _GHIJKL_
-    ; _MNOPQR_
-    ; _STUVWX_
-    ; _YZabcd_
-    ; _efghij_
-    ; _klmnop_
-    ; _qrstuv_
-
-    movh      m0, %1
-    movh      m2, %2
-    movh      m1, %3
-    movh      m3, %4
-    punpcklbw m0, m2       ; __ AG BH CI DJ EK FL __
-    punpcklbw m1, m3       ; __ MS NT OU PV QW RX __
-    movh      m2, %5
-    movh      m3, %6
-    punpcklbw m2, m3       ; __ Ye Zf ag bh ci dj __
-    movh      m3, %7
-    movh      m4, %8
-    punpcklbw m3, m4       ; __ kq lr ms nt ou pv __
-
-    SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU
-                           ; DJ PV EK QW FL RX __ __
-    SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms
-                           ; bh nt ci ou dj pv __ __
-    SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq
-                           ; BH NT Zf lr CI FL OU RX
-    SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr
-                           ; FL RX dj pv __ __ __ __
-    movhps [%9+0x00], m0
-    movh   [%9+0x10], m2
-    movhps [%9+0x20], m2
-    movh   [%9+0x30], m1
-    movhps [%9+0x40], m1
-    movh   [%9+0x50], m3
-%else
-    movq  m0, %1
-    movq  m1, %2
-    movq  m2, %3
-    movq  m3, %4
-    movq  m4, %5
-    movq  m5, %6
-    movq  m6, %7
-    SBUTTERFLY bw, 0, 1, 7
-    SBUTTERFLY bw, 2, 3, 7
-    SBUTTERFLY bw, 4, 5, 7
-    movq  [%9+0x10], m3
-    SBUTTERFLY3 bw, m6, %8, m7
-    SBUTTERFLY wd, 0, 2, 3
-    SBUTTERFLY wd, 4, 6, 3
-    punpckhdq m0, m4
-    movq  [%9+0x00], m0
-    SBUTTERFLY3 wd, m1, [%9+0x10], m3
-    SBUTTERFLY wd, 5, 7, 0
-    SBUTTERFLY dq, 1, 5, 0
-    SBUTTERFLY dq, 2, 6, 0
-    punpckldq m3, m7
-    movq  [%9+0x10], m2
-    movq  [%9+0x20], m6
-    movq  [%9+0x30], m1
-    movq  [%9+0x40], m5
-    movq  [%9+0x50], m3
-%endif
-    RESET_MM_PERMUTATION
-%endmacro
-
-
-; in: 8 rows of 8 in %1..%8
-; out: 8 rows of 8 in %9..%16
-%macro TRANSPOSE8x8_MEM 16
-    RESET_MM_PERMUTATION
-%if cpuflag(avx)
-    movh      m0, %1
-    movh      m4, %2
-    movh      m1, %3
-    movh      m5, %4
-    movh      m2, %5
-    movh      m3, %7
-    punpcklbw m0, m4
-    punpcklbw m1, m5
-    movh      m4, %6
-    movh      m5, %8
-    punpcklbw m2, m4
-    punpcklbw m3, m5
-    SBUTTERFLY wd, 0, 1, 4
-    SBUTTERFLY wd, 2, 3, 4
-    SBUTTERFLY dq, 0, 2, 4
-    SBUTTERFLY dq, 1, 3, 4
-    movh    %9, m0
-    movhps %10, m0
-    movh   %11, m2
-    movhps %12, m2
-    movh   %13, m1
-    movhps %14, m1
-    movh   %15, m3
-    movhps %16, m3
-%else
-    movq  m0, %1
-    movq  m1, %2
-    movq  m2, %3
-    movq  m3, %4
-    movq  m4, %5
-    movq  m5, %6
-    movq  m6, %7
-    SBUTTERFLY bw, 0, 1, 7
-    SBUTTERFLY bw, 2, 3, 7
-    SBUTTERFLY bw, 4, 5, 7
-    SBUTTERFLY3 bw, m6, %8, m7
-    movq  %9,  m5
-    SBUTTERFLY wd, 0, 2, 5
-    SBUTTERFLY wd, 4, 6, 5
-    SBUTTERFLY wd, 1, 3, 5
-    movq  %11, m6
-    movq  m6,  %9
-    SBUTTERFLY wd, 6, 7, 5
-    SBUTTERFLY dq, 0, 4, 5
-    SBUTTERFLY dq, 1, 6, 5
-    movq  %9,  m0
-    movq  %10, m4
-    movq  %13, m1
-    movq  %14, m6
-    SBUTTERFLY3 dq, m2, %11, m0
-    SBUTTERFLY dq, 3, 7, 4
-    movq  %11, m2
-    movq  %12, m0
-    movq  %15, m3
-    movq  %16, m7
-%endif
-    RESET_MM_PERMUTATION
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT 5
-%if avx_enabled == 0
-    mova    %5, %2
-    mova    %4, %1
-    psubusb %5, %1
-    psubusb %4, %2
-%else
-    psubusb %5, %2, %1
-    psubusb %4, %1, %2
-%endif
-    por     %4, %5
-    psubusb %4, %3
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT2 5-6
-%if %0<6
-    psubusb %4, %1, %2
-    psubusb %5, %2, %1
-%else
-    mova    %4, %1
-    mova    %5, %2
-    psubusb %4, %2
-    psubusb %5, %1
-%endif
-    psubusb %5, %3
-    psubusb %4, %3
-    pcmpeqb %4, %5
-%endmacro
-
-; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha %2=beta
-; out: m5=beta-1, m7=mask, %3=alpha-1
-; clobbers: m4,m6
-%macro LOAD_MASK 2-3
-%if cpuflag(ssse3)
-    movd     m4, %1
-    movd     m5, %2
-    pxor     m6, m6
-    pshufb   m4, m6
-    pshufb   m5, m6
-%else
-    movd     m4, %1
-    movd     m5, %2
-    punpcklbw m4, m4
-    punpcklbw m5, m5
-    SPLATW   m4, m4
-    SPLATW   m5, m5
-%endif
-    mova     m6, [pb_1]
-    psubusb  m4, m6              ; alpha - 1
-    psubusb  m5, m6              ; beta - 1
-%if %0>2
-    mova     %3, m4
-%endif
-    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
-    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
-    por      m7, m4
-    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
-    por      m7, m4
-    pxor     m6, m6
-    pcmpeqb  m7, m6
-%endmacro
-
-; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
-; out: m1=p0' m2=q0'
-; clobbers: m0,3-6
-%macro DEBLOCK_P0_Q0 0
-    pxor    m5, m1, m2   ; p0^q0
-    pand    m5, [pb_1]   ; (p0^q0)&1
-    pcmpeqb m4, m4
-    pxor    m3, m4
-    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
-    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
-    pxor    m4, m1
-    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
-    pavgb   m3, m5
-    paddusb m3, m4       ; d+128+33
-    mova    m6, [pb_a1]
-    psubusb m6, m3
-    psubusb m3, [pb_a1]
-    pminub  m6, m7
-    pminub  m3, m7
-    psubusb m1, m6
-    psubusb m2, m3
-    paddusb m1, m3
-    paddusb m2, m6
-%endmacro
-
-; in: m1=p0 m2=q0
-;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
-; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-; clobbers: q2, tmp, tc0
-%macro LUMA_Q1 6
-    pavgb   %6, m1, m2
-    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
-    pxor    %6, %3
-    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
-    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
-    psubusb %6, %1, %5
-    paddusb %5, %1
-    pmaxub  %2, %6
-    pminub  %2, %5
-    mova    %4, %2
-%endmacro
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma, 5,5,10
-    movd    m8, [r4] ; tc0
-    lea     r4, [r1*3]
-    neg     r4
-    add     r4, r0     ; pix-3*stride
-
-    mova    m0, [r4+r1]   ; p1
-    mova    m1, [r4+2*r1] ; p0
-    mova    m2, [r0]      ; q0
-    mova    m3, [r0+r1]   ; q1
-    LOAD_MASK r2d, r3d
-
-%if cpuflag(avx)
-    pshufb   m8, [pb_unpackbd1]
-    pblendvb m9, m7, m6, m8
-%else
-    punpcklbw m8, m8
-    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
-    pcmpeqb m9, m9
-    pcmpeqb m9, m8
-    pandn   m9, m7
-%endif
-    pand    m8, m9
-
-    mova    m3, [r4] ; p2
-    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
-    pand    m6, m9
-    psubb   m7, m8, m6 ; tc++
-    pand    m6, m8
-    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
-
-    mova    m4, [r0+2*r1] ; q2
-    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
-    pand    m6, m9
-    pand    m8, m6
-    psubb   m7, m6
-    mova    m3, [r0+r1]
-    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
-
-    DEBLOCK_P0_Q0
-    mova    [r4+2*r1], m1
-    mova    [r0], m2
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-
-%if cpuflag(avx)
-INIT_XMM cpuname
-%else
-INIT_MMX cpuname
-%endif
-cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64
-    lea    r8, [r1*3]
-    lea    r6, [r0-4]
-    lea    r5, [r0-4+r8]
-    %xdefine pix_tmp rsp+0x30*WIN64 ; shadow space + r4
-
-    ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r1, r8), pix_tmp
-    lea    r6, [r6+r1*8]
-    lea    r5, [r5+r1*8]
-    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r1, r8), pix_tmp+8
-
-    ; vertical filter
-    ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
-    mov    r7, r1
-    lea    r0, [pix_tmp+0x30]
-    mov    r1d, 0x10
-%if WIN64
-    mov    [rsp+0x20], r4
-%endif
-    call   deblock_v_luma
-
-    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
-    add    r6, 2
-    add    r5, 2
-%if cpuflag(sse4)
-    mova   m0, [pix_tmp+0x10]
-    mova   m1, [pix_tmp+0x20]
-    mova   m2, [pix_tmp+0x30]
-    mova   m3, [pix_tmp+0x40]
-    SBUTTERFLY bw, 0, 1, 4
-    SBUTTERFLY bw, 2, 3, 4
-    SBUTTERFLY wd, 0, 2, 4
-    SBUTTERFLY wd, 1, 3, 4
-    STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m1, m3
-    shl    r7, 3
-    sub    r6, r7
-    sub    r5, r7
-    shr    r7, 3
-    STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m0, m2
-%else
-    movq   m0, [pix_tmp+0x18]
-    movq   m1, [pix_tmp+0x28]
-    movq   m2, [pix_tmp+0x38]
-    movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)
-
-    shl    r7, 3
-    sub    r6, r7
-    sub    r5, r7
-    shr    r7, 3
-    movq   m0, [pix_tmp+0x10]
-    movq   m1, [pix_tmp+0x20]
-    movq   m2, [pix_tmp+0x30]
-    movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)
-%endif
-
-    RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_LUMA
-INIT_XMM avx
-DEBLOCK_LUMA
-
-%else
-
-%macro DEBLOCK_LUMA 2
-;-----------------------------------------------------------------------------
-; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma, 5,5,8,2*%2
-    lea     r4, [r1*3]
-    neg     r4
-    add     r4, r0 ; pix-3*stride
-
-    mova    m0, [r4+r1]   ; p1
-    mova    m1, [r4+2*r1] ; p0
-    mova    m2, [r0]      ; q0
-    mova    m3, [r0+r1]   ; q1
-    LOAD_MASK r2d, r3d
-
-    mov     r3, r4mp
-    movd    m4, [r3] ; tc0
-%if cpuflag(avx)
-    pshufb   m4, [pb_unpackbd1]
-    mova   [esp+%2], m4 ; tc
-    pblendvb m4, m7, m6, m4
-%else
-    punpcklbw m4, m4
-    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
-    mova   [esp+%2], m4 ; tc
-    pcmpeqb m3, m3
-    pcmpgtb m4, m3
-    pand    m4, m7
-%endif
-    mova   [esp], m4 ; mask
-
-    mova    m3, [r4] ; p2
-    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
-    pand    m6, m4
-    pand    m4, [esp+%2] ; tc
-    psubb   m7, m4, m6
-    pand    m6, m4
-    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
-
-    mova    m4, [r0+2*r1] ; q2
-    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
-    mova    m5, [esp] ; mask
-    pand    m6, m5
-    mova    m5, [esp+%2] ; tc
-    pand    m5, m6
-    psubb   m7, m6
-    mova    m3, [r0+r1]
-    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
-
-    DEBLOCK_P0_Q0
-    mova    [r4+2*r1], m1
-    mova    [r0], m2
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-%if cpuflag(avx)
-INIT_XMM cpuname
-%else
-INIT_MMX cpuname
-%endif
-cglobal deblock_h_luma, 1,5,8,0x60+12
-    mov    r3, r1m
-    lea    r4, [r3*3]
-    sub    r0, 4
-    lea    r1, [r0+r4]
-    %define pix_tmp esp+12
-    ; esp is intentionally misaligned to make it aligned after pushing the arguments for deblock_%1_luma.
-
-    ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
-    lea    r0, [r0+r3*8]
-    lea    r1, [r1+r3*8]
-    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
-
-    ; vertical filter
-    lea    r0, [pix_tmp+0x30]
-    PUSH   dword r4m
-    PUSH   dword r3m
-    PUSH   dword r2m
-    PUSH   dword 16
-    PUSH   dword r0
-    call   deblock_%1_luma
-%ifidn %1, v8
-    add    dword [esp   ], 8 ; pix_tmp+0x38
-    add    dword [esp+16], 2 ; tc0+2
-    call   deblock_%1_luma
-%endif
-    ADD    esp, 20
-
-    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
-    mov    r0, r0mp
-    sub    r0, 2
-    lea    r1, [r0+r4]
-
-%if cpuflag(avx)
-    mova   m0, [pix_tmp+0x10]
-    mova   m1, [pix_tmp+0x20]
-    mova   m2, [pix_tmp+0x30]
-    mova   m3, [pix_tmp+0x40]
-    SBUTTERFLY bw, 0, 1, 4
-    SBUTTERFLY bw, 2, 3, 4
-    SBUTTERFLY wd, 0, 2, 4
-    SBUTTERFLY wd, 1, 3, 4
-    STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m0, m2
-    lea    r0, [r0+r3*8]
-    lea    r1, [r1+r3*8]
-    STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m1, m3
-%else
-    movq   m0, [pix_tmp+0x10]
-    movq   m1, [pix_tmp+0x20]
-    movq   m2, [pix_tmp+0x30]
-    movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
-
-    lea    r0, [r0+r3*8]
-    lea    r1, [r1+r3*8]
-    movq   m0, [pix_tmp+0x18]
-    movq   m1, [pix_tmp+0x28]
-    movq   m2, [pix_tmp+0x38]
-    movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
-%endif
-
-    RET
-%endmacro ; DEBLOCK_LUMA
-
-INIT_MMX mmx2
-DEBLOCK_LUMA v8, 8
-INIT_XMM sse2
-DEBLOCK_LUMA v, 16
-INIT_XMM avx
-DEBLOCK_LUMA v, 16
-
-%endif ; ARCH
-
-
-
-%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
-%if ARCH_X86_64
-    pavgb t0, p2, p1
-    pavgb t1, p0, q0
-%else
-    mova  t0, p2
-    mova  t1, p0
-    pavgb t0, p1
-    pavgb t1, q0
-%endif
-    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
-    mova  t5, t1
-%if ARCH_X86_64
-    paddb t2, p2, p1
-    paddb t3, p0, q0
-%else
-    mova  t2, p2
-    mova  t3, p0
-    paddb t2, p1
-    paddb t3, q0
-%endif
-    paddb t2, t3
-    mova  t3, t2
-    mova  t4, t2
-    psrlw t2, 1
-    pavgb t2, mpb_0
-    pxor  t2, t0
-    pand  t2, mpb_1
-    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
-
-%if ARCH_X86_64
-    pavgb t1, p2, q1
-    psubb t2, p2, q1
-%else
-    mova  t1, p2
-    mova  t2, p2
-    pavgb t1, q1
-    psubb t2, q1
-%endif
-    paddb t3, t3
-    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
-    pand  t2, mpb_1
-    psubb t1, t2
-    pavgb t1, p1
-    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
-    psrlw t3, 2
-    pavgb t3, mpb_0
-    pxor  t3, t1
-    pand  t3, mpb_1
-    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
-
-    pxor  t3, p0, q1
-    pavgb t2, p0, q1
-    pand  t3, mpb_1
-    psubb t2, t3
-    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
-
-    pxor  t1, t2
-    pxor  t2, p0
-    pand  t1, mask1p
-    pand  t2, mask0
-    pxor  t1, t2
-    pxor  t1, p0
-    mova  %1, t1 ; store p0
-
-    mova  t1, %4 ; p3
-    paddb t2, t1, p2
-    pavgb t1, p2
-    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
-    paddb t2, t2
-    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
-    psrlw t2, 2
-    pavgb t2, mpb_0
-    pxor  t2, t1
-    pand  t2, mpb_1
-    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
-
-    pxor  t0, p1
-    pxor  t1, p2
-    pand  t0, mask1p
-    pand  t1, mask1p
-    pxor  t0, p1
-    pxor  t1, p2
-    mova  %2, t0 ; store p1
-    mova  %3, t1 ; store p2
-%endmacro
-
-%macro LUMA_INTRA_SWAP_PQ 0
-    %define q1 m0
-    %define q0 m1
-    %define p0 m2
-    %define p1 m3
-    %define p2 q2
-    %define mask1p mask1q
-%endmacro
-
-%macro DEBLOCK_LUMA_INTRA 1
-    %define p1 m0
-    %define p0 m1
-    %define q0 m2
-    %define q1 m3
-    %define t0 m4
-    %define t1 m5
-    %define t2 m6
-    %define t3 m7
-%if ARCH_X86_64
-    %define p2 m8
-    %define q2 m9
-    %define t4 m10
-    %define t5 m11
-    %define mask0 m12
-    %define mask1p m13
-%if WIN64
-    %define mask1q [rsp]
-%else
-    %define mask1q [rsp-24]
-%endif
-    %define mpb_0 m14
-    %define mpb_1 m15
-%else
-    %define spill(x) [esp+16*x]
-    %define p2 [r4+r1]
-    %define q2 [r0+2*r1]
-    %define t4 spill(0)
-    %define t5 spill(1)
-    %define mask0 spill(2)
-    %define mask1p spill(3)
-    %define mask1q spill(4)
-    %define mpb_0 [pb_0]
-    %define mpb_1 [pb_1]
-%endif
-
-;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10
-    lea     r4, [r1*4]
-    lea     r5, [r1*3] ; 3*stride
-    neg     r4
-    add     r4, r0     ; pix-4*stride
-    mova    p1, [r4+2*r1]
-    mova    p0, [r4+r5]
-    mova    q0, [r0]
-    mova    q1, [r0+r1]
-%if ARCH_X86_64
-    pxor    mpb_0, mpb_0
-    mova    mpb_1, [pb_1]
-    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
-    SWAP    7, 12 ; m12=mask0
-    pavgb   t5, mpb_0
-    pavgb   t5, mpb_1 ; alpha/4+1
-    movdqa  p2, [r4+r1]
-    movdqa  q2, [r0+2*r1]
-    DIFF_GT2 p0, q0, t5, t0, t3    ; t0 = |p0-q0| > alpha/4+1
-    DIFF_GT2 p0, p2, m5, t2, t5, 1 ; mask1 = |p2-p0| > beta-1
-    DIFF_GT2 q0, q2, m5, t4, t5, 1 ; t4 = |q2-q0| > beta-1
-    pand    t0, mask0
-    pand    t4, t0
-    pand    t2, t0
-    mova    mask1q, t4
-    mova    mask1p, t2
-%else
-    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
-    mova    m4, t5
-    mova    mask0, m7
-    pavgb   m4, [pb_0]
-    pavgb   m4, [pb_1] ; alpha/4+1
-    DIFF_GT2 p0, q0, m4, m6, m7    ; m6 = |p0-q0| > alpha/4+1
-    pand    m6, mask0
-    DIFF_GT2 p0, p2, m5, m4, m7, 1 ; m4 = |p2-p0| > beta-1
-    pand    m4, m6
-    mova    mask1p, m4
-    DIFF_GT2 q0, q2, m5, m4, m7, 1 ; m4 = |q2-q0| > beta-1
-    pand    m4, m6
-    mova    mask1q, m4
-%endif
-    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
-    LUMA_INTRA_SWAP_PQ
-    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
-.end:
-    REP_RET
-
-%if cpuflag(avx)
-INIT_XMM cpuname
-%else
-INIT_MMX cpuname
-%endif
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,9,0,0x80
-    lea    r8, [r1*3]
-    lea    r6, [r0-4]
-    lea    r5, [r0-4+r8]
-%if WIN64
-    %define pix_tmp rsp+0x20 ; shadow space
-%else
-    %define pix_tmp rsp
-%endif
-
-    ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    r6, [r6+r1*8]
-    lea    r5, [r5+r1*8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
-
-    mov    r7, r1
-    lea    r0, [pix_tmp+0x40]
-    mov    r1, 0x10
-    call   deblock_v_luma_intra
-
-    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea    r5, [r6+r8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    shl    r7, 3
-    sub    r6, r7
-    sub    r5, r7
-    shr    r7, 3
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    RET
-%else
-cglobal deblock_h_luma_intra, 2,4,8,0x80
-    lea    r3,  [r1*3]
-    sub    r0,  4
-    lea    r2,  [r0+r3]
-    %define pix_tmp rsp
-
-    ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    r0,  [r0+r1*8]
-    lea    r2,  [r2+r1*8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
-
-    lea    r0,  [pix_tmp+0x40]
-    PUSH   dword r3m
-    PUSH   dword r2m
-    PUSH   dword 16
-    PUSH   r0
-    call   deblock_%1_luma_intra
-%ifidn %1, v8
-    add    dword [rsp], 8 ; pix_tmp+8
-    call   deblock_%1_luma_intra
-%endif
-    ADD    esp, 16
-
-    mov    r1,  r1m
-    mov    r0,  r0mp
-    lea    r3,  [r1*3]
-    sub    r0,  4
-    lea    r2,  [r0+r3]
-    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
-    lea    r0,  [r0+r1*8]
-    lea    r2,  [r2+r1*8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
-    RET
-%endif ; ARCH_X86_64
-%endmacro ; DEBLOCK_LUMA_INTRA
-
-INIT_XMM sse2
-DEBLOCK_LUMA_INTRA v
-INIT_XMM avx
-DEBLOCK_LUMA_INTRA v
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DEBLOCK_LUMA_INTRA v8
-%endif
-%endif ; !HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
-; out: %1=p0', %2=q0'
-%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
-    mova    %6, [pw_2]
-    paddw   %6, %3
-    paddw   %6, %4
-    paddw   %7, %6, %2
-    paddw   %6, %1
-    paddw   %6, %3
-    paddw   %7, %4
-    psraw   %6, 2
-    psraw   %7, 2
-    psubw   %6, %1
-    psubw   %7, %2
-    pand    %6, %5
-    pand    %7, %5
-    paddw   %1, %6
-    paddw   %2, %7
-%endmacro
-
-; out: m0-m3
-; clobbers: m4-m7
-%macro CHROMA_H_LOAD 0-1
-    movq        m0, [r0-8] ; p1 p1 p0 p0
-    movq        m2, [r0]   ; q0 q0 q1 q1
-    movq        m5, [r0+r1-8]
-    movq        m7, [r0+r1]
-%if mmsize == 8
-    mova        m1, m0
-    mova        m3, m2
-    punpckldq   m0, m5 ; p1
-    punpckhdq   m1, m5 ; p0
-    punpckldq   m2, m7 ; q0
-    punpckhdq   m3, m7 ; q1
-%else
-    movq        m4, [r0+r1*2-8]
-    movq        m6, [r0+r1*2]
-    movq        m1, [r0+%1-8]
-    movq        m3, [r0+%1]
-    punpckldq   m0, m5 ; p1 ... p0 ...
-    punpckldq   m2, m7 ; q0 ... q1 ...
-    punpckldq   m4, m1
-    punpckldq   m6, m3
-    punpckhqdq  m1, m0, m4 ; p0
-    punpcklqdq  m0, m4 ; p1
-    punpckhqdq  m3, m2, m6 ; q1
-    punpcklqdq  m2, m6 ; q0
-%endif
-%endmacro
-
-%macro CHROMA_V_LOAD 1
-    mova        m0, [r0]    ; p1
-    mova        m1, [r0+r1] ; p0
-    mova        m2, [%1]    ; q0
-    mova        m3, [%1+r1] ; q1
-%endmacro
-
-; clobbers: m1, m2, m3
-%macro CHROMA_H_STORE 0-1
-    SBUTTERFLY dq, 1, 2, 3
-%if mmsize == 8
-    movq      [r0-4], m1
-    movq   [r0+r1-4], m2
-%else
-    movq      [r0-4], m1
-    movq [r0+r1*2-4], m2
-    movhps [r0+r1-4], m1
-    movhps [r0+%1-4], m2
-%endif
-%endmacro
-
-%macro CHROMA_V_STORE 0
-    mova [r0+1*r1], m1
-    mova [r0+2*r1], m2
-%endmacro
-
-%macro DEBLOCK_CHROMA 0
-cglobal deblock_inter_body
-    LOAD_AB     m4, m5, r2d, r3d
-    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
-    pxor        m4, m4
-    LOAD_TC     m6, r4
-    pmaxsw      m6, m4
-    pand        m7, m6
-    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
-    ret
-
-;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma, 5,7,8
-    FIX_STRIDES r1
-    mov         r5, r0
-    sub         r0, r1
-    sub         r0, r1
-    mov         r6, 32/mmsize
-.loop:
-    CHROMA_V_LOAD r5
-    call        deblock_inter_body
-    CHROMA_V_STORE
-    add         r0, mmsize
-    add         r5, mmsize
-    add         r4, mmsize/8
-    dec         r6
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma, 5,7,8
-    add         r1, r1
-    mov         r5, 32/mmsize
-%if mmsize == 16
-    lea         r6, [r1*3]
-%endif
-.loop:
-    CHROMA_H_LOAD r6
-    call        deblock_inter_body
-    CHROMA_H_STORE r6
-    lea         r0, [r0+r1*(mmsize/4)]
-    add         r4, mmsize/8
-    dec         r5
-    jg .loop
-    RET
-
-
-cglobal deblock_intra_body
-    LOAD_AB     m4, m5, r2d, r3d
-    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
-    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
-    ret
-
-;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra, 4,6,8
-    add         r1, r1
-    mov         r5, 32/mmsize
-    movd        m5, r3d
-    mov         r4, r0
-    sub         r0, r1
-    sub         r0, r1
-    SPLATW      m5, m5
-.loop:
-    CHROMA_V_LOAD r4
-    call        deblock_intra_body
-    CHROMA_V_STORE
-    add         r0, mmsize
-    add         r4, mmsize
-    dec         r5
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra, 4,6,8
-    add         r1, r1
-    mov         r4, 32/mmsize
-%if mmsize == 16
-    lea         r5, [r1*3]
-%endif
-.loop:
-    CHROMA_H_LOAD r5
-    call        deblock_intra_body
-    CHROMA_H_STORE r5
-    lea         r0, [r0+r1*(mmsize/4)]
-    dec         r4
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_mbaff, 4,6,8
-    add         r1, r1
-%if mmsize == 8
-    mov         r4, 16/mmsize
-.loop:
-%else
-    lea         r5, [r1*3]
-%endif
-    CHROMA_H_LOAD r5
-    LOAD_AB     m4, m5, r2d, r3d
-    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
-    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
-    CHROMA_H_STORE r5
-%if mmsize == 8
-    lea         r0, [r0+r1*(mmsize/4)]
-    dec         r4
-    jg .loop
-%endif
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_mbaff, 5,7,8
-    add         r1, r1
-    lea         r6, [r1*3]
-%if mmsize == 8
-    mov         r5, 16/mmsize
-.loop:
-%endif
-    CHROMA_H_LOAD r6
-    LOAD_AB     m4, m5, r2d, r3d
-    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
-    movd      m6, [r4]
-    punpcklbw m6, m6
-    psraw m6, 8
-    punpcklwd m6, m6
-    pand m7, m6
-    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
-    CHROMA_H_STORE r6
-%if mmsize == 8
-    lea         r0, [r0+r1*(mmsize/4)]
-    add         r4, mmsize/4
-    dec         r5
-    jg .loop
-%endif
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_422_intra, 4,6,8
-    add         r1, r1
-    mov         r4, 64/mmsize
-%if mmsize == 16
-    lea         r5, [r1*3]
-%endif
-.loop:
-    CHROMA_H_LOAD r5
-    call        deblock_intra_body
-    CHROMA_H_STORE r5
-    lea         r0, [r0+r1*(mmsize/4)]
-    dec         r4
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_422, 5,7,8
-    add         r1, r1
-    mov         r5, 64/mmsize
-    lea         r6, [r1*3]
-.loop:
-    CHROMA_H_LOAD r6
-    LOAD_AB     m4, m5, r2m, r3d
-    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
-    pxor        m4, m4
-    movd        m6, [r4-1]
-    psraw       m6, 8
-    SPLATW      m6, m6
-    pmaxsw      m6, m4
-    pand        m7, m6
-    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
-    CHROMA_H_STORE r6
-    lea         r0, [r0+r1*(mmsize/4)]
-%if mmsize == 16
-    inc         r4
-%else
-    mov         r2, r5
-    and         r2, 1
-    add         r4, r2 ; increment once every 2 iterations
-%endif
-    dec         r5
-    jg .loop
-    RET
-%endmacro ; DEBLOCK_CHROMA
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DEBLOCK_CHROMA
-%endif
-INIT_XMM sse2
-DEBLOCK_CHROMA
-INIT_XMM avx
-DEBLOCK_CHROMA
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-%macro CHROMA_V_START 0
-    mov    t5, r0
-    sub    t5, r1
-    sub    t5, r1
-%if mmsize==8
-    mov   dword r0m, 2
-.loop:
-%endif
-%endmacro
-
-%macro CHROMA_H_START 0
-    sub    r0, 4
-    lea    t6, [r1*3]
-    mov    t5, r0
-    add    r0, t6
-%endmacro
-
-%macro CHROMA_V_LOOP 1
-%if mmsize==8
-    add   r0, 8
-    add   t5, 8
-%if %1
-    add   r4, 2
-%endif
-    dec   dword r0m
-    jg .loop
-%endif
-%endmacro
-
-%macro CHROMA_H_LOOP 1
-%if mmsize==8
-    lea   r0, [r0+r1*4]
-    lea   t5, [t5+r1*4]
-%if %1
-    add   r4, 2
-%endif
-    dec   dword r0m
-    jg .loop
-%endif
-%endmacro
-
-%define t5 r5
-%define t6 r6
-
-%macro DEBLOCK_CHROMA 0
-cglobal chroma_inter_body
-    LOAD_MASK  r2d, r3d
-    movd       m6, [r4] ; tc0
-    punpcklbw  m6, m6
-    punpcklbw  m6, m6
-    pand       m7, m6
-    DEBLOCK_P0_Q0
-    ret
-
-;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma, 5,6,8
-    CHROMA_V_START
-    mova  m0, [t5]
-    mova  m1, [t5+r1]
-    mova  m2, [r0]
-    mova  m3, [r0+r1]
-    call chroma_inter_body
-    mova  [t5+r1], m1
-    mova  [r0], m2
-    CHROMA_V_LOOP 1
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma, 5,7,8
-    CHROMA_H_START
-%if mmsize==8
-    mov   dword r0m, 2
-.loop:
-%endif
-    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
-    call chroma_inter_body
-    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
-    CHROMA_H_LOOP 1
-    RET
-%endmacro ; DEBLOCK_CHROMA
-
-INIT_XMM sse2
-DEBLOCK_CHROMA
-INIT_XMM avx
-DEBLOCK_CHROMA
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DEBLOCK_CHROMA
-%endif
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-%macro DEBLOCK_H_CHROMA_420_MBAFF 0
-cglobal deblock_h_chroma_mbaff, 5,7,8
-    CHROMA_H_START
-    TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
-    LOAD_MASK  r2d, r3d
-    movd       m6, [r4] ; tc0
-    punpcklbw  m6, m6
-    pand       m7, m6
-    DEBLOCK_P0_Q0
-    TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
-    RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_H_CHROMA_420_MBAFF
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DEBLOCK_H_CHROMA_420_MBAFF
-%endif
-
-%macro DEBLOCK_H_CHROMA_422 0
-cglobal deblock_h_chroma_422, 5,8,8
-%if ARCH_X86_64
-    %define cntr r7
-%else
-    %define cntr dword r0m
-%endif
-    CHROMA_H_START
-    mov  cntr, 32/mmsize
-.loop:
-    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
-    LOAD_MASK  r2d, r3d
-    movd       m6, [r4] ; tc0
-    punpcklbw  m6, m6
-%if mmsize == 16
-    punpcklbw  m6, m6
-    punpcklbw  m6, m6
-%else
-    pshufw     m6, m6, q0000
-%endif
-    pand       m7, m6
-    DEBLOCK_P0_Q0
-    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
-    lea   r0, [r0+r1*(mmsize/2)]
-    lea   t5, [t5+r1*(mmsize/2)]
-    add   r4, mmsize/8
-    dec   cntr
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX mmx2
-DEBLOCK_H_CHROMA_422
-INIT_XMM sse2
-DEBLOCK_H_CHROMA_422
-INIT_XMM avx
-DEBLOCK_H_CHROMA_422
-
-; in: %1=p0 %2=p1 %3=q1
-; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
-%macro CHROMA_INTRA_P0 3
-    pxor    m4, %1, %3
-    pand    m4, [pb_1] ; m4 = (p0^q1)&1
-    pavgb   %1, %3
-    psubusb %1, m4
-    pavgb   %1, %2     ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
-%endmacro
-
-%define t5 r4
-%define t6 r5
-
-%macro DEBLOCK_CHROMA_INTRA_BODY 0
-cglobal chroma_intra_body
-    LOAD_MASK r2d, r3d
-    mova   m5, m1
-    mova   m6, m2
-    CHROMA_INTRA_P0  m1, m0, m3
-    CHROMA_INTRA_P0  m2, m3, m0
-    psubb  m1, m5
-    psubb  m2, m6
-    pand   m1, m7
-    pand   m2, m7
-    paddb  m1, m5
-    paddb  m2, m6
-    ret
-%endmacro
-
-%macro DEBLOCK_CHROMA_INTRA 0
-;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra, 4,5,8
-    CHROMA_V_START
-    mova  m0, [t5]
-    mova  m1, [t5+r1]
-    mova  m2, [r0]
-    mova  m3, [r0+r1]
-    call chroma_intra_body
-    mova  [t5+r1], m1
-    mova  [r0], m2
-    CHROMA_V_LOOP 0
-    RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra, 4,6,8
-    CHROMA_H_START
-%if mmsize==8
-    mov   dword r0m, 2
-.loop:
-%endif
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(t5, r0, r1, t6)
-    call chroma_intra_body
-    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
-    CHROMA_H_LOOP 0
-    RET
-
-cglobal deblock_h_chroma_422_intra, 4,7,8
-    CHROMA_H_START
-    mov   r6d, 32/mmsize
-.loop:
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(t5, r0, r1, t6)
-    call chroma_intra_body
-    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
-    lea   r0, [r0+r1*(mmsize/2)]
-    lea   t5, [t5+r1*(mmsize/2)]
-    dec  r6d
-    jg .loop
-    RET
-%endmacro ; DEBLOCK_CHROMA_INTRA
-
-INIT_XMM sse2
-DEBLOCK_CHROMA_INTRA_BODY
-DEBLOCK_CHROMA_INTRA
-INIT_XMM avx
-DEBLOCK_CHROMA_INTRA_BODY
-DEBLOCK_CHROMA_INTRA
-INIT_MMX mmx2
-DEBLOCK_CHROMA_INTRA_BODY
-%if ARCH_X86_64 == 0
-DEBLOCK_CHROMA_INTRA
-%endif
-
-;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal deblock_h_chroma_intra_mbaff, 4,6,8
-    CHROMA_H_START
-    TRANSPOSE4x4W_LOAD  PASS8ROWS(t5, r0, r1, t6)
-    call chroma_intra_body
-    TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
-    RET
-%endif ; !HIGH_BIT_DEPTH
-
-
-
-;-----------------------------------------------------------------------------
-; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
-;                               uint8_t bs[2][4][4], int mvy_limit, int bframe )
-;-----------------------------------------------------------------------------
-
-%define scan8start (4+1*8)
-%define nnz r0+scan8start
-%define ref r1+scan8start
-%define mv  r2+scan8start*4
-%define bs0 r3
-%define bs1 r3+32
-
-%macro LOAD_BYTES_MMX 1
-    movd      m2, [%1+8*0-1]
-    movd      m0, [%1+8*0]
-    movd      m3, [%1+8*2-1]
-    movd      m1, [%1+8*2]
-    punpckldq m2, [%1+8*1-1]
-    punpckldq m0, [%1+8*1]
-    punpckldq m3, [%1+8*3-1]
-    punpckldq m1, [%1+8*3]
-%endmacro
-
-%macro DEBLOCK_STRENGTH_REFS_MMX 0
-    LOAD_BYTES_MMX ref
-    pxor      m2, m0
-    pxor      m3, m1
-    por       m2, [bs0+0]
-    por       m3, [bs0+8]
-    movq [bs0+0], m2
-    movq [bs0+8], m3
-
-    movd      m2, [ref-8*1]
-    movd      m3, [ref+8*1]
-    punpckldq m2, m0  ; row -1, row 0
-    punpckldq m3, m1  ; row  1, row 2
-    pxor      m0, m2
-    pxor      m1, m3
-    por       m0, [bs1+0]
-    por       m1, [bs1+8]
-    movq [bs1+0], m0
-    movq [bs1+8], m1
-%endmacro
-
-%macro DEBLOCK_STRENGTH_MVS_MMX 2
-    mova      m0, [mv-%2]
-    mova      m1, [mv-%2+8]
-    psubw     m0, [mv]
-    psubw     m1, [mv+8]
-    packsswb  m0, m1
-    ABSB      m0, m1
-    psubusb   m0, m7
-    packsswb  m0, m0
-    por       m0, [%1]
-    movd    [%1], m0
-%endmacro
-
-%macro DEBLOCK_STRENGTH_NNZ_MMX 1
-    por       m2, m0
-    por       m3, m1
-    mova      m4, [%1]
-    mova      m5, [%1+8]
-    pminub    m2, m6
-    pminub    m3, m6
-    pminub    m4, m6 ; mv ? 1 : 0
-    pminub    m5, m6
-    paddb     m2, m2 ; nnz ? 2 : 0
-    paddb     m3, m3
-    pmaxub    m2, m4
-    pmaxub    m3, m5
-%endmacro
-
-%macro LOAD_BYTES_XMM 1
-    movu      m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
-    movu      m1, [%1+12]
-    pslldq    m0, m2, 1
-    shufps    m2, m1, q3131 ; cur nnz, all rows
-    pslldq    m1, 1
-    shufps    m0, m1, q3131 ; left neighbors
-    pslldq    m1, m2, 4
-    movd      m3, [%1-8] ; could be palignr if nnz was aligned
-    por       m1, m3 ; top neighbors
-%endmacro
-
-INIT_MMX mmx2
-cglobal deblock_strength, 6,6
-    ; Prepare mv comparison register
-    shl      r4d, 8
-    add      r4d, 3 - (1<<8)
-    movd      m7, r4d
-    SPLATW    m7, m7
-    mova      m6, [pb_1]
-    pxor      m0, m0
-    mova [bs0+0], m0
-    mova [bs0+8], m0
-    mova [bs1+0], m0
-    mova [bs1+8], m0
-
-.lists:
-    DEBLOCK_STRENGTH_REFS_MMX
-    mov      r4d, 4
-.mvs:
-    DEBLOCK_STRENGTH_MVS_MMX bs0, 4
-    DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
-    add       r2, 4*8
-    add       r3, 4
-    dec      r4d
-    jg .mvs
-    add       r1, 40
-    add       r2, 4*8
-    sub       r3, 16
-    dec      r5d
-    jge .lists
-
-    ; Check nnz
-    LOAD_BYTES_MMX nnz
-    DEBLOCK_STRENGTH_NNZ_MMX bs0
-    ; Transpose column output
-    SBUTTERFLY bw, 2, 3, 4
-    SBUTTERFLY bw, 2, 3, 4
-    mova [bs0+0], m2
-    mova [bs0+8], m3
-    movd      m2, [nnz-8*1]
-    movd      m3, [nnz+8*1]
-    punpckldq m2, m0  ; row -1, row 0
-    punpckldq m3, m1  ; row  1, row 2
-    DEBLOCK_STRENGTH_NNZ_MMX bs1
-    mova [bs1+0], m2
-    mova [bs1+8], m3
-    RET
-
-%macro DEBLOCK_STRENGTH_XMM 0
-cglobal deblock_strength, 6,6,7
-    ; Prepare mv comparison register
-    shl      r4d, 8
-    add      r4d, 3 - (1<<8)
-    movd      m6, r4d
-    SPLATW    m6, m6
-    pxor      m4, m4 ; bs0
-    pxor      m5, m5 ; bs1
-
-.lists:
-    ; Check refs
-    LOAD_BYTES_XMM ref
-    pxor      m0, m2
-    pxor      m1, m2
-    por       m4, m0
-    por       m5, m1
-
-    ; Check mvs
-%if cpuflag(ssse3)
-    mova      m0, [mv+4*8*0]
-    mova      m1, [mv+4*8*1]
-    palignr   m3, m0, [mv+4*8*0-16], 12
-    palignr   m2, m1, [mv+4*8*1-16], 12
-    psubw     m0, m3
-    psubw     m1, m2
-    packsswb  m0, m1
-
-    mova      m2, [mv+4*8*2]
-    mova      m1, [mv+4*8*3]
-    palignr   m3, m2, [mv+4*8*2-16], 12
-    psubw     m2, m3
-    palignr   m3, m1, [mv+4*8*3-16], 12
-    psubw     m1, m3
-    packsswb  m2, m1
-%else
-    movu      m0, [mv-4+4*8*0]
-    movu      m1, [mv-4+4*8*1]
-    movu      m2, [mv-4+4*8*2]
-    movu      m3, [mv-4+4*8*3]
-    psubw     m0, [mv+4*8*0]
-    psubw     m1, [mv+4*8*1]
-    psubw     m2, [mv+4*8*2]
-    psubw     m3, [mv+4*8*3]
-    packsswb  m0, m1
-    packsswb  m2, m3
-%endif
-    ABSB      m0, m1
-    ABSB      m2, m3
-    psubusb   m0, m6
-    psubusb   m2, m6
-    packsswb  m0, m2
-    por       m4, m0
-
-    mova      m0, [mv+4*8*-1]
-    mova      m1, [mv+4*8* 0]
-    mova      m2, [mv+4*8* 1]
-    mova      m3, [mv+4*8* 2]
-    psubw     m0, m1
-    psubw     m1, m2
-    psubw     m2, m3
-    psubw     m3, [mv+4*8* 3]
-    packsswb  m0, m1
-    packsswb  m2, m3
-    ABSB      m0, m1
-    ABSB      m2, m3
-    psubusb   m0, m6
-    psubusb   m2, m6
-    packsswb  m0, m2
-    por       m5, m0
-    add       r1, 40
-    add       r2, 4*8*5
-    dec      r5d
-    jge .lists
-
-    ; Check nnz
-    LOAD_BYTES_XMM nnz
-    por       m0, m2
-    por       m1, m2
-    mova      m6, [pb_1]
-    pminub    m0, m6
-    pminub    m1, m6
-    pminub    m4, m6 ; mv ? 1 : 0
-    pminub    m5, m6
-    paddb     m0, m0 ; nnz ? 2 : 0
-    paddb     m1, m1
-    pmaxub    m4, m0
-    pmaxub    m5, m1
-%if cpuflag(ssse3)
-    pshufb    m4, [transpose_shuf]
-%else
-    movhlps   m3, m4
-    punpcklbw m4, m3
-    movhlps   m3, m4
-    punpcklbw m4, m3
-%endif
-    mova   [bs1], m5
-    mova   [bs0], m4
-    RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_STRENGTH_XMM
-INIT_XMM ssse3
-DEBLOCK_STRENGTH_XMM
-INIT_XMM avx
-DEBLOCK_STRENGTH_XMM
-
-%macro LOAD_BYTES_YMM 1
-    movu         m0, [%1-4]             ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
-    pshufb       m0, [load_bytes_shuf]  ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
-    mova         m2, [insert_top_shuf]
-    vpermq       m1, m0, q3131          ; FGHI KLMN PQRS UVWX x2
-    vpermd       m0, m2, m0             ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
-    vpbroadcastd m2, [%1-8]             ; ABCD ....
-    vpblendd     m0, m0, m2, 00010000b  ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
-%endmacro
-
-INIT_YMM avx2
-cglobal deblock_strength, 6,6,7
-    ; Prepare mv comparison register
-    shl      r4d, 8
-    add      r4d, 3 - (1<<8)
-    movd     xm6, r4d
-    vpbroadcastw m6, xm6
-    pxor      m5, m5 ; bs0,bs1
-
-.lists:
-    ; Check refs
-    LOAD_BYTES_YMM ref
-    pxor      m0, m1
-    por       m5, m0
-
-    ; Check mvs
-    movu     xm0, [mv-4+4*8*0]
-    vinserti128 m0, m0, [mv+4*8*-1], 1
-    vbroadcasti128  m2, [mv+4*8* 0]
-    vinserti128 m1, m2, [mv-4+4*8*1], 0
-    vbroadcasti128  m3, [mv+4*8* 1]
-    psubw     m0, m2
-    psubw     m1, m3
-
-    vinserti128 m2, m3, [mv-4+4*8*2], 0
-    vbroadcasti128  m4, [mv+4*8* 2]
-    vinserti128 m3, m4, [mv-4+4*8*3], 0
-    psubw     m2, m4
-    vbroadcasti128  m4, [mv+4*8* 3]
-    psubw     m3, m4
-    packsswb  m0, m1
-    packsswb  m2, m3
-    pabsb     m0, m0
-    pabsb     m2, m2
-    psubusb   m0, m6
-    psubusb   m2, m6
-    packsswb  m0, m2
-    por       m5, m0
-
-    add       r1, 40
-    add       r2, 4*8*5
-    dec      r5d
-    jge .lists
-
-    ; Check nnz
-    LOAD_BYTES_YMM nnz
-    por       m0, m1
-    mova      m6, [pb_1]
-    pminub    m0, m6
-    pminub    m5, m6 ; mv ? 1 : 0
-    paddb     m0, m0 ; nnz ? 2 : 0
-    pmaxub    m5, m0
-    vextracti128 [bs1], m5, 1
-    pshufb   xm5, [transpose_shuf]
-    mova   [bs0], xm5
-    RET
diff --git a/android/src/main/libenc/jni/libx264/common/x86/mc-a.asm b/android/src/main/libenc/jni/libx264/common/x86/mc-a.asm
deleted file mode 100755
index 608efcd..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/mc-a.asm
+++ /dev/null
@@ -1,2134 +0,0 @@
-;*****************************************************************************
-;* mc-a.asm: x86 motion compensation
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*          Dylan Yudaken <dyudaken@gmail.com>
-;*          Holger Lubitz <holger@lubitz.org>
-;*          Min Chen <chenm001.163.com>
-;*          Oskar Arvidsson <oskar@irock.se>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
-ch_shuf_adj: times 8 db 0
-             times 8 db 2
-             times 8 db 4
-             times 8 db 6
-sq_1: times 1 dq 1
-
-SECTION .text
-
-cextern pb_0
-cextern pw_1
-cextern pw_4
-cextern pw_8
-cextern pw_32
-cextern pw_64
-cextern pw_512
-cextern pw_00ff
-cextern pw_pixel_max
-cextern sw_64
-cextern pd_32
-cextern deinterleave_shufd
-
-;=============================================================================
-; implicit weighted biprediction
-;=============================================================================
-; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%if WIN64
-    DECLARE_REG_TMP 0,1,2,3,4,5,4,5
-    %macro AVG_START 0-1 0
-        PROLOGUE 6,7,%1
-    %endmacro
-%elif UNIX64
-    DECLARE_REG_TMP 0,1,2,3,4,5,7,8
-    %macro AVG_START 0-1 0
-        PROLOGUE 6,9,%1
-    %endmacro
-%else
-    DECLARE_REG_TMP 1,2,3,4,5,6,1,2
-    %macro AVG_START 0-1 0
-        PROLOGUE 0,7,%1
-        mov t0, r0m
-        mov t1, r1m
-        mov t2, r2m
-        mov t3, r3m
-        mov t4, r4m
-        mov t5, r5m
-    %endmacro
-%endif
-
-%macro AVG_END 0
-    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
-    lea  t2, [t2+t3*2*SIZEOF_PIXEL]
-    lea  t0, [t0+t1*2*SIZEOF_PIXEL]
-    sub eax, 2
-    jg .height_loop
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-
-%macro BIWEIGHT_MMX 2
-    movh      m0, %1
-    movh      m1, %2
-    punpcklwd m0, m1
-    pmaddwd   m0, m3
-    paddd     m0, m4
-    psrad     m0, 6
-%endmacro
-
-%macro BIWEIGHT_START_MMX 0
-    movzx  t6d, word r6m
-    mov    t7d, 64
-    sub    t7d, t6d
-    shl    t7d, 16
-    add    t6d, t7d
-    movd    m3, t6d
-    SPLATD  m3, m3
-    mova    m4, [pd_32]
-    pxor    m5, m5
-%endmacro
-
-%else ;!HIGH_BIT_DEPTH
-%macro BIWEIGHT_MMX 2
-    movh      m0, %1
-    movh      m1, %2
-    punpcklbw m0, m5
-    punpcklbw m1, m5
-    pmullw    m0, m2
-    pmullw    m1, m3
-    paddw     m0, m1
-    paddw     m0, m4
-    psraw     m0, 6
-%endmacro
-
-%macro BIWEIGHT_START_MMX 0
-    movd    m2, r6m
-    SPLATW  m2, m2   ; weight_dst
-    mova    m3, [pw_64]
-    psubw   m3, m2   ; weight_src
-    mova    m4, [pw_32] ; rounding
-    pxor    m5, m5
-%endmacro
-%endif ;HIGH_BIT_DEPTH
-
-%macro BIWEIGHT_SSSE3 2
-    movh      m0, %1
-    movh      m1, %2
-    punpcklbw m0, m1
-    pmaddubsw m0, m3
-    pmulhrsw  m0, m4
-%endmacro
-
-%macro BIWEIGHT_START_SSSE3 0
-    movzx  t6d, byte r6m ; FIXME x86_64
-    mov    t7d, 64
-    sub    t7d, t6d
-    shl    t7d, 8
-    add    t6d, t7d
-    mova    m4, [pw_512]
-    movd   xm3, t6d
-%if cpuflag(avx2)
-    vpbroadcastw m3, xm3
-%else
-    SPLATW  m3, m3   ; weight_dst,src
-%endif
-%endmacro
-
-%if HIGH_BIT_DEPTH
-%macro BIWEIGHT_ROW 4
-    BIWEIGHT   [%2], [%3]
-%if %4==mmsize/4
-    packssdw     m0, m0
-    CLIPW        m0, m5, m7
-    movh       [%1], m0
-%else
-    SWAP 0, 6
-    BIWEIGHT   [%2+mmsize/2], [%3+mmsize/2]
-    packssdw     m6, m0
-    CLIPW        m6, m5, m7
-    mova       [%1], m6
-%endif
-%endmacro
-
-%else ;!HIGH_BIT_DEPTH
-%macro BIWEIGHT_ROW 4
-    BIWEIGHT [%2], [%3]
-%if %4==mmsize/2
-    packuswb   m0, m0
-    movh     [%1], m0
-%else
-    SWAP 0, 6
-    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
-    packuswb   m6, m0
-    mova     [%1], m6
-%endif
-%endmacro
-
-%endif ;HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
-;-----------------------------------------------------------------------------
-%macro AVG_WEIGHT 1-2 0
-cglobal pixel_avg_weight_w%1
-    BIWEIGHT_START
-    AVG_START %2
-%if HIGH_BIT_DEPTH
-    mova    m7, [pw_pixel_max]
-%endif
-.height_loop:
-%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
-    BIWEIGHT [t2], [t4]
-    SWAP 0, 6
-    BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
-%if HIGH_BIT_DEPTH
-    packssdw m6, m0
-    CLIPW    m6, m5, m7
-%else ;!HIGH_BIT_DEPTH
-    packuswb m6, m0
-%endif ;HIGH_BIT_DEPTH
-    movlps   [t0], m6
-    movhps   [t0+SIZEOF_PIXEL*t1], m6
-%else
-%assign x 0
-%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
-    BIWEIGHT_ROW   t0+x,                   t2+x,                   t4+x,                 %1
-    BIWEIGHT_ROW   t0+x+SIZEOF_PIXEL*t1,   t2+x+SIZEOF_PIXEL*t3,   t4+x+SIZEOF_PIXEL*t5, %1
-%assign x x+mmsize
-%endrep
-%endif
-    AVG_END
-%endmacro
-
-%define BIWEIGHT BIWEIGHT_MMX
-%define BIWEIGHT_START BIWEIGHT_START_MMX
-INIT_MMX mmx2
-AVG_WEIGHT 4
-AVG_WEIGHT 8
-AVG_WEIGHT 16
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-AVG_WEIGHT 4,  8
-AVG_WEIGHT 8,  8
-AVG_WEIGHT 16, 8
-%else ;!HIGH_BIT_DEPTH
-INIT_XMM sse2
-AVG_WEIGHT 8,  7
-AVG_WEIGHT 16, 7
-%define BIWEIGHT BIWEIGHT_SSSE3
-%define BIWEIGHT_START BIWEIGHT_START_SSSE3
-INIT_MMX ssse3
-AVG_WEIGHT 4
-INIT_XMM ssse3
-AVG_WEIGHT 8,  7
-AVG_WEIGHT 16, 7
-
-INIT_YMM avx2
-cglobal pixel_avg_weight_w16
-    BIWEIGHT_START
-    AVG_START 5
-.height_loop:
-    movu     xm0, [t2]
-    movu     xm1, [t4]
-    vinserti128 m0, m0, [t2+t3], 1
-    vinserti128 m1, m1, [t4+t5], 1
-    SBUTTERFLY bw, 0, 1, 2
-    pmaddubsw m0, m3
-    pmaddubsw m1, m3
-    pmulhrsw  m0, m4
-    pmulhrsw  m1, m4
-    packuswb  m0, m1
-    mova    [t0], xm0
-    vextracti128 [t0+t1], m0, 1
-    AVG_END
-%endif ;HIGH_BIT_DEPTH
-
-;=============================================================================
-; P frame explicit weighted prediction
-;=============================================================================
-
-%if HIGH_BIT_DEPTH
-; width
-%macro WEIGHT_START 1
-    mova        m0, [r4+ 0]         ; 1<<denom
-    mova        m3, [r4+16]
-    movd        m2, [r4+32]         ; denom
-    mova        m4, [pw_pixel_max]
-    paddw       m2, [sq_1]          ; denom+1
-%endmacro
-
-; src1, src2
-%macro WEIGHT 2
-    movh        m5, [%1]
-    movh        m6, [%2]
-    punpcklwd   m5, m0
-    punpcklwd   m6, m0
-    pmaddwd     m5, m3
-    pmaddwd     m6, m3
-    psrad       m5, m2
-    psrad       m6, m2
-    packssdw    m5, m6
-%endmacro
-
-; src, dst, width
-%macro WEIGHT_TWO_ROW 4
-    %assign x 0
-%rep (%3+mmsize/2-1)/(mmsize/2)
-%if %3-x/2 <= 4 && mmsize == 16
-    WEIGHT      %1+x, %1+r3+x
-    CLIPW         m5, [pb_0], m4
-    movh      [%2+x], m5
-    movhps [%2+r1+x], m5
-%else
-    WEIGHT      %1+x, %1+x+mmsize/2
-    SWAP           5,  7
-    WEIGHT   %1+r3+x, %1+r3+x+mmsize/2
-    CLIPW         m5, [pb_0], m4
-    CLIPW         m7, [pb_0], m4
-    mova      [%2+x], m7
-    mova   [%2+r1+x], m5
-%endif
-    %assign x x+mmsize
-%endrep
-%endmacro
-
-%else ; !HIGH_BIT_DEPTH
-
-%macro WEIGHT_START 1
-%if cpuflag(avx2)
-    vbroadcasti128 m3, [r4]
-    vbroadcasti128 m4, [r4+16]
-%else
-    mova     m3, [r4]
-    mova     m4, [r4+16]
-%if notcpuflag(ssse3)
-    movd     m5, [r4+32]
-%endif
-%endif
-    pxor     m2, m2
-%endmacro
-
-; src1, src2, dst1, dst2, fast
-%macro WEIGHT_ROWx2 5
-    movh      m0, [%1         ]
-    movh      m1, [%1+mmsize/2]
-    movh      m6, [%2         ]
-    movh      m7, [%2+mmsize/2]
-    punpcklbw m0, m2
-    punpcklbw m1, m2
-    punpcklbw m6, m2
-    punpcklbw m7, m2
-%if cpuflag(ssse3)
-%if %5==0
-    psllw     m0, 7
-    psllw     m1, 7
-    psllw     m6, 7
-    psllw     m7, 7
-%endif
-    pmulhrsw  m0, m3
-    pmulhrsw  m1, m3
-    pmulhrsw  m6, m3
-    pmulhrsw  m7, m3
-    paddw     m0, m4
-    paddw     m1, m4
-    paddw     m6, m4
-    paddw     m7, m4
-%else
-    pmullw    m0, m3
-    pmullw    m1, m3
-    pmullw    m6, m3
-    pmullw    m7, m3
-    paddsw    m0, m4        ;1<<(denom-1)+(offset<<denom)
-    paddsw    m1, m4
-    paddsw    m6, m4
-    paddsw    m7, m4
-    psraw     m0, m5
-    psraw     m1, m5
-    psraw     m6, m5
-    psraw     m7, m5
-%endif
-    packuswb  m0, m1
-    packuswb  m6, m7
-    mova    [%3], m0
-    mova    [%4], m6
-%endmacro
-
-; src1, src2, dst1, dst2, width, fast
-%macro WEIGHT_COL 6
-%if cpuflag(avx2)
-%if %5==16
-    movu     xm0, [%1]
-    vinserti128 m0, m0, [%2], 1
-    punpckhbw m1, m0, m2
-    punpcklbw m0, m0, m2
-%if %6==0
-    psllw     m0, 7
-    psllw     m1, 7
-%endif
-    pmulhrsw  m0, m3
-    pmulhrsw  m1, m3
-    paddw     m0, m4
-    paddw     m1, m4
-    packuswb  m0, m1
-    mova    [%3], xm0
-    vextracti128 [%4], m0, 1
-%else
-    movq     xm0, [%1]
-    vinserti128 m0, m0, [%2], 1
-    punpcklbw m0, m2
-%if %6==0
-    psllw     m0, 7
-%endif
-    pmulhrsw  m0, m3
-    paddw     m0, m4
-    packuswb  m0, m0
-    vextracti128 xm1, m0, 1
-%if %5 == 8
-    movq    [%3], xm0
-    movq    [%4], xm1
-%else
-    movd    [%3], xm0
-    movd    [%4], xm1
-%endif
-%endif
-%else
-    movh      m0, [%1]
-    movh      m1, [%2]
-    punpcklbw m0, m2
-    punpcklbw m1, m2
-%if cpuflag(ssse3)
-%if %6==0
-    psllw     m0, 7
-    psllw     m1, 7
-%endif
-    pmulhrsw  m0, m3
-    pmulhrsw  m1, m3
-    paddw     m0, m4
-    paddw     m1, m4
-%else
-    pmullw    m0, m3
-    pmullw    m1, m3
-    paddsw    m0, m4        ;1<<(denom-1)+(offset<<denom)
-    paddsw    m1, m4
-    psraw     m0, m5
-    psraw     m1, m5
-%endif
-%if %5 == 8
-    packuswb  m0, m1
-    movh    [%3], m0
-    movhps  [%4], m0
-%else
-    packuswb  m0, m0
-    packuswb  m1, m1
-    movd    [%3], m0    ; width 2 can write garbage for the last 2 bytes
-    movd    [%4], m1
-%endif
-%endif
-%endmacro
-; src, dst, width
-%macro WEIGHT_TWO_ROW 4
-%assign x 0
-%rep %3
-%if (%3-x) >= mmsize
-    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
-    %assign x (x+mmsize)
-%else
-    %assign w %3-x
-%if w == 20
-    %assign w 16
-%endif
-    WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
-    %assign x (x+w)
-%endif
-%if x >= %3
-    %exitrep
-%endif
-%endrep
-%endmacro
-
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
-;-----------------------------------------------------------------------------
-
-%macro WEIGHTER 1
-cglobal mc_weight_w%1, 6,6,8
-    FIX_STRIDES r1, r3
-    WEIGHT_START %1
-%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
-    ; we can merge the shift step into the scale factor
-    ; if (m3<<7) doesn't overflow an int16_t
-    cmp byte [r4+1], 0
-    jz .fast
-%endif
-.loop:
-    WEIGHT_TWO_ROW r2, r0, %1, 0
-    lea  r0, [r0+r1*2]
-    lea  r2, [r2+r3*2]
-    sub r5d, 2
-    jg .loop
-    RET
-%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
-.fast:
-    psllw m3, 7
-.fastloop:
-    WEIGHT_TWO_ROW r2, r0, %1, 1
-    lea  r0, [r0+r1*2]
-    lea  r2, [r2+r3*2]
-    sub r5d, 2
-    jg .fastloop
-    RET
-%endif
-%endmacro
-
-INIT_MMX mmx2
-WEIGHTER  4
-WEIGHTER  8
-WEIGHTER 12
-WEIGHTER 16
-WEIGHTER 20
-INIT_XMM sse2
-WEIGHTER  8
-WEIGHTER 16
-WEIGHTER 20
-%if HIGH_BIT_DEPTH
-WEIGHTER 12
-%else
-INIT_MMX ssse3
-WEIGHTER  4
-INIT_XMM ssse3
-WEIGHTER  8
-WEIGHTER 16
-WEIGHTER 20
-INIT_YMM avx2
-WEIGHTER 8
-WEIGHTER 16
-WEIGHTER 20
-%endif
-
-%macro OFFSET_OP 7
-    mov%6        m0, [%1]
-    mov%6        m1, [%2]
-%if HIGH_BIT_DEPTH
-    p%5usw       m0, m2
-    p%5usw       m1, m2
-%ifidn %5,add
-    pminsw       m0, m3
-    pminsw       m1, m3
-%endif
-%else
-    p%5usb       m0, m2
-    p%5usb       m1, m2
-%endif
-    mov%7      [%3], m0
-    mov%7      [%4], m1
-%endmacro
-
-%macro OFFSET_TWO_ROW 4
-%assign x 0
-%rep %3
-%if (%3*SIZEOF_PIXEL-x) >= mmsize
-    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
-    %assign x (x+mmsize)
-%else
-%if HIGH_BIT_DEPTH
-    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
-%else
-    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
-%endif
-    %exitrep
-%endif
-%if x >= %3*SIZEOF_PIXEL
-    %exitrep
-%endif
-%endrep
-%endmacro
-
-;-----------------------------------------------------------------------------
-;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
-;-----------------------------------------------------------------------------
-%macro OFFSET 2
-cglobal mc_offset%2_w%1, 6,6
-    FIX_STRIDES r1, r3
-    mova m2, [r4]
-%if HIGH_BIT_DEPTH
-%ifidn %2,add
-    mova m3, [pw_pixel_max]
-%endif
-%endif
-.loop:
-    OFFSET_TWO_ROW r2, r0, %1, %2
-    lea  r0, [r0+r1*2]
-    lea  r2, [r2+r3*2]
-    sub r5d, 2
-    jg .loop
-    RET
-%endmacro
-
-%macro OFFSETPN 1
-       OFFSET %1, add
-       OFFSET %1, sub
-%endmacro
-INIT_MMX mmx2
-OFFSETPN  4
-OFFSETPN  8
-OFFSETPN 12
-OFFSETPN 16
-OFFSETPN 20
-INIT_XMM sse2
-OFFSETPN 12
-OFFSETPN 16
-OFFSETPN 20
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-OFFSETPN  8
-%endif
-
-
-;=============================================================================
-; pixel avg
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
-;                     pixel *src2, intptr_t src2_stride, int weight );
-;-----------------------------------------------------------------------------
-%macro AVGH 2
-cglobal pixel_avg_%1x%2
-    mov eax, %2
-    cmp dword r6m, 32
-    jne pixel_avg_weight_w%1 %+ SUFFIX
-%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
-    jmp pixel_avg_w%1_avx2
-%else
-%if mmsize == 16 && %1 == 16
-    test dword r4m, 15
-    jz pixel_avg_w%1_sse2
-%endif
-    jmp pixel_avg_w%1_mmx2
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
-;                    pixel *src2, intptr_t src2_stride, int height, int weight );
-;-----------------------------------------------------------------------------
-
-%macro AVG_FUNC 3
-cglobal pixel_avg_w%1
-    AVG_START
-.height_loop:
-%assign x 0
-%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
-    %2     m0, [t2+x]
-    %2     m1, [t2+x+SIZEOF_PIXEL*t3]
-%if HIGH_BIT_DEPTH
-    pavgw  m0, [t4+x]
-    pavgw  m1, [t4+x+SIZEOF_PIXEL*t5]
-%else ;!HIGH_BIT_DEPTH
-    pavgb  m0, [t4+x]
-    pavgb  m1, [t4+x+SIZEOF_PIXEL*t5]
-%endif
-    %3     [t0+x], m0
-    %3     [t0+x+SIZEOF_PIXEL*t1], m1
-%assign x x+mmsize
-%endrep
-    AVG_END
-%endmacro
-
-%if HIGH_BIT_DEPTH
-
-INIT_MMX mmx2
-AVG_FUNC 4, movq, movq
-AVGH 4, 16
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
-
-AVG_FUNC 8, movq, movq
-AVGH 8, 16
-AVGH 8,  8
-AVGH 8,  4
-
-AVG_FUNC 16, movq, movq
-AVGH 16, 16
-AVGH 16,  8
-
-INIT_XMM sse2
-AVG_FUNC 4, movq, movq
-AVGH  4, 16
-AVGH  4, 8
-AVGH  4, 4
-AVGH  4, 2
-
-AVG_FUNC 8, movdqu, movdqa
-AVGH  8, 16
-AVGH  8,  8
-AVGH  8,  4
-
-AVG_FUNC 16, movdqu, movdqa
-AVGH  16, 16
-AVGH  16,  8
-
-%else ;!HIGH_BIT_DEPTH
-
-INIT_MMX mmx2
-AVG_FUNC 4, movd, movd
-AVGH 4, 16
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
-
-AVG_FUNC 8, movq, movq
-AVGH 8, 16
-AVGH 8,  8
-AVGH 8,  4
-
-AVG_FUNC 16, movq, movq
-AVGH 16, 16
-AVGH 16, 8
-
-INIT_XMM sse2
-AVG_FUNC 16, movdqu, movdqa
-AVGH 16, 16
-AVGH 16,  8
-AVGH  8, 16
-AVGH  8,  8
-AVGH  8,  4
-INIT_XMM ssse3
-AVGH 16, 16
-AVGH 16,  8
-AVGH  8, 16
-AVGH  8,  8
-AVGH  8,  4
-INIT_MMX ssse3
-AVGH  4, 16
-AVGH  4,  8
-AVGH  4,  4
-AVGH  4,  2
-INIT_XMM avx2
-AVG_FUNC 16, movdqu, movdqa
-AVGH 16, 16
-AVGH 16,  8
-
-%endif ;HIGH_BIT_DEPTH
-
-
-
-;=============================================================================
-; pixel avg2
-;=============================================================================
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void pixel_avg2_wN( uint16_t *dst,  intptr_t dst_stride,
-;                     uint16_t *src1, intptr_t src_stride,
-;                     uint16_t *src2, int height );
-;-----------------------------------------------------------------------------
-%macro AVG2_W_ONE 1
-cglobal pixel_avg2_w%1, 6,7,4
-    sub     r4, r2
-    lea     r6, [r4+r3*2]
-.height_loop:
-    movu    m0, [r2]
-    movu    m1, [r2+r3*2]
-%if cpuflag(avx) || mmsize == 8
-    pavgw   m0, [r2+r4]
-    pavgw   m1, [r2+r6]
-%else
-    movu    m2, [r2+r4]
-    movu    m3, [r2+r6]
-    pavgw   m0, m2
-    pavgw   m1, m3
-%endif
-    mova   [r0], m0
-    mova   [r0+r1*2], m1
-    lea     r2, [r2+r3*4]
-    lea     r0, [r0+r1*4]
-    sub    r5d, 2
-    jg .height_loop
-    RET
-%endmacro
-
-%macro AVG2_W_TWO 3
-cglobal pixel_avg2_w%1, 6,7,8
-    sub     r4, r2
-    lea     r6, [r4+r3*2]
-.height_loop:
-    movu    m0, [r2]
-    %2      m1, [r2+mmsize]
-    movu    m2, [r2+r3*2]
-    %2      m3, [r2+r3*2+mmsize]
-%if mmsize == 8
-    pavgw   m0, [r2+r4]
-    pavgw   m1, [r2+r4+mmsize]
-    pavgw   m2, [r2+r6]
-    pavgw   m3, [r2+r6+mmsize]
-%else
-    movu    m4, [r2+r4]
-    %2      m5, [r2+r4+mmsize]
-    movu    m6, [r2+r6]
-    %2      m7, [r2+r6+mmsize]
-    pavgw   m0, m4
-    pavgw   m1, m5
-    pavgw   m2, m6
-    pavgw   m3, m7
-%endif
-    mova   [r0], m0
-    %3     [r0+mmsize], m1
-    mova   [r0+r1*2], m2
-    %3     [r0+r1*2+mmsize], m3
-    lea     r2, [r2+r3*4]
-    lea     r0, [r0+r1*4]
-    sub    r5d, 2
-    jg .height_loop
-    RET
-%endmacro
-
-INIT_MMX mmx2
-AVG2_W_ONE  4
-AVG2_W_TWO  8, movu, mova
-INIT_XMM sse2
-AVG2_W_ONE  8
-AVG2_W_TWO 10, movd, movd
-AVG2_W_TWO 16, movu, mova
-INIT_YMM avx2
-AVG2_W_ONE 16
-
-INIT_MMX
-cglobal pixel_avg2_w10_mmx2, 6,7
-    sub     r4, r2
-    lea     r6, [r4+r3*2]
-.height_loop:
-    movu    m0, [r2+ 0]
-    movu    m1, [r2+ 8]
-    movh    m2, [r2+16]
-    movu    m3, [r2+r3*2+ 0]
-    movu    m4, [r2+r3*2+ 8]
-    movh    m5, [r2+r3*2+16]
-    pavgw   m0, [r2+r4+ 0]
-    pavgw   m1, [r2+r4+ 8]
-    pavgw   m2, [r2+r4+16]
-    pavgw   m3, [r2+r6+ 0]
-    pavgw   m4, [r2+r6+ 8]
-    pavgw   m5, [r2+r6+16]
-    mova   [r0+ 0], m0
-    mova   [r0+ 8], m1
-    movh   [r0+16], m2
-    mova   [r0+r1*2+ 0], m3
-    mova   [r0+r1*2+ 8], m4
-    movh   [r0+r1*2+16], m5
-    lea     r2, [r2+r3*2*2]
-    lea     r0, [r0+r1*2*2]
-    sub    r5d, 2
-    jg .height_loop
-    RET
-
-cglobal pixel_avg2_w16_mmx2, 6,7
-    sub     r4, r2
-    lea     r6, [r4+r3*2]
-.height_loop:
-    movu    m0, [r2+ 0]
-    movu    m1, [r2+ 8]
-    movu    m2, [r2+16]
-    movu    m3, [r2+24]
-    movu    m4, [r2+r3*2+ 0]
-    movu    m5, [r2+r3*2+ 8]
-    movu    m6, [r2+r3*2+16]
-    movu    m7, [r2+r3*2+24]
-    pavgw   m0, [r2+r4+ 0]
-    pavgw   m1, [r2+r4+ 8]
-    pavgw   m2, [r2+r4+16]
-    pavgw   m3, [r2+r4+24]
-    pavgw   m4, [r2+r6+ 0]
-    pavgw   m5, [r2+r6+ 8]
-    pavgw   m6, [r2+r6+16]
-    pavgw   m7, [r2+r6+24]
-    mova   [r0+ 0], m0
-    mova   [r0+ 8], m1
-    mova   [r0+16], m2
-    mova   [r0+24], m3
-    mova   [r0+r1*2+ 0], m4
-    mova   [r0+r1*2+ 8], m5
-    mova   [r0+r1*2+16], m6
-    mova   [r0+r1*2+24], m7
-    lea     r2, [r2+r3*2*2]
-    lea     r0, [r0+r1*2*2]
-    sub    r5d, 2
-    jg .height_loop
-    RET
-
-cglobal pixel_avg2_w18_mmx2, 6,7
-    sub     r4, r2
-.height_loop:
-    movu    m0, [r2+ 0]
-    movu    m1, [r2+ 8]
-    movu    m2, [r2+16]
-    movu    m3, [r2+24]
-    movh    m4, [r2+32]
-    pavgw   m0, [r2+r4+ 0]
-    pavgw   m1, [r2+r4+ 8]
-    pavgw   m2, [r2+r4+16]
-    pavgw   m3, [r2+r4+24]
-    pavgw   m4, [r2+r4+32]
-    mova   [r0+ 0], m0
-    mova   [r0+ 8], m1
-    mova   [r0+16], m2
-    mova   [r0+24], m3
-    movh   [r0+32], m4
-    lea     r2, [r2+r3*2]
-    lea     r0, [r0+r1*2]
-    dec    r5d
-    jg .height_loop
-    RET
-
-%macro PIXEL_AVG_W18 0
-cglobal pixel_avg2_w18, 6,7
-    sub     r4, r2
-.height_loop:
-    movu    m0, [r2+ 0]
-    movd   xm2, [r2+32]
-%if mmsize == 32
-    pavgw   m0, [r2+r4+ 0]
-    movd   xm1, [r2+r4+32]
-    pavgw  xm2, xm1
-%else
-    movu    m1, [r2+16]
-    movu    m3, [r2+r4+ 0]
-    movu    m4, [r2+r4+16]
-    movd    m5, [r2+r4+32]
-    pavgw   m0, m3
-    pavgw   m1, m4
-    pavgw   m2, m5
-    mova   [r0+16], m1
-%endif
-    mova   [r0+ 0], m0
-    movd   [r0+32], xm2
-    lea     r2, [r2+r3*2]
-    lea     r0, [r0+r1*2]
-    dec    r5d
-    jg .height_loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-PIXEL_AVG_W18
-INIT_YMM avx2
-PIXEL_AVG_W18
-
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-;-----------------------------------------------------------------------------
-; void pixel_avg2_w4( uint8_t *dst,  intptr_t dst_stride,
-;                     uint8_t *src1, intptr_t src_stride,
-;                     uint8_t *src2, int height );
-;-----------------------------------------------------------------------------
-%macro AVG2_W8 2
-cglobal pixel_avg2_w%1_mmx2, 6,7
-    sub    r4, r2
-    lea    r6, [r4+r3]
-.height_loop:
-    %2     mm0, [r2]
-    %2     mm1, [r2+r3]
-    pavgb  mm0, [r2+r4]
-    pavgb  mm1, [r2+r6]
-    lea    r2, [r2+r3*2]
-    %2     [r0], mm0
-    %2     [r0+r1], mm1
-    lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
-    RET
-%endmacro
-
-INIT_MMX
-AVG2_W8 4, movd
-AVG2_W8 8, movq
-
-%macro AVG2_W16 2
-cglobal pixel_avg2_w%1_mmx2, 6,7
-    sub    r2, r4
-    lea    r6, [r2+r3]
-.height_loop:
-    movq   mm0, [r4]
-    %2     mm1, [r4+8]
-    movq   mm2, [r4+r3]
-    %2     mm3, [r4+r3+8]
-    pavgb  mm0, [r4+r2]
-    pavgb  mm1, [r4+r2+8]
-    pavgb  mm2, [r4+r6]
-    pavgb  mm3, [r4+r6+8]
-    lea    r4, [r4+r3*2]
-    movq   [r0], mm0
-    %2     [r0+8], mm1
-    movq   [r0+r1], mm2
-    %2     [r0+r1+8], mm3
-    lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
-    RET
-%endmacro
-
-AVG2_W16 12, movd
-AVG2_W16 16, movq
-
-cglobal pixel_avg2_w20_mmx2, 6,7
-    sub    r2, r4
-    lea    r6, [r2+r3]
-.height_loop:
-    movq   mm0, [r4]
-    movq   mm1, [r4+8]
-    movd   mm2, [r4+16]
-    movq   mm3, [r4+r3]
-    movq   mm4, [r4+r3+8]
-    movd   mm5, [r4+r3+16]
-    pavgb  mm0, [r4+r2]
-    pavgb  mm1, [r4+r2+8]
-    pavgb  mm2, [r4+r2+16]
-    pavgb  mm3, [r4+r6]
-    pavgb  mm4, [r4+r6+8]
-    pavgb  mm5, [r4+r6+16]
-    lea    r4, [r4+r3*2]
-    movq   [r0], mm0
-    movq   [r0+8], mm1
-    movd   [r0+16], mm2
-    movq   [r0+r1], mm3
-    movq   [r0+r1+8], mm4
-    movd   [r0+r1+16], mm5
-    lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
-    RET
-
-INIT_XMM
-cglobal pixel_avg2_w16_sse2, 6,7
-    sub    r4, r2
-    lea    r6, [r4+r3]
-.height_loop:
-    movu   m0, [r2]
-    movu   m2, [r2+r3]
-    movu   m1, [r2+r4]
-    movu   m3, [r2+r6]
-    lea    r2, [r2+r3*2]
-    pavgb  m0, m1
-    pavgb  m2, m3
-    mova [r0], m0
-    mova [r0+r1], m2
-    lea    r0, [r0+r1*2]
-    sub   r5d, 2
-    jg .height_loop
-    RET
-
-cglobal pixel_avg2_w20_sse2, 6,7
-    sub    r2, r4
-    lea    r6, [r2+r3]
-.height_loop:
-    movu   m0, [r4]
-    movu   m2, [r4+r3]
-    movu   m1, [r4+r2]
-    movu   m3, [r4+r6]
-    movd  mm4, [r4+16]
-    movd  mm5, [r4+r3+16]
-    pavgb  m0, m1
-    pavgb  m2, m3
-    pavgb mm4, [r4+r2+16]
-    pavgb mm5, [r4+r6+16]
-    lea    r4, [r4+r3*2]
-    mova [r0], m0
-    mova [r0+r1], m2
-    movd [r0+16], mm4
-    movd [r0+r1+16], mm5
-    lea    r0, [r0+r1*2]
-    sub   r5d, 2
-    jg .height_loop
-    RET
-
-INIT_YMM avx2
-cglobal pixel_avg2_w20, 6,7
-    sub    r2, r4
-    lea    r6, [r2+r3]
-.height_loop:
-    movu   m0, [r4]
-    movu   m1, [r4+r3]
-    pavgb  m0, [r4+r2]
-    pavgb  m1, [r4+r6]
-    lea    r4, [r4+r3*2]
-    mova [r0], m0
-    mova [r0+r1], m1
-    lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
-    RET
-
-; Cacheline split code for processors with high latencies for loads
-; split over cache lines.  See sad-a.asm for a more detailed explanation.
-; This particular instance is complicated by the fact that src1 and src2
-; can have different alignments.  For simplicity and code size, only the
-; MMX cacheline workaround is used.  As a result, in the case of SSE2
-; pixel_avg, the cacheline check functions calls the SSE2 version if there
-; is no cacheline split, and the MMX workaround if there is.
-
-%macro INIT_SHIFT 2
-    and    eax, 7
-    shl    eax, 3
-    movd   %1, [sw_64]
-    movd   %2, eax
-    psubw  %1, %2
-%endmacro
-
-%macro AVG_CACHELINE_START 0
-    %assign stack_offset 0
-    INIT_SHIFT mm6, mm7
-    mov    eax, r4m
-    INIT_SHIFT mm4, mm5
-    PROLOGUE 6,6
-    and    r2, ~7
-    and    r4, ~7
-    sub    r4, r2
-.height_loop:
-%endmacro
-
-%macro AVG_CACHELINE_LOOP 2
-    movq   mm1, [r2+%1]
-    movq   mm0, [r2+8+%1]
-    movq   mm3, [r2+r4+%1]
-    movq   mm2, [r2+r4+8+%1]
-    psrlq  mm1, mm7
-    psllq  mm0, mm6
-    psrlq  mm3, mm5
-    psllq  mm2, mm4
-    por    mm0, mm1
-    por    mm2, mm3
-    pavgb  mm2, mm0
-    %2 [r0+%1], mm2
-%endmacro
-
-%macro AVG_CACHELINE_FUNC 2
-pixel_avg2_w%1_cache_mmx2:
-    AVG_CACHELINE_START
-    AVG_CACHELINE_LOOP 0, movq
-%if %1>8
-    AVG_CACHELINE_LOOP 8, movq
-%if %1>16
-    AVG_CACHELINE_LOOP 16, movd
-%endif
-%endif
-    add    r2, r3
-    add    r0, r1
-    dec    r5d
-    jg .height_loop
-    RET
-%endmacro
-
-%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
-%if %1 == 12
-;w12 isn't needed because w16 is just as fast if there's no cacheline split
-%define cachesplit pixel_avg2_w16_cache_mmx2
-%else
-%define cachesplit pixel_avg2_w%1_cache_mmx2
-%endif
-cglobal pixel_avg2_w%1_cache%2_%3
-    mov    eax, r2m
-    and    eax, %2-1
-    cmp    eax, (%2-%1-(%1 % 8))
-%if %1==12||%1==20
-    jbe pixel_avg2_w%1_%3
-%else
-    jb pixel_avg2_w%1_%3
-%endif
-%if 0 ; or %1==8 - but the extra branch seems too expensive
-    ja cachesplit
-%if ARCH_X86_64
-    test      r4b, 1
-%else
-    test byte r4m, 1
-%endif
-    jz pixel_avg2_w%1_%3
-%else
-    or     eax, r4m
-    and    eax, 7
-    jz pixel_avg2_w%1_%3
-    mov    eax, r2m
-%endif
-%if mmsize==16 || (%1==8 && %2==64)
-    AVG_CACHELINE_FUNC %1, %2
-%else
-    jmp cachesplit
-%endif
-%endmacro
-
-INIT_MMX
-AVG_CACHELINE_CHECK  8, 64, mmx2
-AVG_CACHELINE_CHECK 12, 64, mmx2
-%if ARCH_X86_64 == 0
-AVG_CACHELINE_CHECK 16, 64, mmx2
-AVG_CACHELINE_CHECK 20, 64, mmx2
-AVG_CACHELINE_CHECK  8, 32, mmx2
-AVG_CACHELINE_CHECK 12, 32, mmx2
-AVG_CACHELINE_CHECK 16, 32, mmx2
-AVG_CACHELINE_CHECK 20, 32, mmx2
-%endif
-INIT_XMM
-AVG_CACHELINE_CHECK 16, 64, sse2
-AVG_CACHELINE_CHECK 20, 64, sse2
-
-; computed jump assumes this loop is exactly 48 bytes
-%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
-ALIGN 16
-avg_w16_align%1_%2_ssse3:
-%if %1==0 && %2==0
-    movdqa  xmm1, [r2]
-    pavgb   xmm1, [r2+r4]
-    add    r2, r3
-%elif %1==0
-    movdqa  xmm1, [r2+r4+16]
-    palignr xmm1, [r2+r4], %2
-    pavgb   xmm1, [r2]
-    add    r2, r3
-%elif %2&15==0
-    movdqa  xmm1, [r2+16]
-    palignr xmm1, [r2], %1
-    pavgb   xmm1, [r2+r4]
-    add    r2, r3
-%else
-    movdqa  xmm1, [r2+16]
-    movdqa  xmm2, [r2+r4+16]
-    palignr xmm1, [r2], %1
-    palignr xmm2, [r2+r4], %2&15
-    add    r2, r3
-    pavgb   xmm1, xmm2
-%endif
-    movdqa  [r0], xmm1
-    add    r0, r1
-    dec    r5d
-    jg     avg_w16_align%1_%2_ssse3
-    ret
-%if %1==0
-    ; make sure the first ones don't end up short
-    ALIGN 16
-    times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
-%endif
-%endmacro
-
-cglobal pixel_avg2_w16_cache64_ssse3
-%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
-    mov   eax, r2m
-    and   eax, 0x3f
-    cmp   eax, 0x30
-    jb x264_pixel_avg2_w16_sse2
-    or    eax, r4m
-    and   eax, 7
-    jz x264_pixel_avg2_w16_sse2
-%endif
-    PROLOGUE 6, 8
-    lea    r6, [r4+r2]
-    and    r4, ~0xf
-    and    r6, 0x1f
-    and    r2, ~0xf
-    lea    r6, [r6*3]    ;(offset + align*2)*3
-    sub    r4, r2
-    shl    r6, 4         ;jump = (offset + align*2)*48
-%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
-%ifdef PIC
-    lea    r7, [avg_w16_addr]
-    add    r6, r7
-%else
-    lea    r6, [avg_w16_addr + r6]
-%endif
-    TAIL_CALL r6, 1
-
-%assign j 0
-%assign k 1
-%rep 16
-AVG16_CACHELINE_LOOP_SSSE3 j, j
-AVG16_CACHELINE_LOOP_SSSE3 j, k
-%assign j j+1
-%assign k k+1
-%endrep
-%endif ; !HIGH_BIT_DEPTH
-
-;=============================================================================
-; pixel copy
-;=============================================================================
-
-%macro COPY1 2
-    movu  m0, [r2]
-    movu  m1, [r2+r3]
-    movu  m2, [r2+r3*2]
-    movu  m3, [r2+%2]
-    mova  [r0],      m0
-    mova  [r0+r1],   m1
-    mova  [r0+r1*2], m2
-    mova  [r0+%1],   m3
-%endmacro
-
-%macro COPY2 2-4 0, 1
-    movu  m0, [r2+%3*mmsize]
-    movu  m1, [r2+%4*mmsize]
-    movu  m2, [r2+r3+%3*mmsize]
-    movu  m3, [r2+r3+%4*mmsize]
-    mova  [r0+%3*mmsize],      m0
-    mova  [r0+%4*mmsize],      m1
-    mova  [r0+r1+%3*mmsize],   m2
-    mova  [r0+r1+%4*mmsize],   m3
-    movu  m0, [r2+r3*2+%3*mmsize]
-    movu  m1, [r2+r3*2+%4*mmsize]
-    movu  m2, [r2+%2+%3*mmsize]
-    movu  m3, [r2+%2+%4*mmsize]
-    mova  [r0+r1*2+%3*mmsize], m0
-    mova  [r0+r1*2+%4*mmsize], m1
-    mova  [r0+%1+%3*mmsize],   m2
-    mova  [r0+%1+%4*mmsize],   m3
-%endmacro
-
-%macro COPY4 2
-    COPY2 %1, %2, 0, 1
-    COPY2 %1, %2, 2, 3
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
-;                  uint8_t *src, intptr_t i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal mc_copy_w4_mmx, 4,6
-    FIX_STRIDES r1, r3
-    cmp dword r4m, 4
-    lea     r5, [r3*3]
-    lea     r4, [r1*3]
-    je .end
-%if HIGH_BIT_DEPTH == 0
-    %define mova movd
-    %define movu movd
-%endif
-    COPY1   r4, r5
-    lea     r2, [r2+r3*4]
-    lea     r0, [r0+r1*4]
-.end:
-    COPY1   r4, r5
-    RET
-
-%macro MC_COPY 1
-%assign %%w %1*SIZEOF_PIXEL/mmsize
-%if %%w > 0
-cglobal mc_copy_w%1, 5,7
-    FIX_STRIDES r1, r3
-    lea     r6, [r3*3]
-    lea     r5, [r1*3]
-.height_loop:
-    COPY %+ %%w r5, r6
-    lea     r2, [r2+r3*4]
-    lea     r0, [r0+r1*4]
-    sub    r4d, 4
-    jg .height_loop
-    RET
-%endif
-%endmacro
-
-INIT_MMX mmx
-MC_COPY  8
-MC_COPY 16
-INIT_XMM sse
-MC_COPY  8
-MC_COPY 16
-INIT_XMM aligned, sse
-MC_COPY 16
-%if HIGH_BIT_DEPTH
-INIT_YMM avx
-MC_COPY 16
-INIT_YMM aligned, avx
-MC_COPY 16
-%endif
-
-;=============================================================================
-; prefetch
-;=============================================================================
-; assumes 64 byte cachelines
-; FIXME doesn't cover all pixels in high depth and/or 4:4:4
-
-;-----------------------------------------------------------------------------
-; void prefetch_fenc( pixel *pix_y,  intptr_t stride_y,
-;                     pixel *pix_uv, intptr_t stride_uv, int mb_x )
-;-----------------------------------------------------------------------------
-
-%macro PREFETCH_FENC 1
-%if ARCH_X86_64
-cglobal prefetch_fenc_%1, 5,5
-    FIX_STRIDES r1, r3
-    and    r4d, 3
-    mov    eax, r4d
-    imul   r4d, r1d
-    lea    r0,  [r0+r4*4+64*SIZEOF_PIXEL]
-    prefetcht0  [r0]
-    prefetcht0  [r0+r1]
-    lea    r0,  [r0+r1*2]
-    prefetcht0  [r0]
-    prefetcht0  [r0+r1]
-
-    imul   eax, r3d
-    lea    r2,  [r2+rax*2+64*SIZEOF_PIXEL]
-    prefetcht0  [r2]
-    prefetcht0  [r2+r3]
-%ifidn %1, 422
-    lea    r2,  [r2+r3*2]
-    prefetcht0  [r2]
-    prefetcht0  [r2+r3]
-%endif
-    RET
-
-%else
-cglobal prefetch_fenc_%1, 0,3
-    mov    r2, r4m
-    mov    r1, r1m
-    mov    r0, r0m
-    FIX_STRIDES r1
-    and    r2, 3
-    imul   r2, r1
-    lea    r0, [r0+r2*4+64*SIZEOF_PIXEL]
-    prefetcht0 [r0]
-    prefetcht0 [r0+r1]
-    lea    r0, [r0+r1*2]
-    prefetcht0 [r0]
-    prefetcht0 [r0+r1]
-
-    mov    r2, r4m
-    mov    r1, r3m
-    mov    r0, r2m
-    FIX_STRIDES r1
-    and    r2, 3
-    imul   r2, r1
-    lea    r0, [r0+r2*2+64*SIZEOF_PIXEL]
-    prefetcht0 [r0]
-    prefetcht0 [r0+r1]
-%ifidn %1, 422
-    lea    r0,  [r0+r1*2]
-    prefetcht0  [r0]
-    prefetcht0  [r0+r1]
-%endif
-    ret
-%endif ; ARCH_X86_64
-%endmacro
-
-INIT_MMX mmx2
-PREFETCH_FENC 420
-PREFETCH_FENC 422
-
-;-----------------------------------------------------------------------------
-; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal prefetch_ref, 3,3
-    FIX_STRIDES r1
-    dec    r2d
-    and    r2d, r1d
-    lea    r0,  [r0+r2*8+64*SIZEOF_PIXEL]
-    lea    r2,  [r1*3]
-    prefetcht0  [r0]
-    prefetcht0  [r0+r1]
-    prefetcht0  [r0+r1*2]
-    prefetcht0  [r0+r2]
-    lea    r0,  [r0+r1*4]
-    prefetcht0  [r0]
-    prefetcht0  [r0+r1]
-    prefetcht0  [r0+r1*2]
-    prefetcht0  [r0+r2]
-    RET
-
-
-
-;=============================================================================
-; chroma MC
-;=============================================================================
-
-%if ARCH_X86_64
-    DECLARE_REG_TMP 6,7,8
-%else
-    DECLARE_REG_TMP 0,1,2
-%endif
-
-%macro MC_CHROMA_START 1
-%if ARCH_X86_64
-    PROLOGUE 0,9,%1
-%else
-    PROLOGUE 0,6,%1
-%endif
-    movifnidn r3,  r3mp
-    movifnidn r4d, r4m
-    movifnidn r5d, r5m
-    movifnidn t0d, r6m
-    mov       t2d, t0d
-    mov       t1d, r5d
-    sar       t0d, 3
-    sar       t1d, 3
-    imul      t0d, r4d
-    lea       t0d, [t0+t1*2]
-    FIX_STRIDES t0d
-    movsxdifnidn t0, t0d
-    add       r3,  t0            ; src += (dx>>3) + (dy>>3) * src_stride
-%endmacro
-
-%if HIGH_BIT_DEPTH
-%macro UNPACK_UNALIGNED 4
-    movu       %1, [%4+0]
-    movu       %2, [%4+4]
-    punpckhwd  %3, %1, %2
-    punpcklwd  %1, %2
-%if mmsize == 8
-    mova       %2, %1
-    punpcklwd  %1, %3
-    punpckhwd  %2, %3
-%else
-    shufps     %2, %1, %3, q3131
-    shufps     %1, %3, q2020
-%endif
-%endmacro
-%else ; !HIGH_BIT_DEPTH
-%macro UNPACK_UNALIGNED 3
-%if mmsize == 8
-    punpcklwd  %1, %3
-%else
-    movh       %2, %3
-    punpcklwd  %1, %2
-%endif
-%endmacro
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
-;                 uint8_t *src, intptr_t src_stride,
-;                 int dx, int dy,
-;                 int width, int height )
-;-----------------------------------------------------------------------------
-%macro MC_CHROMA 0
-cglobal mc_chroma
-    MC_CHROMA_START 0
-    FIX_STRIDES r4
-    and       r5d, 7
-%if ARCH_X86_64
-    jz .mc1dy
-%endif
-    and       t2d, 7
-%if ARCH_X86_64
-    jz .mc1dx
-%endif
-    shl       r5d, 16
-    add       t2d, r5d
-    mov       t0d, t2d
-    shl       t2d, 8
-    sub       t2d, t0d
-    add       t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
-    cmp dword r7m, 4
-%if mmsize==8
-.skip_prologue:
-%else
-    jl mc_chroma_mmx2 %+ .skip_prologue
-    WIN64_SPILL_XMM 9
-%endif
-    movd       m5, t2d
-    movifnidn  r0, r0mp
-    movifnidn  r1, r1mp
-    movifnidn r2d, r2m
-    movifnidn r5d, r8m
-    pxor       m6, m6
-    punpcklbw  m5, m6
-%if mmsize==8
-    pshufw     m7, m5, q3232
-    pshufw     m6, m5, q0000
-    pshufw     m5, m5, q1111
-    jge .width4
-%else
-%if WIN64
-    cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
-%endif
-    pshufd     m7, m5, q1111
-    punpcklwd  m5, m5
-    pshufd     m6, m5, q0000
-    pshufd     m5, m5, q1111
-    jg .width8
-%endif
-%if HIGH_BIT_DEPTH
-    add        r2, r2
-    UNPACK_UNALIGNED m0, m1, m2, r3
-%else
-    movu       m0, [r3]
-    UNPACK_UNALIGNED m0, m1, [r3+2]
-    mova       m1, m0
-    pand       m0, [pw_00ff]
-    psrlw      m1, 8
-%endif ; HIGH_BIT_DEPTH
-    pmaddwd    m0, m7
-    pmaddwd    m1, m7
-    packssdw   m0, m1
-    SWAP        3, 0
-ALIGN 4
-.loop2:
-%if HIGH_BIT_DEPTH
-    UNPACK_UNALIGNED m0, m1, m2, r3+r4
-    pmullw     m3, m6
-%else ; !HIGH_BIT_DEPTH
-    movu       m0, [r3+r4]
-    UNPACK_UNALIGNED m0, m1, [r3+r4+2]
-    pmullw     m3, m6
-    mova       m1, m0
-    pand       m0, [pw_00ff]
-    psrlw      m1, 8
-%endif ; HIGH_BIT_DEPTH
-    pmaddwd    m0, m7
-    pmaddwd    m1, m7
-    mova       m2, [pw_32]
-    packssdw   m0, m1
-    paddw      m2, m3
-    mova       m3, m0
-    pmullw     m0, m5
-    paddw      m0, m2
-    psrlw      m0, 6
-%if HIGH_BIT_DEPTH
-    movh     [r0], m0
-%if mmsize == 8
-    psrlq      m0, 32
-    movh     [r1], m0
-%else
-    movhps   [r1], m0
-%endif
-%else ; !HIGH_BIT_DEPTH
-    packuswb   m0, m0
-    movd     [r0], m0
-%if mmsize==8
-    psrlq      m0, 16
-%else
-    psrldq     m0, 4
-%endif
-    movd     [r1], m0
-%endif ; HIGH_BIT_DEPTH
-    add        r3, r4
-    add        r0, r2
-    add        r1, r2
-    dec       r5d
-    jg .loop2
-    RET
-
-%if mmsize==8
-.width4:
-%if ARCH_X86_64
-    mov        t0, r0
-    mov        t1, r1
-    mov        t2, r3
-%if WIN64
-    %define multy0 r4m
-%else
-    %define multy0 [rsp-8]
-%endif
-    mova    multy0, m5
-%else
-    mov       r3m, r3
-    %define multy0 r4m
-    mova    multy0, m5
-%endif
-%else
-.width8:
-%if ARCH_X86_64
-    %define multy0 m8
-    SWAP        8, 5
-%else
-    %define multy0 r0m
-    mova    multy0, m5
-%endif
-%endif
-    FIX_STRIDES r2
-.loopx:
-%if HIGH_BIT_DEPTH
-    UNPACK_UNALIGNED m0, m2, m4, r3
-    UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
-%else
-    movu       m0, [r3]
-    movu       m1, [r3+mmsize/2]
-    UNPACK_UNALIGNED m0, m2, [r3+2]
-    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
-    psrlw      m2, m0, 8
-    psrlw      m3, m1, 8
-    pand       m0, [pw_00ff]
-    pand       m1, [pw_00ff]
-%endif
-    pmaddwd    m0, m7
-    pmaddwd    m2, m7
-    pmaddwd    m1, m7
-    pmaddwd    m3, m7
-    packssdw   m0, m2
-    packssdw   m1, m3
-    SWAP        4, 0
-    SWAP        5, 1
-    add        r3, r4
-ALIGN 4
-.loop4:
-%if HIGH_BIT_DEPTH
-    UNPACK_UNALIGNED m0, m1, m2, r3
-    pmaddwd    m0, m7
-    pmaddwd    m1, m7
-    packssdw   m0, m1
-    UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
-    pmaddwd    m1, m7
-    pmaddwd    m2, m7
-    packssdw   m1, m2
-%else ; !HIGH_BIT_DEPTH
-    movu       m0, [r3]
-    movu       m1, [r3+mmsize/2]
-    UNPACK_UNALIGNED m0, m2, [r3+2]
-    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
-    psrlw      m2, m0, 8
-    psrlw      m3, m1, 8
-    pand       m0, [pw_00ff]
-    pand       m1, [pw_00ff]
-    pmaddwd    m0, m7
-    pmaddwd    m2, m7
-    pmaddwd    m1, m7
-    pmaddwd    m3, m7
-    packssdw   m0, m2
-    packssdw   m1, m3
-%endif ; HIGH_BIT_DEPTH
-    pmullw     m4, m6
-    pmullw     m5, m6
-    mova       m2, [pw_32]
-    paddw      m3, m2, m5
-    paddw      m2, m4
-    mova       m4, m0
-    mova       m5, m1
-    pmullw     m0, multy0
-    pmullw     m1, multy0
-    paddw      m0, m2
-    paddw      m1, m3
-    psrlw      m0, 6
-    psrlw      m1, 6
-%if HIGH_BIT_DEPTH
-    movh     [r0], m0
-    movh     [r0+mmsize/2], m1
-%if mmsize==8
-    psrlq      m0, 32
-    psrlq      m1, 32
-    movh     [r1], m0
-    movh     [r1+mmsize/2], m1
-%else
-    movhps   [r1], m0
-    movhps   [r1+mmsize/2], m1
-%endif
-%else ; !HIGH_BIT_DEPTH
-    packuswb   m0, m1
-%if mmsize==8
-    pshufw     m1, m0, q0020
-    pshufw     m0, m0, q0031
-    movd     [r0], m1
-    movd     [r1], m0
-%else
-    pshufd     m0, m0, q3120
-    movq     [r0], m0
-    movhps   [r1], m0
-%endif
-%endif ; HIGH_BIT_DEPTH
-    add        r3, r4
-    add        r0, r2
-    add        r1, r2
-    dec       r5d
-    jg .loop4
-%if mmsize!=8
-    RET
-%else
-    sub dword r7m, 4
-    jg .width8
-    RET
-.width8:
-%if ARCH_X86_64
-    lea        r3, [t2+8*SIZEOF_PIXEL]
-    lea        r0, [t0+4*SIZEOF_PIXEL]
-    lea        r1, [t1+4*SIZEOF_PIXEL]
-%else
-    mov        r3, r3m
-    mov        r0, r0m
-    mov        r1, r1m
-    add        r3, 8*SIZEOF_PIXEL
-    add        r0, 4*SIZEOF_PIXEL
-    add        r1, 4*SIZEOF_PIXEL
-%endif
-    mov       r5d, r8m
-    jmp .loopx
-%endif
-
-%if ARCH_X86_64 ; too many regs for x86_32
-    RESET_MM_PERMUTATION
-%if WIN64
-    %assign stack_offset stack_offset - stack_size_padded
-    %assign stack_size_padded 0
-    %assign xmm_regs_used 0
-%endif
-.mc1dy:
-    and       t2d, 7
-    movd       m5, t2d
-    mov       r6d, r4d ; pel_offset = dx ? 2 : src_stride
-    jmp .mc1d
-.mc1dx:
-    movd       m5, r5d
-    mov       r6d, 2*SIZEOF_PIXEL
-.mc1d:
-%if HIGH_BIT_DEPTH && mmsize == 16
-    WIN64_SPILL_XMM 8
-%endif
-    mova       m4, [pw_8]
-    SPLATW     m5, m5
-    psubw      m4, m5
-    movifnidn  r0, r0mp
-    movifnidn  r1, r1mp
-    movifnidn r2d, r2m
-    FIX_STRIDES r2
-    movifnidn r5d, r8m
-    cmp dword r7m, 4
-    jg .mc1d_w8
-    mov        r7, r2
-    mov        r8, r4
-%if mmsize!=8
-    shr       r5d, 1
-%endif
-.loop1d_w4:
-%if HIGH_BIT_DEPTH
-%if mmsize == 8
-    movq       m0, [r3+0]
-    movq       m2, [r3+8]
-    movq       m1, [r3+r6+0]
-    movq       m3, [r3+r6+8]
-%else
-    movu       m0, [r3]
-    movu       m1, [r3+r6]
-    add        r3, r8
-    movu       m2, [r3]
-    movu       m3, [r3+r6]
-%endif
-    SBUTTERFLY wd, 0, 2, 6
-    SBUTTERFLY wd, 1, 3, 7
-    SBUTTERFLY wd, 0, 2, 6
-    SBUTTERFLY wd, 1, 3, 7
-%if mmsize == 16
-    SBUTTERFLY wd, 0, 2, 6
-    SBUTTERFLY wd, 1, 3, 7
-%endif
-%else ; !HIGH_BIT_DEPTH
-    movq       m0, [r3]
-    movq       m1, [r3+r6]
-%if mmsize!=8
-    add        r3, r8
-    movhps     m0, [r3]
-    movhps     m1, [r3+r6]
-%endif
-    psrlw      m2, m0, 8
-    psrlw      m3, m1, 8
-    pand       m0, [pw_00ff]
-    pand       m1, [pw_00ff]
-%endif ; HIGH_BIT_DEPTH
-    pmullw     m0, m4
-    pmullw     m1, m5
-    pmullw     m2, m4
-    pmullw     m3, m5
-    paddw      m0, [pw_4]
-    paddw      m2, [pw_4]
-    paddw      m0, m1
-    paddw      m2, m3
-    psrlw      m0, 3
-    psrlw      m2, 3
-%if HIGH_BIT_DEPTH
-%if mmsize == 8
-    xchg       r4, r8
-    xchg       r2, r7
-%endif
-    movq     [r0], m0
-    movq     [r1], m2
-%if mmsize == 16
-    add        r0, r7
-    add        r1, r7
-    movhps   [r0], m0
-    movhps   [r1], m2
-%endif
-%else ; !HIGH_BIT_DEPTH
-    packuswb   m0, m2
-%if mmsize==8
-    xchg       r4, r8
-    xchg       r2, r7
-    movd     [r0], m0
-    psrlq      m0, 32
-    movd     [r1], m0
-%else
-    movhlps    m1, m0
-    movd     [r0], m0
-    movd     [r1], m1
-    add        r0, r7
-    add        r1, r7
-    psrldq     m0, 4
-    psrldq     m1, 4
-    movd     [r0], m0
-    movd     [r1], m1
-%endif
-%endif ; HIGH_BIT_DEPTH
-    add        r3, r4
-    add        r0, r2
-    add        r1, r2
-    dec       r5d
-    jg .loop1d_w4
-    RET
-.mc1d_w8:
-    sub       r2, 4*SIZEOF_PIXEL
-    sub       r4, 8*SIZEOF_PIXEL
-    mov       r7, 4*SIZEOF_PIXEL
-    mov       r8, 8*SIZEOF_PIXEL
-%if mmsize==8
-    shl       r5d, 1
-%endif
-    jmp .loop1d_w4
-%endif ; ARCH_X86_64
-%endmacro ; MC_CHROMA
-
-%macro MC_CHROMA_SSSE3 0
-cglobal mc_chroma
-    MC_CHROMA_START 10-cpuflag(avx2)
-    and       r5d, 7
-    and       t2d, 7
-    mov       t0d, r5d
-    shl       t0d, 8
-    sub       t0d, r5d
-    mov       r5d, 8
-    add       t0d, 8
-    sub       r5d, t2d
-    imul      t2d, t0d ; (x*255+8)*y
-    imul      r5d, t0d ; (x*255+8)*(8-y)
-    movd      xm6, t2d
-    movd      xm7, r5d
-%if cpuflag(cache64)
-    mov       t0d, r3d
-    and       t0d, 7
-%ifdef PIC
-    lea        t1, [ch_shuf_adj]
-    movddup   xm5, [t1 + t0*4]
-%else
-    movddup   xm5, [ch_shuf_adj + t0*4]
-%endif
-    paddb     xm5, [ch_shuf]
-    and        r3, ~7
-%else
-    mova       m5, [ch_shuf]
-%endif
-    movifnidn  r0, r0mp
-    movifnidn  r1, r1mp
-    movifnidn r2d, r2m
-    movifnidn r5d, r8m
-%if cpuflag(avx2)
-    vpbroadcastw m6, xm6
-    vpbroadcastw m7, xm7
-%else
-    SPLATW     m6, m6
-    SPLATW     m7, m7
-%endif
-%if ARCH_X86_64
-    %define shiftround m8
-    mova       m8, [pw_512]
-%else
-    %define shiftround [pw_512]
-%endif
-    cmp dword r7m, 4
-    jg .width8
-
-%if cpuflag(avx2)
-.loop4:
-    movu      xm0, [r3]
-    movu      xm1, [r3+r4]
-    vinserti128 m0, m0, [r3+r4], 1
-    vinserti128 m1, m1, [r3+r4*2], 1
-    pshufb     m0, m5
-    pshufb     m1, m5
-    pmaddubsw  m0, m7
-    pmaddubsw  m1, m6
-    paddw      m0, m1
-    pmulhrsw   m0, shiftround
-    packuswb   m0, m0
-    vextracti128 xm1, m0, 1
-    movd     [r0], xm0
-    movd  [r0+r2], xm1
-    psrldq    xm0, 4
-    psrldq    xm1, 4
-    movd     [r1], xm0
-    movd  [r1+r2], xm1
-    lea        r3, [r3+r4*2]
-    lea        r0, [r0+r2*2]
-    lea        r1, [r1+r2*2]
-    sub       r5d, 2
-    jg .loop4
-    RET
-.width8:
-    movu      xm0, [r3]
-    vinserti128 m0, m0, [r3+8], 1
-    pshufb     m0, m5
-.loop8:
-    movu      xm3, [r3+r4]
-    vinserti128 m3, m3, [r3+r4+8], 1
-    pshufb     m3, m5
-    pmaddubsw  m1, m0, m7
-    pmaddubsw  m2, m3, m6
-    pmaddubsw  m3, m3, m7
-
-    movu      xm0, [r3+r4*2]
-    vinserti128 m0, m0, [r3+r4*2+8], 1
-    pshufb     m0, m5
-    pmaddubsw  m4, m0, m6
-
-    paddw      m1, m2
-    paddw      m3, m4
-    pmulhrsw   m1, shiftround
-    pmulhrsw   m3, shiftround
-    packuswb   m1, m3
-    mova       m2, [deinterleave_shufd]
-    vpermd     m1, m2, m1
-    vextracti128 xm2, m1, 1
-    movq      [r0], xm1
-    movhps    [r1], xm1
-    movq   [r0+r2], xm2
-    movhps [r1+r2], xm2
-%else
-    movu       m0, [r3]
-    pshufb     m0, m5
-.loop4:
-    movu       m1, [r3+r4]
-    pshufb     m1, m5
-    movu       m3, [r3+r4*2]
-    pshufb     m3, m5
-    mova       m4, m3
-    pmaddubsw  m0, m7
-    pmaddubsw  m2, m1, m7
-    pmaddubsw  m1, m6
-    pmaddubsw  m3, m6
-    paddw      m1, m0
-    paddw      m3, m2
-    pmulhrsw   m1, shiftround
-    pmulhrsw   m3, shiftround
-    mova       m0, m4
-    packuswb   m1, m3
-    movd     [r0], m1
-%if cpuflag(sse4)
-    pextrd    [r1], m1, 1
-    pextrd [r0+r2], m1, 2
-    pextrd [r1+r2], m1, 3
-%else
-    movhlps    m3, m1
-    movd  [r0+r2], m3
-    psrldq     m1, 4
-    psrldq     m3, 4
-    movd     [r1], m1
-    movd  [r1+r2], m3
-%endif
-    lea        r3, [r3+r4*2]
-    lea        r0, [r0+r2*2]
-    lea        r1, [r1+r2*2]
-    sub       r5d, 2
-    jg .loop4
-    RET
-.width8:
-    movu       m0, [r3]
-    pshufb     m0, m5
-    movu       m1, [r3+8]
-    pshufb     m1, m5
-%if ARCH_X86_64
-    SWAP        9, 6
-    %define  mult1 m9
-%else
-    mova      r0m, m6
-    %define  mult1 r0m
-%endif
-.loop8:
-    movu       m2, [r3+r4]
-    pshufb     m2, m5
-    movu       m3, [r3+r4+8]
-    pshufb     m3, m5
-    mova       m4, m2
-    mova       m6, m3
-    pmaddubsw  m0, m7
-    pmaddubsw  m1, m7
-    pmaddubsw  m2, mult1
-    pmaddubsw  m3, mult1
-    paddw      m0, m2
-    paddw      m1, m3
-    pmulhrsw   m0, shiftround ; x + 32 >> 6
-    pmulhrsw   m1, shiftround
-    packuswb   m0, m1
-    pshufd     m0, m0, q3120
-    movq     [r0], m0
-    movhps   [r1], m0
-
-    movu       m2, [r3+r4*2]
-    pshufb     m2, m5
-    movu       m3, [r3+r4*2+8]
-    pshufb     m3, m5
-    mova       m0, m2
-    mova       m1, m3
-    pmaddubsw  m4, m7
-    pmaddubsw  m6, m7
-    pmaddubsw  m2, mult1
-    pmaddubsw  m3, mult1
-    paddw      m2, m4
-    paddw      m3, m6
-    pmulhrsw   m2, shiftround
-    pmulhrsw   m3, shiftround
-    packuswb   m2, m3
-    pshufd     m2, m2, q3120
-    movq   [r0+r2], m2
-    movhps [r1+r2], m2
-%endif
-    lea        r3, [r3+r4*2]
-    lea        r0, [r0+r2*2]
-    lea        r1, [r1+r2*2]
-    sub       r5d, 2
-    jg .loop8
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_MMX mmx2
-MC_CHROMA
-INIT_XMM sse2
-MC_CHROMA
-INIT_XMM avx
-MC_CHROMA
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX mmx2
-MC_CHROMA
-INIT_XMM sse2
-MC_CHROMA
-INIT_XMM ssse3
-MC_CHROMA_SSSE3
-INIT_XMM ssse3, cache64
-MC_CHROMA_SSSE3
-INIT_XMM avx
-MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
-INIT_YMM avx2
-MC_CHROMA_SSSE3
-%endif ; HIGH_BIT_DEPTH
diff --git a/android/src/main/libenc/jni/libx264/common/x86/mc-a2.asm b/android/src/main/libenc/jni/libx264/common/x86/mc-a2.asm
deleted file mode 100755
index 50a4a31..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/mc-a2.asm
+++ /dev/null
@@ -1,2437 +0,0 @@
-;*****************************************************************************
-;* mc-a2.asm: x86 motion compensation
-;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Holger Lubitz <holger@lubitz.org>
-;*          Mathieu Monnier <manao@melix.net>
-;*          Oskar Arvidsson <oskar@irock.se>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-pw_1024: times 16 dw 1024
-filt_mul20: times 32 db 20
-filt_mul15: times 16 db 1, -5
-filt_mul51: times 16 db -5, 1
-hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
-deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
-
-%if HIGH_BIT_DEPTH
-copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
-v210_mask: times 4 dq 0xc00ffc003ff003ff
-v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
-v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
-; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
-v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
-           dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
-
-deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
-deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
-%else
-copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
-deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
-                       db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1
-
-deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
-deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
-%endif ; !HIGH_BIT_DEPTH
-
-mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
-                         db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
-mbtree_fix8_pack_shuf:   db  1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
-
-pf_256:    times 4 dd 256.0
-pf_inv256: times 4 dd 0.00390625
-
-pd_16: times 4 dd 16
-
-pad10: times 8 dw    10*PIXEL_MAX
-pad20: times 8 dw    20*PIXEL_MAX
-pad30: times 8 dw    30*PIXEL_MAX
-depad: times 4 dd 32*20*PIXEL_MAX + 512
-
-tap1: times 4 dw  1, -5
-tap2: times 4 dw 20, 20
-tap3: times 4 dw -5,  1
-
-pw_0xc000: times 8 dw 0xc000
-pw_31: times 8 dw 31
-pd_4: times 4 dd 4
-
-SECTION .text
-
-cextern pb_0
-cextern pw_1
-cextern pw_8
-cextern pw_16
-cextern pw_32
-cextern pw_512
-cextern pw_00ff
-cextern pw_3fff
-cextern pw_pixel_max
-cextern pw_0to15
-cextern pd_8
-cextern pd_0123
-cextern pd_ffff
-
-%macro LOAD_ADD 4
-    movh       %4, %3
-    movh       %1, %2
-    punpcklbw  %4, m0
-    punpcklbw  %1, m0
-    paddw      %1, %4
-%endmacro
-
-%macro LOAD_ADD_2 6
-    mova       %5, %3
-    mova       %1, %4
-    punpckhbw  %6, %5, m0
-    punpcklbw  %5, m0
-    punpckhbw  %2, %1, m0
-    punpcklbw  %1, m0
-    paddw      %1, %5
-    paddw      %2, %6
-%endmacro
-
-%macro FILT_V2 6
-    psubw  %1, %2  ; a-b
-    psubw  %4, %5
-    psubw  %2, %3  ; b-c
-    psubw  %5, %6
-    psllw  %2, 2
-    psllw  %5, 2
-    psubw  %1, %2  ; a-5*b+4*c
-    psllw  %3, 4
-    psubw  %4, %5
-    psllw  %6, 4
-    paddw  %1, %3  ; a-5*b+20*c
-    paddw  %4, %6
-%endmacro
-
-%macro FILT_H 3
-    psubw  %1, %2  ; a-b
-    psraw  %1, 2   ; (a-b)/4
-    psubw  %1, %2  ; (a-b)/4-b
-    paddw  %1, %3  ; (a-b)/4-b+c
-    psraw  %1, 2   ; ((a-b)/4-b+c)/4
-    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-%endmacro
-
-%macro FILT_H2 6
-    psubw  %1, %2
-    psubw  %4, %5
-    psraw  %1, 2
-    psraw  %4, 2
-    psubw  %1, %2
-    psubw  %4, %5
-    paddw  %1, %3
-    paddw  %4, %6
-    psraw  %1, 2
-    psraw  %4, 2
-    paddw  %1, %3
-    paddw  %4, %6
-%endmacro
-
-%macro FILT_PACK 3-5
-%if cpuflag(ssse3)
-    pmulhrsw %1, %3
-    pmulhrsw %2, %3
-%else
-    paddw    %1, %3
-    paddw    %2, %3
-%if %0 == 5
-    psubusw  %1, %5
-    psubusw  %2, %5
-    psrlw    %1, %4
-    psrlw    %2, %4
-%else
-    psraw    %1, %4
-    psraw    %2, %4
-%endif
-%endif
-%if HIGH_BIT_DEPTH == 0
-    packuswb %1, %2
-%endif
-%endmacro
-
-;The hpel_filter routines use non-temporal writes for output.
-;The following defines may be uncommented for testing.
-;Doing the hpel_filter temporal may be a win if the last level cache
-;is big enough (preliminary benching suggests on the order of 4* framesize).
-
-;%define movntq movq
-;%define movntps movaps
-;%define sfence
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
-;-----------------------------------------------------------------------------
-%macro HPEL_FILTER 0
-cglobal hpel_filter_v, 5,6,11
-    FIX_STRIDES r3, r4
-    lea        r5, [r1+r3]
-    sub        r1, r3
-    sub        r1, r3
-%if num_mmregs > 8
-    mova       m8, [pad10]
-    mova       m9, [pad20]
-    mova      m10, [pad30]
-    %define s10 m8
-    %define s20 m9
-    %define s30 m10
-%else
-    %define s10 [pad10]
-    %define s20 [pad20]
-    %define s30 [pad30]
-%endif
-    add        r0, r4
-    add        r2, r4
-    neg        r4
-    mova       m7, [pw_pixel_max]
-    pxor       m0, m0
-.loop:
-    mova       m1, [r1]
-    mova       m2, [r1+r3]
-    mova       m3, [r1+r3*2]
-    mova       m4, [r1+mmsize]
-    mova       m5, [r1+r3+mmsize]
-    mova       m6, [r1+r3*2+mmsize]
-    paddw      m1, [r5+r3*2]
-    paddw      m2, [r5+r3]
-    paddw      m3, [r5]
-    paddw      m4, [r5+r3*2+mmsize]
-    paddw      m5, [r5+r3+mmsize]
-    paddw      m6, [r5+mmsize]
-    add        r1, 2*mmsize
-    add        r5, 2*mmsize
-    FILT_V2    m1, m2, m3, m4, m5, m6
-    mova       m6, [pw_16]
-    psubw      m1, s20
-    psubw      m4, s20
-    mova      [r2+r4], m1
-    mova      [r2+r4+mmsize], m4
-    paddw      m1, s30
-    paddw      m4, s30
-    FILT_PACK  m1, m4, m6, 5, s10
-    CLIPW      m1, m0, m7
-    CLIPW      m4, m0, m7
-    mova      [r0+r4], m1
-    mova      [r0+r4+mmsize], m4
-    add        r4, 2*mmsize
-    jl .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_c, 3,3,10
-    add        r2, r2
-    add        r0, r2
-    add        r1, r2
-    neg        r2
-    mova       m0, [tap1]
-    mova       m7, [tap3]
-%if num_mmregs > 8
-    mova       m8, [tap2]
-    mova       m9, [depad]
-    %define s1 m8
-    %define s2 m9
-%else
-    %define s1 [tap2]
-    %define s2 [depad]
-%endif
-.loop:
-    movu       m1, [r1+r2-4]
-    movu       m2, [r1+r2-2]
-    mova       m3, [r1+r2+0]
-    movu       m4, [r1+r2+2]
-    movu       m5, [r1+r2+4]
-    movu       m6, [r1+r2+6]
-    pmaddwd    m1, m0
-    pmaddwd    m2, m0
-    pmaddwd    m3, s1
-    pmaddwd    m4, s1
-    pmaddwd    m5, m7
-    pmaddwd    m6, m7
-    paddd      m1, s2
-    paddd      m2, s2
-    paddd      m3, m5
-    paddd      m4, m6
-    paddd      m1, m3
-    paddd      m2, m4
-    psrad      m1, 10
-    psrad      m2, 10
-    pslld      m2, 16
-    pand       m1, [pd_ffff]
-    por        m1, m2
-    CLIPW      m1, [pb_0], [pw_pixel_max]
-    mova  [r0+r2], m1
-    add        r2, mmsize
-    jl .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_h, 3,4,8
-    %define src r1+r2
-    add        r2, r2
-    add        r0, r2
-    add        r1, r2
-    neg        r2
-    mova       m0, [pw_pixel_max]
-.loop:
-    movu       m1, [src-4]
-    movu       m2, [src-2]
-    mova       m3, [src+0]
-    movu       m6, [src+2]
-    movu       m4, [src+4]
-    movu       m5, [src+6]
-    paddw      m3, m6 ; c0
-    paddw      m2, m4 ; b0
-    paddw      m1, m5 ; a0
-%if mmsize == 16
-    movu       m4, [src-4+mmsize]
-    movu       m5, [src-2+mmsize]
-%endif
-    movu       m7, [src+4+mmsize]
-    movu       m6, [src+6+mmsize]
-    paddw      m5, m7 ; b1
-    paddw      m4, m6 ; a1
-    movu       m7, [src+2+mmsize]
-    mova       m6, [src+0+mmsize]
-    paddw      m6, m7 ; c1
-    FILT_H2    m1, m2, m3, m4, m5, m6
-    mova       m7, [pw_1]
-    pxor       m2, m2
-    FILT_PACK  m1, m4, m7, 1
-    CLIPW      m1, m2, m0
-    CLIPW      m4, m2, m0
-    mova      [r0+r2], m1
-    mova      [r0+r2+mmsize], m4
-    add        r2, mmsize*2
-    jl .loop
-    RET
-%endmacro ; HPEL_FILTER
-
-INIT_MMX mmx2
-HPEL_FILTER
-INIT_XMM sse2
-HPEL_FILTER
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-%macro HPEL_V 1
-;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_v, 5,6,%1
-    lea r5, [r1+r3]
-    sub r1, r3
-    sub r1, r3
-    add r0, r4
-    lea r2, [r2+r4*2]
-    neg r4
-%if cpuflag(ssse3)
-    mova m0, [filt_mul15]
-%else
-    pxor m0, m0
-%endif
-.loop:
-%if cpuflag(ssse3)
-    mova m1, [r1]
-    mova m4, [r1+r3]
-    mova m2, [r5+r3*2]
-    mova m5, [r5+r3]
-    mova m3, [r1+r3*2]
-    mova m6, [r5]
-    SBUTTERFLY bw, 1, 4, 7
-    SBUTTERFLY bw, 2, 5, 7
-    SBUTTERFLY bw, 3, 6, 7
-    pmaddubsw m1, m0
-    pmaddubsw m4, m0
-    pmaddubsw m2, m0
-    pmaddubsw m5, m0
-    pmaddubsw m3, [filt_mul20]
-    pmaddubsw m6, [filt_mul20]
-    paddw  m1, m2
-    paddw  m4, m5
-    paddw  m1, m3
-    paddw  m4, m6
-    mova   m7, [pw_1024]
-%else
-    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
-    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
-    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
-    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7   ; c1
-    FILT_V2 m1, m2, m3, m4, m5, m6
-    mova   m7, [pw_16]
-%endif
-%if mmsize==32
-    mova         [r2+r4*2], xm1
-    mova         [r2+r4*2+mmsize/2], xm4
-    vextracti128 [r2+r4*2+mmsize], m1, 1
-    vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
-%else
-    mova      [r2+r4*2], m1
-    mova      [r2+r4*2+mmsize], m4
-%endif
-    FILT_PACK m1, m4, m7, 5
-    movnta    [r0+r4], m1
-    add r1, mmsize
-    add r5, mmsize
-    add r4, mmsize
-    jl .loop
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal hpel_filter_c, 3,3
-    add r0, r2
-    lea r1, [r1+r2*2]
-    neg r2
-    %define src r1+r2*2
-    movq m7, [pw_32]
-.loop:
-    movq   m1, [src-4]
-    movq   m2, [src-2]
-    movq   m3, [src  ]
-    movq   m4, [src+4]
-    movq   m5, [src+6]
-    paddw  m3, [src+2]  ; c0
-    paddw  m2, m4       ; b0
-    paddw  m1, m5       ; a0
-    movq   m6, [src+8]
-    paddw  m4, [src+14] ; a1
-    paddw  m5, [src+12] ; b1
-    paddw  m6, [src+10] ; c1
-    FILT_H2 m1, m2, m3, m4, m5, m6
-    FILT_PACK m1, m4, m7, 6
-    movntq [r0+r2], m1
-    add r2, 8
-    jl .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal hpel_filter_h, 3,3
-    add r0, r2
-    add r1, r2
-    neg r2
-    %define src r1+r2
-    pxor m0, m0
-.loop:
-    movd       m1, [src-2]
-    movd       m2, [src-1]
-    movd       m3, [src  ]
-    movd       m6, [src+1]
-    movd       m4, [src+2]
-    movd       m5, [src+3]
-    punpcklbw  m1, m0
-    punpcklbw  m2, m0
-    punpcklbw  m3, m0
-    punpcklbw  m6, m0
-    punpcklbw  m4, m0
-    punpcklbw  m5, m0
-    paddw      m3, m6 ; c0
-    paddw      m2, m4 ; b0
-    paddw      m1, m5 ; a0
-    movd       m7, [src+7]
-    movd       m6, [src+6]
-    punpcklbw  m7, m0
-    punpcklbw  m6, m0
-    paddw      m4, m7 ; c1
-    paddw      m5, m6 ; b1
-    movd       m7, [src+5]
-    movd       m6, [src+4]
-    punpcklbw  m7, m0
-    punpcklbw  m6, m0
-    paddw      m6, m7 ; a1
-    movq       m7, [pw_1]
-    FILT_H2 m1, m2, m3, m4, m5, m6
-    FILT_PACK m1, m4, m7, 1
-    movntq     [r0+r2], m1
-    add r2, 8
-    jl .loop
-    RET
-
-%macro HPEL_C 0
-;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
-;-----------------------------------------------------------------------------
-cglobal hpel_filter_c, 3,3,9
-    add r0, r2
-    lea r1, [r1+r2*2]
-    neg r2
-    %define src r1+r2*2
-%ifnidn cpuname, sse2
-%if cpuflag(ssse3)
-    mova    m7, [pw_512]
-%else
-    mova    m7, [pw_32]
-%endif
-    %define pw_rnd m7
-%elif ARCH_X86_64
-    mova    m8, [pw_32]
-    %define pw_rnd m8
-%else
-    %define pw_rnd [pw_32]
-%endif
-; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
-%if mmsize==32
-.loop:
-    movu    m4, [src-4]
-    movu    m5, [src-2]
-    mova    m6, [src+0]
-    movu    m3, [src-4+mmsize]
-    movu    m2, [src-2+mmsize]
-    mova    m1, [src+0+mmsize]
-    paddw   m4, [src+6]
-    paddw   m5, [src+4]
-    paddw   m6, [src+2]
-    paddw   m3, [src+6+mmsize]
-    paddw   m2, [src+4+mmsize]
-    paddw   m1, [src+2+mmsize]
-    FILT_H2 m4, m5, m6, m3, m2, m1
-%else
-    mova      m0, [src-16]
-    mova      m1, [src]
-.loop:
-    mova      m2, [src+16]
-    PALIGNR   m4, m1, m0, 12, m7
-    PALIGNR   m5, m1, m0, 14, m0
-    PALIGNR   m0, m2, m1, 6, m7
-    paddw     m4, m0
-    PALIGNR   m0, m2, m1, 4, m7
-    paddw     m5, m0
-    PALIGNR   m6, m2, m1, 2, m7
-    paddw     m6, m1
-    FILT_H    m4, m5, m6
-
-    mova      m0, m2
-    mova      m5, m2
-    PALIGNR   m2, m1, 12, m7
-    PALIGNR   m5, m1, 14, m1
-    mova      m1, [src+32]
-    PALIGNR   m3, m1, m0, 6, m7
-    paddw     m3, m2
-    PALIGNR   m6, m1, m0, 4, m7
-    paddw     m5, m6
-    PALIGNR   m6, m1, m0, 2, m7
-    paddw     m6, m0
-    FILT_H    m3, m5, m6
-%endif
-    FILT_PACK m4, m3, pw_rnd, 6
-%if mmsize==32
-    vpermq    m4, m4, q3120
-%endif
-    movnta [r0+r2], m4
-    add       r2, mmsize
-    jl .loop
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal hpel_filter_h, 3,3,8
-    add r0, r2
-    add r1, r2
-    neg r2
-    %define src r1+r2
-    pxor m0, m0
-.loop:
-    movh       m1, [src-2]
-    movh       m2, [src-1]
-    movh       m3, [src  ]
-    movh       m4, [src+1]
-    movh       m5, [src+2]
-    movh       m6, [src+3]
-    punpcklbw  m1, m0
-    punpcklbw  m2, m0
-    punpcklbw  m3, m0
-    punpcklbw  m4, m0
-    punpcklbw  m5, m0
-    punpcklbw  m6, m0
-    paddw      m3, m4 ; c0
-    paddw      m2, m5 ; b0
-    paddw      m1, m6 ; a0
-    movh       m4, [src+6]
-    movh       m5, [src+7]
-    movh       m6, [src+10]
-    movh       m7, [src+11]
-    punpcklbw  m4, m0
-    punpcklbw  m5, m0
-    punpcklbw  m6, m0
-    punpcklbw  m7, m0
-    paddw      m5, m6 ; b1
-    paddw      m4, m7 ; a1
-    movh       m6, [src+8]
-    movh       m7, [src+9]
-    punpcklbw  m6, m0
-    punpcklbw  m7, m0
-    paddw      m6, m7 ; c1
-    mova       m7, [pw_1] ; FIXME xmm8
-    FILT_H2 m1, m2, m3, m4, m5, m6
-    FILT_PACK m1, m4, m7, 1
-    movntps    [r0+r2], m1
-    add r2, 16
-    jl .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
-;-----------------------------------------------------------------------------
-%macro HPEL_H 0
-cglobal hpel_filter_h, 3,3
-    add r0, r2
-    add r1, r2
-    neg r2
-    %define src r1+r2
-    mova      m0, [src-16]
-    mova      m1, [src]
-    mova      m7, [pw_1024]
-.loop:
-    mova      m2, [src+16]
-    ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
-    ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
-    ; the repeated loads of constants for pmaddubsw.
-    palignr   m3, m1, m0, 14
-    palignr   m4, m1, m0, 15
-    palignr   m0, m2, m1, 2
-    pmaddubsw m3, [filt_mul15]
-    pmaddubsw m4, [filt_mul15]
-    pmaddubsw m0, [filt_mul51]
-    palignr   m5, m2, m1, 1
-    palignr   m6, m2, m1, 3
-    paddw     m3, m0
-    mova      m0, m1
-    pmaddubsw m1, [filt_mul20]
-    pmaddubsw m5, [filt_mul20]
-    pmaddubsw m6, [filt_mul51]
-    paddw     m3, m1
-    paddw     m4, m5
-    paddw     m4, m6
-    FILT_PACK m3, m4, m7, 5
-    pshufb    m3, [hpel_shuf]
-    mova      m1, m2
-    movntps [r0+r2], m3
-    add r2, 16
-    jl .loop
-    RET
-%endmacro
-
-INIT_MMX mmx2
-HPEL_V 0
-INIT_XMM sse2
-HPEL_V 8
-%if ARCH_X86_64 == 0
-INIT_XMM sse2
-HPEL_C
-INIT_XMM ssse3
-HPEL_C
-HPEL_V 0
-HPEL_H
-INIT_XMM avx
-HPEL_C
-HPEL_V 0
-HPEL_H
-INIT_YMM avx2
-HPEL_V 8
-HPEL_C
-
-INIT_YMM avx2
-cglobal hpel_filter_h, 3,3,8
-    add       r0, r2
-    add       r1, r2
-    neg       r2
-    %define src r1+r2
-    mova      m5, [filt_mul15]
-    mova      m6, [filt_mul20]
-    mova      m7, [filt_mul51]
-.loop:
-    movu      m0, [src-2]
-    movu      m1, [src-1]
-    movu      m2, [src+2]
-    pmaddubsw m0, m5
-    pmaddubsw m1, m5
-    pmaddubsw m2, m7
-    paddw     m0, m2
-
-    mova      m2, [src+0]
-    movu      m3, [src+1]
-    movu      m4, [src+3]
-    pmaddubsw m2, m6
-    pmaddubsw m3, m6
-    pmaddubsw m4, m7
-    paddw     m0, m2
-    paddw     m1, m3
-    paddw     m1, m4
-
-    mova      m2, [pw_1024]
-    FILT_PACK m0, m1, m2, 5
-    pshufb    m0, [hpel_shuf]
-    movnta [r0+r2], m0
-    add       r2, mmsize
-    jl .loop
-    RET
-%endif
-
-%if ARCH_X86_64
-%macro DO_FILT_V 5
-    ;The optimum prefetch distance is difficult to determine in checkasm:
-    ;any prefetch seems slower than not prefetching.
-    ;In real use, the prefetch seems to be a slight win.
-    ;+mmsize is picked somewhat arbitrarily here based on the fact that even one
-    ;loop iteration is going to take longer than the prefetch.
-    prefetcht0 [r1+r2*2+mmsize]
-%if cpuflag(ssse3)
-    mova m1, [r3]
-    mova m2, [r3+r2]
-    mova %3, [r3+r2*2]
-    mova m3, [r1]
-    mova %1, [r1+r2]
-    mova %2, [r1+r2*2]
-    punpckhbw m4, m1, m2
-    punpcklbw m1, m2
-    punpckhbw m2, %1, %2
-    punpcklbw %1, %2
-    punpckhbw %2, m3, %3
-    punpcklbw m3, %3
-
-    pmaddubsw m1, m12
-    pmaddubsw m4, m12
-    pmaddubsw %1, m0
-    pmaddubsw m2, m0
-    pmaddubsw m3, m14
-    pmaddubsw %2, m14
-
-    paddw m1, %1
-    paddw m4, m2
-    paddw m1, m3
-    paddw m4, %2
-%else
-    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5            ; a0 / a1
-    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6            ; b0 / b1
-    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4            ; c0 / c1
-    packuswb %3, %4
-    FILT_V2 m1, m2, m3, m4, m5, m6
-%endif
-    add       r3, mmsize
-    add       r1, mmsize
-%if mmsize==32
-    vinserti128 %1, m1, xm4, 1
-    vperm2i128  %2, m1, m4, q0301
-%else
-    mova      %1, m1
-    mova      %2, m4
-%endif
-    FILT_PACK m1, m4, m15, 5
-    movntps  [r8+r4+%5], m1
-%endmacro
-
-%macro FILT_C 3
-%if mmsize==32
-    vperm2i128 m3, %2, %1, q0003
-%endif
-    PALIGNR   m1, %2, %1, (mmsize-4), m3
-    PALIGNR   m2, %2, %1, (mmsize-2), m3
-%if mmsize==32
-    vperm2i128 %1, %3, %2, q0003
-%endif
-    PALIGNR   m3, %3, %2, 4, %1
-    PALIGNR   m4, %3, %2, 2, %1
-    paddw     m3, m2
-%if mmsize==32
-    mova      m2, %1
-%endif
-    mova      %1, %3
-    PALIGNR   %3, %3, %2, 6, m2
-    paddw     m4, %2
-    paddw     %3, m1
-    FILT_H    %3, m3, m4
-%endmacro
-
-%macro DO_FILT_C 4
-    FILT_C %1, %2, %3
-    FILT_C %2, %1, %4
-    FILT_PACK %3, %4, m15, 6
-%if mmsize==32
-    vpermq %3, %3, q3120
-%endif
-    movntps   [r5+r4], %3
-%endmacro
-
-%macro ADD8TO16 5
-    punpckhbw %3, %1, %5
-    punpcklbw %1, %5
-    punpcklbw %4, %2, %5
-    punpckhbw %2, %5
-    paddw     %2, %3
-    paddw     %1, %4
-%endmacro
-
-%macro DO_FILT_H 3
-%if mmsize==32
-    vperm2i128 m3, %2, %1, q0003
-%endif
-    PALIGNR   m1, %2, %1, (mmsize-2), m3
-    PALIGNR   m2, %2, %1, (mmsize-1), m3
-%if mmsize==32
-    vperm2i128 m3, %3, %2, q0003
-%endif
-    PALIGNR   m4, %3, %2, 1 , m3
-    PALIGNR   m5, %3, %2, 2 , m3
-    PALIGNR   m6, %3, %2, 3 , m3
-    mova      %1, %2
-%if cpuflag(ssse3)
-    pmaddubsw m1, m12
-    pmaddubsw m2, m12
-    pmaddubsw %2, m14
-    pmaddubsw m4, m14
-    pmaddubsw m5, m0
-    pmaddubsw m6, m0
-    paddw     m1, %2
-    paddw     m2, m4
-    paddw     m1, m5
-    paddw     m2, m6
-    FILT_PACK m1, m2, m15, 5
-    pshufb    m1, [hpel_shuf]
-%else ; ssse3, avx
-    ADD8TO16  m1, m6, m12, m3, m0 ; a
-    ADD8TO16  m2, m5, m12, m3, m0 ; b
-    ADD8TO16  %2, m4, m12, m3, m0 ; c
-    FILT_V2   m1, m2, %2, m6, m5, m4
-    FILT_PACK m1, m6, m15, 5
-%endif
-    movntps [r0+r4], m1
-    mova      %2, %3
-%endmacro
-
-%macro HPEL 0
-;-----------------------------------------------------------------------------
-; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-;                   uint8_t *src, intptr_t stride, int width, int height )
-;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,9,16
-    mov       r7, r3
-    sub      r5d, mmsize
-    mov       r8, r1
-    and       r7, mmsize-1
-    sub       r3, r7
-    add       r0, r5
-    add       r8, r5
-    add       r7, r5
-    add       r5, r2
-    mov       r2, r4
-    neg       r7
-    lea       r1, [r3+r2]
-    sub       r3, r2
-    sub       r3, r2
-    mov       r4, r7
-%if cpuflag(ssse3)
-    mova      m0, [filt_mul51]
-    mova     m12, [filt_mul15]
-    mova     m14, [filt_mul20]
-    mova     m15, [pw_1024]
-%else
-    pxor      m0, m0
-    mova     m15, [pw_16]
-%endif
-;ALIGN 16
-.loopy:
-; first filter_v
-    DO_FILT_V m8, m7, m13, m12, 0
-;ALIGN 16
-.loopx:
-    DO_FILT_V m6, m5, m11, m12, mmsize
-.lastx:
-%if cpuflag(ssse3)
-    psrlw   m15, 1   ; pw_512
-%else
-    paddw   m15, m15 ; pw_32
-%endif
-    DO_FILT_C m9, m8, m7, m6
-%if cpuflag(ssse3)
-    paddw   m15, m15 ; pw_1024
-%else
-    psrlw   m15, 1   ; pw_16
-%endif
-    mova     m7, m5
-    DO_FILT_H m10, m13, m11
-    add      r4, mmsize
-    jl .loopx
-    cmp      r4, mmsize
-    jl .lastx
-; setup regs for next y
-    sub      r4, r7
-    sub      r4, r2
-    sub      r1, r4
-    sub      r3, r4
-    add      r0, r2
-    add      r8, r2
-    add      r5, r2
-    mov      r4, r7
-    sub     r6d, 1
-    jg .loopy
-    sfence
-    RET
-%endmacro
-
-INIT_XMM sse2
-HPEL
-INIT_XMM ssse3
-HPEL
-INIT_XMM avx
-HPEL
-INIT_YMM avx2
-HPEL
-%endif ; ARCH_X86_64
-
-%undef movntq
-%undef movntps
-%undef sfence
-%endif ; !HIGH_BIT_DEPTH
-
-%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
-    %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
-    %rep (%2+63) / 64  ; assume 64 byte cache lines
-        prefetchnta [%1+%%i]
-        %assign %%i %%i + 64
-    %endrep
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst,
-;                              pixel *src, intptr_t i_src, int w, int h )
-;-----------------------------------------------------------------------------
-; assumes i_dst and w are multiples of mmsize, and i_dst>w
-%macro PLANE_COPY_CORE 1 ; swap
-%if %1
-cglobal plane_copy_swap_core, 6,7
-    mova   m4, [copy_swap_shuf]
-%else
-cglobal plane_copy_core, 6,7
-%endif
-    FIX_STRIDES r1, r3
-%if %1 && HIGH_BIT_DEPTH
-    shl   r4d, 2
-%elif %1 || HIGH_BIT_DEPTH
-    add   r4d, r4d
-%else
-    movsxdifnidn r4, r4d
-%endif
-    add    r0, r4
-    add    r2, r4
-    neg    r4
-.loopy:
-    lea    r6, [r4+4*mmsize]
-%if %1
-    test  r6d, r6d
-    jg .skip
-%endif
-.loopx:
-    PREFETCHNT_ITER r2+r6, 4*mmsize
-    movu   m0, [r2+r6-4*mmsize]
-    movu   m1, [r2+r6-3*mmsize]
-    movu   m2, [r2+r6-2*mmsize]
-    movu   m3, [r2+r6-1*mmsize]
-%if %1
-    pshufb m0, m4
-    pshufb m1, m4
-    pshufb m2, m4
-    pshufb m3, m4
-%endif
-    movnta [r0+r6-4*mmsize], m0
-    movnta [r0+r6-3*mmsize], m1
-    movnta [r0+r6-2*mmsize], m2
-    movnta [r0+r6-1*mmsize], m3
-    add    r6, 4*mmsize
-    jle .loopx
-.skip:
-    PREFETCHNT_ITER r2+r6, 4*mmsize
-    sub    r6, 4*mmsize
-    jz .end
-.loop_end:
-    movu   m0, [r2+r6]
-%if %1
-    pshufb m0, m4
-%endif
-    movnta [r0+r6], m0
-    add    r6, mmsize
-    jl .loop_end
-.end:
-    add    r0, r1
-    add    r2, r3
-    dec   r5d
-    jg .loopy
-    sfence
-    RET
-%endmacro
-
-INIT_XMM sse
-PLANE_COPY_CORE 0
-INIT_XMM ssse3
-PLANE_COPY_CORE 1
-INIT_YMM avx
-PLANE_COPY_CORE 0
-INIT_YMM avx2
-PLANE_COPY_CORE 1
-
-%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
-%if HIGH_BIT_DEPTH
-%assign x 0
-%rep 16/mmsize
-    mov%4     m0, [%2+(x/2)*mmsize]
-    mov%4     m1, [%3+(x/2)*mmsize]
-    punpckhwd m2, m0, m1
-    punpcklwd m0, m1
-    mov%5a    [%1+(x+0)*mmsize], m0
-    mov%5a    [%1+(x+1)*mmsize], m2
-    %assign x (x+2)
-%endrep
-%else
-    movq   m0, [%2]
-%if mmsize==16
-%ifidn %4, a
-    punpcklbw m0, [%3]
-%else
-    movq   m1, [%3]
-    punpcklbw m0, m1
-%endif
-    mov%5a [%1], m0
-%else
-    movq   m1, [%3]
-    punpckhbw m2, m0, m1
-    punpcklbw m0, m1
-    mov%5a [%1+0], m0
-    mov%5a [%1+8], m2
-%endif
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
-%if HIGH_BIT_DEPTH
-%assign n 0
-%rep 16/mmsize
-    mova     m0, [%3+(n+0)*mmsize]
-    mova     m1, [%3+(n+1)*mmsize]
-    psrld    m2, m0, 16
-    psrld    m3, m1, 16
-    pand     m0, %5
-    pand     m1, %5
-    packssdw m0, m1
-    packssdw m2, m3
-    mov%6    [%1+(n/2)*mmsize], m0
-    mov%6    [%2+(n/2)*mmsize], m2
-    %assign n (n+2)
-%endrep
-%else ; !HIGH_BIT_DEPTH
-%if mmsize==16
-    mova   m0, [%3]
-%if cpuflag(ssse3)
-    pshufb m0, %5
-%else
-    mova   m1, m0
-    pand   m0, %5
-    psrlw  m1, 8
-    packuswb m0, m1
-%endif
-%if %4
-    mova   [%1], m0
-%else
-    movq   [%1], m0
-    movhps [%2], m0
-%endif
-%else
-    mova   m0, [%3]
-    mova   m1, [%3+8]
-    mova   m2, m0
-    mova   m3, m1
-    pand   m0, %5
-    pand   m1, %5
-    psrlw  m2, 8
-    psrlw  m3, 8
-    packuswb m0, m1
-    packuswb m2, m3
-    mova   [%1], m0
-    mova   [%2], m2
-%endif ; mmsize == 16
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro PLANE_INTERLEAVE 0
-;-----------------------------------------------------------------------------
-; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
-;                                  uint8_t *srcu, intptr_t i_srcu,
-;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
-;-----------------------------------------------------------------------------
-; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 6,9
-    mov   r6d, r6m
-%if HIGH_BIT_DEPTH
-    FIX_STRIDES r1, r3, r5, r6d
-    movifnidn r1mp, r1
-    movifnidn r3mp, r3
-    mov  r6m, r6d
-%endif
-    lea    r0, [r0+r6*2]
-    add    r2,  r6
-    add    r4,  r6
-%if ARCH_X86_64
-    DECLARE_REG_TMP 7,8
-%else
-    DECLARE_REG_TMP 1,3
-%endif
-    mov  t1, r1
-    shr  t1, SIZEOF_PIXEL
-    sub  t1, r6
-    mov  t0d, r7m
-.loopy:
-    mov    r6d, r6m
-    neg    r6
-.prefetch:
-    prefetchnta [r2+r6]
-    prefetchnta [r4+r6]
-    add    r6, 64
-    jl .prefetch
-    mov    r6d, r6m
-    neg    r6
-.loopx:
-    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
-    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
-    add    r6, 16*SIZEOF_PIXEL
-    jl .loopx
-.pad:
-%assign n 0
-%rep SIZEOF_PIXEL
-%if mmsize==8
-    movntq [r0+r6*2+(n+ 0)], m0
-    movntq [r0+r6*2+(n+ 8)], m0
-    movntq [r0+r6*2+(n+16)], m0
-    movntq [r0+r6*2+(n+24)], m0
-%else
-    movntdq [r0+r6*2+(n+ 0)], m0
-    movntdq [r0+r6*2+(n+16)], m0
-%endif
-    %assign n n+32
-%endrep
-    add    r6, 16*SIZEOF_PIXEL
-    cmp    r6, t1
-    jl .pad
-    add    r0, r1mp
-    add    r2, r3mp
-    add    r4, r5
-    dec    t0d
-    jg .loopy
-    sfence
-    emms
-    RET
-
-;-----------------------------------------------------------------------------
-; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
-;-----------------------------------------------------------------------------
-cglobal store_interleave_chroma, 5,5
-    FIX_STRIDES r1
-.loop:
-    INTERLEAVE r0+ 0, r2+           0, r3+           0, a
-    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
-    add    r2, FDEC_STRIDEB*2
-    add    r3, FDEC_STRIDEB*2
-    lea    r0, [r0+r1*2]
-    sub   r4d, 2
-    jg .loop
-    RET
-%endmacro ; PLANE_INTERLEAVE
-
-%macro DEINTERLEAVE_START 0
-%if HIGH_BIT_DEPTH
-    mova   m4, [pd_ffff]
-%elif cpuflag(ssse3)
-    mova   m4, [deinterleave_shuf]
-%else
-    mova   m4, [pw_00ff]
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro PLANE_DEINTERLEAVE 0
-;-----------------------------------------------------------------------------
-; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
-;                               pixel *dstv, intptr_t i_dstv,
-;                               pixel *src,  intptr_t i_src, int w, int h )
-;-----------------------------------------------------------------------------
-cglobal plane_copy_deinterleave, 6,7
-    DEINTERLEAVE_START
-    mov    r6d, r6m
-    FIX_STRIDES r1, r3, r5, r6d
-%if HIGH_BIT_DEPTH
-    mov    r6m, r6d
-%endif
-    add    r0,  r6
-    add    r2,  r6
-    lea    r4, [r4+r6*2]
-.loopy:
-    mov    r6d, r6m
-    neg    r6
-.loopx:
-    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
-    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
-    add    r6, 16*SIZEOF_PIXEL
-    jl .loopx
-    add    r0, r1
-    add    r2, r3
-    add    r4, r5
-    dec dword r7m
-    jg .loopy
-    RET
-
-;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
-;-----------------------------------------------------------------------------
-cglobal load_deinterleave_chroma_fenc, 4,4
-    DEINTERLEAVE_START
-    FIX_STRIDES r2
-.loop:
-    DEINTERLEAVE r0+           0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
-    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
-    add    r0, FENC_STRIDEB*2
-    lea    r1, [r1+r2*2]
-    sub   r3d, 2
-    jg .loop
-    RET
-
-;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
-;-----------------------------------------------------------------------------
-cglobal load_deinterleave_chroma_fdec, 4,4
-    DEINTERLEAVE_START
-    FIX_STRIDES r2
-.loop:
-    DEINTERLEAVE r0+           0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
-    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
-    add    r0, FDEC_STRIDEB*2
-    lea    r1, [r1+r2*2]
-    sub   r3d, 2
-    jg .loop
-    RET
-%endmacro ; PLANE_DEINTERLEAVE
-
-%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
-%if cpuflag(ssse3)
-    mova        m3, [deinterleave_rgb_shuf+(%1-3)*16]
-%endif
-%%loopy:
-    mov         %8, r6
-    mov         %9, %6
-%%loopx:
-    movu        m0, [%8]
-    movu        m1, [%8+%1*mmsize/4]
-%if cpuflag(ssse3)
-    pshufb      m0, m3        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
-    pshufb      m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
-%elif %1 == 3
-    psrldq      m2, m0, 6
-    punpcklqdq  m0, m1        ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
-    psrldq      m1, 6
-    punpcklqdq  m2, m1        ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
-    psrlq       m3, m0, 24
-    psrlq       m4, m2, 24
-    punpckhbw   m1, m0, m3    ; b4 b5 g4 g5 r4 r5
-    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
-    punpckhbw   m3, m2, m4    ; b6 b7 g6 g7 r6 r7
-    punpcklbw   m2, m4        ; b2 b3 g2 g3 r2 r3
-    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
-    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
-%else
-    pshufd      m3, m0, q2301
-    pshufd      m4, m1, q2301
-    punpckhbw   m2, m0, m3    ; b2 b3 g2 g3 r2 r3
-    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
-    punpckhbw   m3, m1, m4    ; b6 b7 g6 g7 r6 r7
-    punpcklbw   m1, m4        ; b4 b5 g4 g5 r4 r5
-    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
-    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
-%endif
-    punpckldq   m2, m0, m1    ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
-    punpckhdq   m0, m1        ; r0 r1 r2 r3 r4 r5 r6 r7
-    movh   [r0+%9], m2
-    movhps [r2+%9], m2
-    movh   [r4+%9], m0
-    add         %8, %1*mmsize/2
-    add         %9, mmsize/2
-    jl %%loopx
-    add         r0, %2
-    add         r2, %3
-    add         r4, %4
-    add         r6, %5
-    dec        %7d
-    jg %%loopy
-%endmacro
-
-%macro PLANE_DEINTERLEAVE_RGB 0
-;-----------------------------------------------------------------------------
-; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
-;                                        pixel *dstb, intptr_t i_dstb,
-;                                        pixel *dstc, intptr_t i_dstc,
-;                                        pixel *src,  intptr_t i_src, int pw, int w, int h )
-;-----------------------------------------------------------------------------
-%if ARCH_X86_64
-cglobal plane_copy_deinterleave_rgb, 8,12
-    %define %%args r1, r3, r5, r7, r8, r9, r10, r11
-    mov        r8d, r9m
-    mov        r9d, r10m
-    add         r0, r8
-    add         r2, r8
-    add         r4, r8
-    neg         r8
-%else
-cglobal plane_copy_deinterleave_rgb, 1,7
-    %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
-    mov         r1, r9m
-    mov         r2, r2m
-    mov         r4, r4m
-    mov         r6, r6m
-    add         r0, r1
-    add         r2, r1
-    add         r4, r1
-    neg         r1
-    mov        r9m, r1
-    mov         r1, r10m
-%endif
-    cmp  dword r8m, 4
-    je .pw4
-    PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
-    jmp .ret
-.pw4:
-    PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA
-.ret:
-    REP_RET
-%endmacro
-
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM sse2
-PLANE_DEINTERLEAVE_RGB
-INIT_XMM ssse3
-PLANE_DEINTERLEAVE_RGB
-%endif ; !HIGH_BIT_DEPTH
-
-%macro PLANE_DEINTERLEAVE_V210 0
-;-----------------------------------------------------------------------------
-; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
-;                                         uint16_t *dstc, intptr_t i_dstc,
-;                                         uint32_t *src, intptr_t i_src, int w, int h )
-;-----------------------------------------------------------------------------
-%if ARCH_X86_64
-cglobal plane_copy_deinterleave_v210, 8,10,7
-%define src   r8
-%define org_w r9
-%define h     r7d
-%else
-cglobal plane_copy_deinterleave_v210, 7,7,7
-%define src   r4m
-%define org_w r6m
-%define h     dword r7m
-%endif
-    FIX_STRIDES r1, r3, r6d
-    shl    r5, 2
-    add    r0, r6
-    add    r2, r6
-    neg    r6
-    mov   src, r4
-    mov org_w, r6
-    mova   m2, [v210_mask]
-    mova   m3, [v210_luma_shuf]
-    mova   m4, [v210_chroma_shuf]
-    mova   m5, [v210_mult] ; also functions as vpermd index for avx2
-    pshufd m6, m5, q1102
-
-ALIGN 16
-.loop:
-    movu   m1, [r4]
-    pandn  m0, m2, m1
-    pand   m1, m2
-    pshufb m0, m3
-    pshufb m1, m4
-    pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
-    pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
-%if mmsize == 32
-    vpermd m0, m5, m0
-    vpermd m1, m5, m1
-%endif
-    movu [r0+r6], m0
-    movu [r2+r6], m1
-    add    r4, mmsize
-    add    r6, 3*mmsize/4
-    jl .loop
-    add    r0, r1
-    add    r2, r3
-    add   src, r5
-    mov    r4, src
-    mov    r6, org_w
-    dec     h
-    jg .loop
-    RET
-%endmacro ; PLANE_DEINTERLEAVE_V210
-
-%if HIGH_BIT_DEPTH
-INIT_MMX mmx2
-PLANE_INTERLEAVE
-INIT_MMX mmx
-PLANE_DEINTERLEAVE
-INIT_XMM sse2
-PLANE_INTERLEAVE
-PLANE_DEINTERLEAVE
-INIT_XMM ssse3
-PLANE_DEINTERLEAVE_V210
-INIT_XMM avx
-PLANE_INTERLEAVE
-PLANE_DEINTERLEAVE
-PLANE_DEINTERLEAVE_V210
-INIT_YMM avx2
-PLANE_DEINTERLEAVE_V210
-%else
-INIT_MMX mmx2
-PLANE_INTERLEAVE
-INIT_MMX mmx
-PLANE_DEINTERLEAVE
-INIT_XMM sse2
-PLANE_INTERLEAVE
-PLANE_DEINTERLEAVE
-INIT_XMM ssse3
-PLANE_DEINTERLEAVE
-%endif
-
-; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size.
-; memzero SSE will fail for non-mod128.
-
-;-----------------------------------------------------------------------------
-; void *memcpy_aligned( void *dst, const void *src, size_t n );
-;-----------------------------------------------------------------------------
-%macro MEMCPY 0
-cglobal memcpy_aligned, 3,3
-%if mmsize == 16
-    test r2d, 16
-    jz .copy2
-    mova  m0, [r1+r2-16]
-    mova [r0+r2-16], m0
-    sub  r2d, 16
-.copy2:
-%endif
-    test r2d, 2*mmsize
-    jz .copy4start
-    mova  m0, [r1+r2-1*mmsize]
-    mova  m1, [r1+r2-2*mmsize]
-    mova [r0+r2-1*mmsize], m0
-    mova [r0+r2-2*mmsize], m1
-    sub  r2d, 2*mmsize
-.copy4start:
-    test r2d, r2d
-    jz .ret
-.copy4:
-    mova  m0, [r1+r2-1*mmsize]
-    mova  m1, [r1+r2-2*mmsize]
-    mova  m2, [r1+r2-3*mmsize]
-    mova  m3, [r1+r2-4*mmsize]
-    mova [r0+r2-1*mmsize], m0
-    mova [r0+r2-2*mmsize], m1
-    mova [r0+r2-3*mmsize], m2
-    mova [r0+r2-4*mmsize], m3
-    sub  r2d, 4*mmsize
-    jg .copy4
-.ret:
-    REP_RET
-%endmacro
-
-INIT_MMX mmx
-MEMCPY
-INIT_XMM sse
-MEMCPY
-
-;-----------------------------------------------------------------------------
-; void *memzero_aligned( void *dst, size_t n );
-;-----------------------------------------------------------------------------
-%macro MEMZERO 1
-cglobal memzero_aligned, 2,2
-    add  r0, r1
-    neg  r1
-%if mmsize == 8
-    pxor m0, m0
-%else
-    xorps m0, m0
-%endif
-.loop:
-%assign i 0
-%rep %1
-    mova [r0 + r1 + i], m0
-%assign i i+mmsize
-%endrep
-    add r1, mmsize*%1
-    jl .loop
-    RET
-%endmacro
-
-INIT_MMX mmx
-MEMZERO 8
-INIT_XMM sse
-MEMZERO 8
-INIT_YMM avx
-MEMZERO 4
-
-%if HIGH_BIT_DEPTH == 0
-;-----------------------------------------------------------------------------
-; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
-;-----------------------------------------------------------------------------
-%macro INTEGRAL_INIT4H 0
-cglobal integral_init4h, 3,4
-    lea     r3, [r0+r2*2]
-    add     r1, r2
-    neg     r2
-    pxor    m4, m4
-.loop:
-    mova   xm0, [r1+r2]
-    mova   xm1, [r1+r2+16]
-%if mmsize==32
-    vinserti128 m0, m0, [r1+r2+ 8], 1
-    vinserti128 m1, m1, [r1+r2+24], 1
-%else
-    palignr m1, m0, 8
-%endif
-    mpsadbw m0, m4, 0
-    mpsadbw m1, m4, 0
-    paddw   m0, [r0+r2*2]
-    paddw   m1, [r0+r2*2+mmsize]
-    mova  [r3+r2*2   ], m0
-    mova  [r3+r2*2+mmsize], m1
-    add     r2, mmsize
-    jl .loop
-    RET
-%endmacro
-
-INIT_XMM sse4
-INTEGRAL_INIT4H
-INIT_YMM avx2
-INTEGRAL_INIT4H
-
-%macro INTEGRAL_INIT8H 0
-cglobal integral_init8h, 3,4
-    lea     r3, [r0+r2*2]
-    add     r1, r2
-    neg     r2
-    pxor    m4, m4
-.loop:
-    mova   xm0, [r1+r2]
-    mova   xm1, [r1+r2+16]
-%if mmsize==32
-    vinserti128 m0, m0, [r1+r2+ 8], 1
-    vinserti128 m1, m1, [r1+r2+24], 1
-    mpsadbw m2, m0, m4, 100100b
-    mpsadbw m3, m1, m4, 100100b
-%else
-    palignr m1, m0, 8
-    mpsadbw m2, m0, m4, 100b
-    mpsadbw m3, m1, m4, 100b
-%endif
-    mpsadbw m0, m4, 0
-    mpsadbw m1, m4, 0
-    paddw   m0, [r0+r2*2]
-    paddw   m1, [r0+r2*2+mmsize]
-    paddw   m0, m2
-    paddw   m1, m3
-    mova  [r3+r2*2   ], m0
-    mova  [r3+r2*2+mmsize], m1
-    add     r2, mmsize
-    jl .loop
-    RET
-%endmacro
-
-INIT_XMM sse4
-INTEGRAL_INIT8H
-INIT_XMM avx
-INTEGRAL_INIT8H
-INIT_YMM avx2
-INTEGRAL_INIT8H
-%endif ; !HIGH_BIT_DEPTH
-
-%macro INTEGRAL_INIT_8V 0
-;-----------------------------------------------------------------------------
-; void integral_init8v( uint16_t *sum8, intptr_t stride )
-;-----------------------------------------------------------------------------
-cglobal integral_init8v, 3,3
-    add   r1, r1
-    add   r0, r1
-    lea   r2, [r0+r1*8]
-    neg   r1
-.loop:
-    mova  m0, [r2+r1]
-    mova  m1, [r2+r1+mmsize]
-    psubw m0, [r0+r1]
-    psubw m1, [r0+r1+mmsize]
-    mova  [r0+r1], m0
-    mova  [r0+r1+mmsize], m1
-    add   r1, 2*mmsize
-    jl .loop
-    RET
-%endmacro
-
-INIT_MMX mmx
-INTEGRAL_INIT_8V
-INIT_XMM sse2
-INTEGRAL_INIT_8V
-INIT_YMM avx2
-INTEGRAL_INIT_8V
-
-;-----------------------------------------------------------------------------
-; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx
-cglobal integral_init4v, 3,5
-    shl   r2, 1
-    lea   r3, [r0+r2*4]
-    lea   r4, [r0+r2*8]
-    mova  m0, [r0+r2]
-    mova  m4, [r4+r2]
-.loop:
-    mova  m1, m4
-    psubw m1, m0
-    mova  m4, [r4+r2-8]
-    mova  m0, [r0+r2-8]
-    paddw m1, m4
-    mova  m3, [r3+r2-8]
-    psubw m1, m0
-    psubw m3, m0
-    mova  [r0+r2-8], m1
-    mova  [r1+r2-8], m3
-    sub   r2, 8
-    jge .loop
-    RET
-
-INIT_XMM sse2
-cglobal integral_init4v, 3,5
-    shl     r2, 1
-    add     r0, r2
-    add     r1, r2
-    lea     r3, [r0+r2*4]
-    lea     r4, [r0+r2*8]
-    neg     r2
-.loop:
-    mova    m0, [r0+r2]
-    mova    m1, [r4+r2]
-    mova    m2, m0
-    mova    m4, m1
-    shufpd  m0, [r0+r2+16], 1
-    shufpd  m1, [r4+r2+16], 1
-    paddw   m0, m2
-    paddw   m1, m4
-    mova    m3, [r3+r2]
-    psubw   m1, m0
-    psubw   m3, m2
-    mova  [r0+r2], m1
-    mova  [r1+r2], m3
-    add     r2, 16
-    jl .loop
-    RET
-
-INIT_XMM ssse3
-cglobal integral_init4v, 3,5
-    shl     r2, 1
-    add     r0, r2
-    add     r1, r2
-    lea     r3, [r0+r2*4]
-    lea     r4, [r0+r2*8]
-    neg     r2
-.loop:
-    mova    m2, [r0+r2]
-    mova    m0, [r0+r2+16]
-    mova    m4, [r4+r2]
-    mova    m1, [r4+r2+16]
-    palignr m0, m2, 8
-    palignr m1, m4, 8
-    paddw   m0, m2
-    paddw   m1, m4
-    mova    m3, [r3+r2]
-    psubw   m1, m0
-    psubw   m3, m2
-    mova  [r0+r2], m1
-    mova  [r1+r2], m3
-    add     r2, 16
-    jl .loop
-    RET
-
-INIT_YMM avx2
-cglobal integral_init4v, 3,5
-    add     r2, r2
-    add     r0, r2
-    add     r1, r2
-    lea     r3, [r0+r2*4]
-    lea     r4, [r0+r2*8]
-    neg     r2
-.loop:
-    mova    m2, [r0+r2]
-    movu    m1, [r4+r2+8]
-    paddw   m0, m2, [r0+r2+8]
-    paddw   m1, [r4+r2]
-    mova    m3, [r3+r2]
-    psubw   m1, m0
-    psubw   m3, m2
-    mova  [r0+r2], m1
-    mova  [r1+r2], m3
-    add     r2, 32
-    jl .loop
-    RET
-
-%macro FILT8x4 7
-    mova      %3, [r0+%7]
-    mova      %4, [r0+r5+%7]
-    pavgb     %3, %4
-    pavgb     %4, [r0+r5*2+%7]
-    PALIGNR   %1, %3, 1, m6
-    PALIGNR   %2, %4, 1, m6
-%if cpuflag(xop)
-    pavgb     %1, %3
-    pavgb     %2, %4
-%else
-    pavgb     %1, %3
-    pavgb     %2, %4
-    psrlw     %5, %1, 8
-    psrlw     %6, %2, 8
-    pand      %1, m7
-    pand      %2, m7
-%endif
-%endmacro
-
-%macro FILT32x4U 4
-    mova      m1, [r0+r5]
-    pavgb     m0, m1, [r0]
-    movu      m3, [r0+r5+1]
-    pavgb     m2, m3, [r0+1]
-    pavgb     m1, [r0+r5*2]
-    pavgb     m3, [r0+r5*2+1]
-    pavgb     m0, m2
-    pavgb     m1, m3
-
-    mova      m3, [r0+r5+mmsize]
-    pavgb     m2, m3, [r0+mmsize]
-    movu      m5, [r0+r5+1+mmsize]
-    pavgb     m4, m5, [r0+1+mmsize]
-    pavgb     m3, [r0+r5*2+mmsize]
-    pavgb     m5, [r0+r5*2+1+mmsize]
-    pavgb     m2, m4
-    pavgb     m3, m5
-
-    pshufb    m0, m7
-    pshufb    m1, m7
-    pshufb    m2, m7
-    pshufb    m3, m7
-    punpckhqdq m4, m0, m2
-    punpcklqdq m0, m0, m2
-    punpckhqdq m5, m1, m3
-    punpcklqdq m2, m1, m3
-    vpermq    m0, m0, q3120
-    vpermq    m1, m4, q3120
-    vpermq    m2, m2, q3120
-    vpermq    m3, m5, q3120
-    mova    [%1], m0
-    mova    [%2], m1
-    mova    [%3], m2
-    mova    [%4], m3
-%endmacro
-
-%macro FILT16x2 4
-    mova      m3, [r0+%4+mmsize]
-    mova      m2, [r0+%4]
-    pavgb     m3, [r0+%4+r5+mmsize]
-    pavgb     m2, [r0+%4+r5]
-    PALIGNR   %1, m3, 1, m6
-    pavgb     %1, m3
-    PALIGNR   m3, m2, 1, m6
-    pavgb     m3, m2
-%if cpuflag(xop)
-    vpperm    m5, m3, %1, m7
-    vpperm    m3, m3, %1, m6
-%else
-    psrlw     m5, m3, 8
-    psrlw     m4, %1, 8
-    pand      m3, m7
-    pand      %1, m7
-    packuswb  m3, %1
-    packuswb  m5, m4
-%endif
-    mova    [%2], m3
-    mova    [%3], m5
-    mova      %1, m2
-%endmacro
-
-%macro FILT8x2U 3
-    mova      m3, [r0+%3+8]
-    mova      m2, [r0+%3]
-    pavgb     m3, [r0+%3+r5+8]
-    pavgb     m2, [r0+%3+r5]
-    mova      m1, [r0+%3+9]
-    mova      m0, [r0+%3+1]
-    pavgb     m1, [r0+%3+r5+9]
-    pavgb     m0, [r0+%3+r5+1]
-    pavgb     m1, m3
-    pavgb     m0, m2
-    psrlw     m3, m1, 8
-    psrlw     m2, m0, 8
-    pand      m1, m7
-    pand      m0, m7
-    packuswb  m0, m1
-    packuswb  m2, m3
-    mova    [%1], m0
-    mova    [%2], m2
-%endmacro
-
-%macro FILT8xU 3
-    mova      m3, [r0+%3+8]
-    mova      m2, [r0+%3]
-    pavgw     m3, [r0+%3+r5+8]
-    pavgw     m2, [r0+%3+r5]
-    movu      m1, [r0+%3+10]
-    movu      m0, [r0+%3+2]
-    pavgw     m1, [r0+%3+r5+10]
-    pavgw     m0, [r0+%3+r5+2]
-    pavgw     m1, m3
-    pavgw     m0, m2
-    psrld     m3, m1, 16
-    psrld     m2, m0, 16
-    pand      m1, m7
-    pand      m0, m7
-    packssdw  m0, m1
-    packssdw  m2, m3
-    movu    [%1], m0
-    mova    [%2], m2
-%endmacro
-
-%macro FILT8xA 4
-    mova      m3, [r0+%4+mmsize]
-    mova      m2, [r0+%4]
-    pavgw     m3, [r0+%4+r5+mmsize]
-    pavgw     m2, [r0+%4+r5]
-    PALIGNR   %1, m3, 2, m6
-    pavgw     %1, m3
-    PALIGNR   m3, m2, 2, m6
-    pavgw     m3, m2
-%if cpuflag(xop)
-    vpperm    m5, m3, %1, m7
-    vpperm    m3, m3, %1, m6
-%else
-    psrld     m5, m3, 16
-    psrld     m4, %1, 16
-    pand      m3, m7
-    pand      %1, m7
-    packssdw  m3, %1
-    packssdw  m5, m4
-%endif
-    mova    [%2], m3
-    mova    [%3], m5
-    mova      %1, m2
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
-;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 0
-cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
-%if HIGH_BIT_DEPTH
-    shl   dword r6m, 1
-    FIX_STRIDES r5
-    shl   dword r7m, 1
-%endif
-%if mmsize >= 16
-    add   dword r7m, mmsize-1
-    and   dword r7m, ~(mmsize-1)
-%endif
-    ; src += 2*(height-1)*stride + 2*width
-    mov      r6d, r8m
-    dec      r6d
-    imul     r6d, r5d
-    add      r6d, r7m
-    lea       r0, [r0+r6*2]
-    ; dst += (height-1)*stride + width
-    mov      r6d, r8m
-    dec      r6d
-    imul     r6d, r6m
-    add      r6d, r7m
-    add       r1, r6
-    add       r2, r6
-    add       r3, r6
-    add       r4, r6
-    ; gap = stride - width
-    mov      r6d, r6m
-    sub      r6d, r7m
-    PUSH      r6
-    %define dst_gap [rsp+gprsize]
-    mov      r6d, r5d
-    sub      r6d, r7m
-    shl      r6d, 1
-    PUSH      r6
-    %define src_gap [rsp]
-%if HIGH_BIT_DEPTH
-%if cpuflag(xop)
-    mova      m6, [deinterleave_shuf32a]
-    mova      m7, [deinterleave_shuf32b]
-%else
-    pcmpeqw   m7, m7
-    psrld     m7, 16
-%endif
-.vloop:
-    mov      r6d, r7m
-%ifnidn cpuname, mmx2
-    mova      m0, [r0]
-    mova      m1, [r0+r5]
-    pavgw     m0, m1
-    pavgw     m1, [r0+r5*2]
-%endif
-.hloop:
-    sub       r0, mmsize*2
-    sub       r1, mmsize
-    sub       r2, mmsize
-    sub       r3, mmsize
-    sub       r4, mmsize
-%ifidn cpuname, mmx2
-    FILT8xU r1, r2, 0
-    FILT8xU r3, r4, r5
-%else
-    FILT8xA m0, r1, r2, 0
-    FILT8xA m1, r3, r4, r5
-%endif
-    sub      r6d, mmsize
-    jg .hloop
-%else ; !HIGH_BIT_DEPTH
-%if cpuflag(avx2)
-    mova      m7, [deinterleave_shuf]
-%elif cpuflag(xop)
-    mova      m6, [deinterleave_shuf32a]
-    mova      m7, [deinterleave_shuf32b]
-%else
-    pcmpeqb   m7, m7
-    psrlw     m7, 8
-%endif
-.vloop:
-    mov      r6d, r7m
-%ifnidn cpuname, mmx2
-%if mmsize <= 16
-    mova      m0, [r0]
-    mova      m1, [r0+r5]
-    pavgb     m0, m1
-    pavgb     m1, [r0+r5*2]
-%endif
-%endif
-.hloop:
-    sub       r0, mmsize*2
-    sub       r1, mmsize
-    sub       r2, mmsize
-    sub       r3, mmsize
-    sub       r4, mmsize
-%if mmsize==32
-    FILT32x4U r1, r2, r3, r4
-%elifdef m8
-    FILT8x4   m0, m1, m2, m3, m10, m11, mmsize
-    mova      m8, m0
-    mova      m9, m1
-    FILT8x4   m2, m3, m0, m1, m4, m5, 0
-%if cpuflag(xop)
-    vpperm    m4, m2, m8, m7
-    vpperm    m2, m2, m8, m6
-    vpperm    m5, m3, m9, m7
-    vpperm    m3, m3, m9, m6
-%else
-    packuswb  m2, m8
-    packuswb  m3, m9
-    packuswb  m4, m10
-    packuswb  m5, m11
-%endif
-    mova    [r1], m2
-    mova    [r2], m4
-    mova    [r3], m3
-    mova    [r4], m5
-%elifidn cpuname, mmx2
-    FILT8x2U  r1, r2, 0
-    FILT8x2U  r3, r4, r5
-%else
-    FILT16x2  m0, r1, r2, 0
-    FILT16x2  m1, r3, r4, r5
-%endif
-    sub      r6d, mmsize
-    jg .hloop
-%endif ; HIGH_BIT_DEPTH
-.skip:
-    mov       r6, dst_gap
-    sub       r0, src_gap
-    sub       r1, r6
-    sub       r2, r6
-    sub       r3, r6
-    sub       r4, r6
-    dec    dword r8m
-    jg .vloop
-    ADD      rsp, 2*gprsize
-    emms
-    RET
-%endmacro ; FRAME_INIT_LOWRES
-
-INIT_MMX mmx2
-FRAME_INIT_LOWRES
-%if ARCH_X86_64 == 0
-INIT_MMX cache32, mmx2
-FRAME_INIT_LOWRES
-%endif
-INIT_XMM sse2
-FRAME_INIT_LOWRES
-INIT_XMM ssse3
-FRAME_INIT_LOWRES
-INIT_XMM avx
-FRAME_INIT_LOWRES
-INIT_XMM xop
-FRAME_INIT_LOWRES
-%if HIGH_BIT_DEPTH==0
-INIT_YMM avx2
-FRAME_INIT_LOWRES
-%endif
-
-;-----------------------------------------------------------------------------
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
-;-----------------------------------------------------------------------------
-%macro MBTREE 0
-cglobal mbtree_propagate_cost, 6,6,7
-    movss     m6, [r5]
-    mov      r5d, r6m
-    lea       r0, [r0+r5*2]
-    add      r5d, r5d
-    add       r1, r5
-    add       r2, r5
-    add       r3, r5
-    add       r4, r5
-    neg       r5
-    pxor      m4, m4
-    shufps    m6, m6, 0
-    mova      m5, [pw_3fff]
-.loop:
-    movq      m2, [r2+r5] ; intra
-    movq      m0, [r4+r5] ; invq
-    movq      m3, [r3+r5] ; inter
-    movq      m1, [r1+r5] ; prop
-    pand      m3, m5
-    pminsw    m3, m2
-    punpcklwd m2, m4
-    punpcklwd m0, m4
-    pmaddwd   m0, m2
-    punpcklwd m1, m4
-    punpcklwd m3, m4
-%if cpuflag(fma4)
-    cvtdq2ps  m0, m0
-    cvtdq2ps  m1, m1
-    fmaddps   m0, m0, m6, m1
-    cvtdq2ps  m1, m2
-    psubd     m2, m3
-    cvtdq2ps  m2, m2
-    rcpps     m3, m1
-    mulps     m1, m3
-    mulps     m0, m2
-    addps     m2, m3, m3
-    fnmaddps  m3, m1, m3, m2
-    mulps     m0, m3
-%else
-    cvtdq2ps  m0, m0
-    mulps     m0, m6    ; intra*invq*fps_factor>>8
-    cvtdq2ps  m1, m1    ; prop
-    addps     m0, m1    ; prop + (intra*invq*fps_factor>>8)
-    cvtdq2ps  m1, m2    ; intra
-    psubd     m2, m3    ; intra - inter
-    cvtdq2ps  m2, m2    ; intra - inter
-    rcpps     m3, m1    ; 1 / intra 1st approximation
-    mulps     m1, m3    ; intra * (1/intra 1st approx)
-    mulps     m1, m3    ; intra * (1/intra 1st approx)^2
-    mulps     m0, m2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
-    addps     m3, m3    ; 2 * (1/intra 1st approx)
-    subps     m3, m1    ; 2nd approximation for 1/intra
-    mulps     m0, m3    ; / intra
-%endif
-    cvtps2dq  m0, m0
-    packssdw  m0, m0
-    movh [r0+r5], m0
-    add       r5, 8
-    jl .loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-MBTREE
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
-INIT_XMM fma4
-MBTREE
-
-%macro INT16_UNPACK 1
-    punpckhwd   xm6, xm%1, xm7
-    punpcklwd  xm%1, xm7
-    vinsertf128 m%1, m%1, xm6, 1
-%endmacro
-
-; FIXME: align loads to 16 bytes
-%macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
-    vbroadcastss m5, [r5]
-    mov         r5d, r6m
-    lea          r0, [r0+r5*2]
-    add         r5d, r5d
-    add          r1, r5
-    add          r2, r5
-    add          r3, r5
-    add          r4, r5
-    neg          r5
-    mova        xm4, [pw_3fff]
-%if notcpuflag(avx2)
-    pxor        xm7, xm7
-%endif
-.loop:
-%if cpuflag(avx2)
-    pmovzxwd     m0, [r2+r5]      ; intra
-    pmovzxwd     m1, [r4+r5]      ; invq
-    pmovzxwd     m2, [r1+r5]      ; prop
-    pand        xm3, xm4, [r3+r5] ; inter
-    pmovzxwd     m3, xm3
-    pminsd       m3, m0
-    pmaddwd      m1, m0
-    psubd        m3, m0, m3
-    cvtdq2ps     m0, m0
-    cvtdq2ps     m1, m1
-    cvtdq2ps     m2, m2
-    cvtdq2ps     m3, m3
-    fmaddps      m1, m1, m5, m2
-    rcpps        m2, m0
-    mulps        m0, m2
-    mulps        m1, m3
-    addps        m3, m2, m2
-    fnmaddps     m2, m2, m0, m3
-    mulps        m1, m2
-%else
-    movu        xm0, [r2+r5]
-    movu        xm1, [r4+r5]
-    movu        xm2, [r1+r5]
-    pand        xm3, xm4, [r3+r5]
-    pminsw      xm3, xm0
-    INT16_UNPACK 0
-    INT16_UNPACK 1
-    INT16_UNPACK 2
-    INT16_UNPACK 3
-    cvtdq2ps     m0, m0
-    cvtdq2ps     m1, m1
-    cvtdq2ps     m2, m2
-    cvtdq2ps     m3, m3
-    mulps        m1, m0
-    subps        m3, m0, m3
-    mulps        m1, m5         ; intra*invq*fps_factor>>8
-    addps        m1, m2         ; prop + (intra*invq*fps_factor>>8)
-    rcpps        m2, m0         ; 1 / intra 1st approximation
-    mulps        m0, m2         ; intra * (1/intra 1st approx)
-    mulps        m0, m2         ; intra * (1/intra 1st approx)^2
-    mulps        m1, m3         ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
-    addps        m2, m2         ; 2 * (1/intra 1st approx)
-    subps        m2, m0         ; 2nd approximation for 1/intra
-    mulps        m1, m2         ; / intra
-%endif
-    vcvtps2dq    m1, m1
-    vextractf128 xm2, m1, 1
-    packssdw    xm1, xm2
-    mova    [r0+r5], xm1
-    add          r5, 16
-    jl .loop
-    RET
-%endmacro
-
-INIT_YMM avx
-MBTREE_AVX
-INIT_YMM avx2
-MBTREE_AVX
-
-%macro MBTREE_PROPAGATE_LIST 0
-;-----------------------------------------------------------------------------
-; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
-;                                      int16_t *output, int bipred_weight, int mb_y, int len )
-;-----------------------------------------------------------------------------
-cglobal mbtree_propagate_list_internal, 4,6,8
-    movh     m6, [pw_0to15] ; mb_x
-    movd     m7, r5m
-    pshuflw  m7, m7, 0
-    punpcklwd m6, m7       ; 0 y 1 y 2 y 3 y
-    movd     m7, r4m
-    SPLATW   m7, m7        ; bipred_weight
-    psllw    m7, 9         ; bipred_weight << 9
-
-    mov     r5d, r6m
-    xor     r4d, r4d
-.loop:
-    mova     m3, [r1+r4*2]
-    movu     m4, [r2+r4*2]
-    mova     m5, [pw_0xc000]
-    pand     m4, m5
-    pcmpeqw  m4, m5
-    pmulhrsw m5, m3, m7    ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
-%if cpuflag(avx)
-    pblendvb m5, m3, m5, m4
-%else
-    pand     m5, m4
-    pandn    m4, m3
-    por      m5, m4        ; if( lists_used == 3 )
-                           ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
-%endif
-
-    movu     m0, [r0+r4*4] ; x,y
-    movu     m1, [r0+r4*4+mmsize]
-
-    psraw    m2, m0, 5
-    psraw    m3, m1, 5
-    mova     m4, [pd_4]
-    paddw    m2, m6        ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
-    paddw    m6, m4        ; {mbx, mby} += {4, 0}
-    paddw    m3, m6        ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
-    paddw    m6, m4        ; {mbx, mby} += {4, 0}
-
-    mova [r3+mmsize*0], m2
-    mova [r3+mmsize*1], m3
-
-    mova     m3, [pw_31]
-    pand     m0, m3        ; x &= 31
-    pand     m1, m3        ; y &= 31
-    packuswb m0, m1
-    psrlw    m1, m0, 3
-    pand     m0, m3        ; x
-    SWAP      1, 3
-    pandn    m1, m3        ; y premultiplied by (1<<5) for later use of pmulhrsw
-
-    mova     m3, [pw_32]
-    psubw    m3, m0        ; 32 - x
-    mova     m4, [pw_1024]
-    psubw    m4, m1        ; (32 - y) << 5
-
-    pmullw   m2, m3, m4    ; idx0weight = (32-y)*(32-x) << 5
-    pmullw   m4, m0        ; idx1weight = (32-y)*x << 5
-    pmullw   m0, m1        ; idx3weight = y*x << 5
-    pmullw   m1, m3        ; idx2weight = y*(32-x) << 5
-
-    ; avoid overflow in the input to pmulhrsw
-    psrlw    m3, m2, 15
-    psubw    m2, m3        ; idx0weight -= (idx0weight == 32768)
-
-    pmulhrsw m2, m5        ; idx0weight * propagate_amount + 512 >> 10
-    pmulhrsw m4, m5        ; idx1weight * propagate_amount + 512 >> 10
-    pmulhrsw m1, m5        ; idx2weight * propagate_amount + 512 >> 10
-    pmulhrsw m0, m5        ; idx3weight * propagate_amount + 512 >> 10
-
-    SBUTTERFLY wd, 2, 4, 3
-    SBUTTERFLY wd, 1, 0, 3
-    mova [r3+mmsize*2], m2
-    mova [r3+mmsize*3], m4
-    mova [r3+mmsize*4], m1
-    mova [r3+mmsize*5], m0
-    add     r4d, mmsize/2
-    add      r3, mmsize*6
-    cmp     r4d, r5d
-    jl .loop
-    REP_RET
-%endmacro
-
-INIT_XMM ssse3
-MBTREE_PROPAGATE_LIST
-INIT_XMM avx
-MBTREE_PROPAGATE_LIST
-
-INIT_YMM avx2
-cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
-    mova          xm4, [pw_0xc000]
-%if UNIX64
-    shl           r4d, 9
-    shl           r5d, 16
-    movd          xm5, r4d
-    movd          xm6, r5d
-    vpbroadcastw  xm5, xm5
-    vpbroadcastd   m6, xm6
-%else
-    vpbroadcastw  xm5, r4m
-    vpbroadcastd   m6, r5m
-    psllw         xm5, 9             ; bipred_weight << 9
-    pslld          m6, 16
-%endif
-    mov           r4d, r6m
-    lea            r1, [r1+r4*2]
-    lea            r2, [r2+r4*2]
-    lea            r0, [r0+r4*4]
-    neg            r4
-    por            m6, [pd_0123]     ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
-    vbroadcasti128 m7, [pw_31]
-.loop:
-    mova          xm3, [r1+r4*2]
-    pand          xm0, xm4, [r2+r4*2]
-    pmulhrsw      xm1, xm3, xm5      ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6
-    pcmpeqw       xm0, xm4
-    pblendvb      xm3, xm3, xm1, xm0 ; (lists_used == 3) ? bipred_amount : propagate_amount
-    vpermq         m3, m3, q1100
-
-    movu           m0, [r0+r4*4]     ; {x, y}
-    vbroadcasti128 m1, [pd_8]
-    psraw          m2, m0, 5
-    paddw          m2, m6            ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
-    paddw          m6, m1            ; i_mb_x += 8
-    mova         [r3], m2
-
-    mova           m1, [pw_32]
-    pand           m0, m7
-    psubw          m1, m0
-    packuswb       m1, m0            ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y}
-    psrlw          m0, m1, 3
-    pand           m1, [pw_00ff]     ; 32-x x 32-x x
-    pandn          m0, m7, m0        ; (32-y y 32-y y) << 5
-    pshufd         m2, m1, q1032
-    pmullw         m1, m0            ; idx0 idx3 idx0 idx3
-    pmullw         m2, m0            ; idx1 idx2 idx1 idx2
-
-    pmulhrsw       m0, m1, m3        ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10
-    pmulhrsw       m2, m3            ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10
-    psignw         m0, m1            ; correct potential overflow in the idx0 input to pmulhrsw
-    punpcklwd      m1, m0, m2        ; idx01weight
-    punpckhwd      m2, m0            ; idx23weight
-    mova      [r3+32], m1
-    mova      [r3+64], m2
-    add            r3, 3*mmsize
-    add            r4, 8
-    jl .loop
-    RET
-
-%macro MBTREE_FIX8 0
-;-----------------------------------------------------------------------------
-; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
-;-----------------------------------------------------------------------------
-cglobal mbtree_fix8_pack, 3,4
-%if mmsize == 32
-    vbroadcastf128 m2, [pf_256]
-    vbroadcasti128 m3, [mbtree_fix8_pack_shuf]
-%else
-    movaps       m2, [pf_256]
-    mova         m3, [mbtree_fix8_pack_shuf]
-%endif
-    sub         r2d, mmsize/2
-    movsxdifnidn r2, r2d
-    lea          r1, [r1+4*r2]
-    lea          r0, [r0+2*r2]
-    neg          r2
-    jg .skip_loop
-.loop:
-    mulps        m0, m2, [r1+4*r2]
-    mulps        m1, m2, [r1+4*r2+mmsize]
-    cvttps2dq    m0, m0
-    cvttps2dq    m1, m1
-    packssdw     m0, m1
-    pshufb       m0, m3
-%if mmsize == 32
-    vpermq       m0, m0, q3120
-%endif
-    mova  [r0+2*r2], m0
-    add          r2, mmsize/2
-    jle .loop
-.skip_loop:
-    sub          r2, mmsize/2
-    jz .end
-    ; Do the remaining values in scalar in order to avoid overreading src.
-.scalar:
-    mulss       xm0, xm2, [r1+4*r2+2*mmsize]
-    cvttss2si   r3d, xm0
-    rol         r3w, 8
-    mov [r0+2*r2+mmsize], r3w
-    inc          r2
-    jl .scalar
-.end:
-    RET
-
-;-----------------------------------------------------------------------------
-; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
-;-----------------------------------------------------------------------------
-cglobal mbtree_fix8_unpack, 3,4
-%if mmsize == 32
-    vbroadcastf128 m2, [pf_inv256]
-%else
-    movaps       m2, [pf_inv256]
-    mova         m4, [mbtree_fix8_unpack_shuf+16]
-%endif
-    mova         m3, [mbtree_fix8_unpack_shuf]
-    sub         r2d, mmsize/2
-    movsxdifnidn r2, r2d
-    lea          r1, [r1+2*r2]
-    lea          r0, [r0+4*r2]
-    neg          r2
-    jg .skip_loop
-.loop:
-%if mmsize == 32
-    vbroadcasti128 m0, [r1+2*r2]
-    vbroadcasti128 m1, [r1+2*r2+16]
-    pshufb       m0, m3
-    pshufb       m1, m3
-%else
-    mova         m1, [r1+2*r2]
-    pshufb       m0, m1, m3
-    pshufb       m1, m4
-%endif
-    psrad        m0, 16 ; sign-extend
-    psrad        m1, 16
-    cvtdq2ps     m0, m0
-    cvtdq2ps     m1, m1
-    mulps        m0, m2
-    mulps        m1, m2
-    movaps [r0+4*r2], m0
-    movaps [r0+4*r2+mmsize], m1
-    add          r2, mmsize/2
-    jle .loop
-.skip_loop:
-    sub          r2, mmsize/2
-    jz .end
-.scalar:
-    movzx       r3d, word [r1+2*r2+mmsize]
-    rol         r3w, 8
-    movsx       r3d, r3w
-    ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on
-    ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design.
-    cvtsi2ss    xm0, xm2, r3d
-    mulss       xm0, xm2
-    movss [r0+4*r2+2*mmsize], xm0
-    inc          r2
-    jl .scalar
-.end:
-    RET
-%endmacro
-
-INIT_XMM ssse3
-MBTREE_FIX8
-INIT_YMM avx2
-MBTREE_FIX8
diff --git a/android/src/main/libenc/jni/libx264/common/x86/mc-c.c b/android/src/main/libenc/jni/libx264/common/x86/mc-c.c
deleted file mode 100755
index 3bff408..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/mc-c.c
+++ /dev/null
@@ -1,850 +0,0 @@
-/*****************************************************************************
- * mc-c.c: x86 motion compensation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "mc.h"
-
-#define DECL_SUF( func, args )\
-    void func##_mmx2 args;\
-    void func##_sse2 args;\
-    void func##_ssse3 args;\
-    void func##_avx2 args;
-
-DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_8x16,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_8x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_8x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_4x16,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_4x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_4x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-DECL_SUF( x264_pixel_avg_4x2,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-
-#define MC_WEIGHT(w,type) \
-    void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
-
-#define MC_WEIGHT_OFFSET(w,type) \
-    void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
-    void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
-    MC_WEIGHT(w,type)
-
-MC_WEIGHT_OFFSET( 4, mmx2 )
-MC_WEIGHT_OFFSET( 8, mmx2 )
-MC_WEIGHT_OFFSET( 12, mmx2 )
-MC_WEIGHT_OFFSET( 16, mmx2 )
-MC_WEIGHT_OFFSET( 20, mmx2 )
-MC_WEIGHT_OFFSET( 12, sse2 )
-MC_WEIGHT_OFFSET( 16, sse2 )
-MC_WEIGHT_OFFSET( 20, sse2 )
-#if HIGH_BIT_DEPTH
-MC_WEIGHT_OFFSET( 8, sse2 )
-#endif
-MC_WEIGHT( 8, sse2  )
-MC_WEIGHT( 4, ssse3 )
-MC_WEIGHT( 8, ssse3 )
-MC_WEIGHT( 12, ssse3 )
-MC_WEIGHT( 16, ssse3 )
-MC_WEIGHT( 20, ssse3 )
-MC_WEIGHT( 8, avx2 )
-MC_WEIGHT( 16, avx2 )
-MC_WEIGHT( 20, avx2 )
-#undef MC_OFFSET
-#undef MC_WEIGHT
-
-void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
-void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
-void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
-void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-void x264_plane_copy_interleave_core_mmx2( pixel *dst,  intptr_t i_dst,
-                                           pixel *srcu, intptr_t i_srcu,
-                                           pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_interleave_core_sse2( pixel *dst,  intptr_t i_dst,
-                                           pixel *srcu, intptr_t i_srcu,
-                                           pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_interleave_core_avx( pixel *dst,  intptr_t i_dst,
-                                          pixel *srcu, intptr_t i_srcu,
-                                          pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
-                                       pixel *dstv, intptr_t i_dstv,
-                                       pixel *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
-                                        pixel *dstv, intptr_t i_dstv,
-                                        pixel *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
-                                         uint8_t *dstv, intptr_t i_dstv,
-                                         uint8_t *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
-                                       uint16_t *dstv, intptr_t i_dstv,
-                                       uint16_t *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
-                                             pixel *dstb, intptr_t i_dstb,
-                                             pixel *dstc, intptr_t i_dstc,
-                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta,
-                                             pixel *dstb, intptr_t i_dstb,
-                                             pixel *dstc, intptr_t i_dstc,
-                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
-                                              uint16_t *dstv, intptr_t i_dstv,
-                                              uint32_t *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx  ( uint16_t *dstu, intptr_t i_dstu,
-                                              uint16_t *dstv, intptr_t i_dstv,
-                                              uint32_t *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
-                                              uint16_t *dstv, intptr_t i_dstv,
-                                              uint32_t *src,  intptr_t i_src, int w, int h );
-void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
-void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
-void x264_memzero_aligned_mmx( void *dst, size_t n );
-void x264_memzero_aligned_sse( void *dst, size_t n );
-void x264_memzero_aligned_avx( void *dst, size_t n );
-void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
-void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
-void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
-void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
-void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
-void x264_integral_init4v_mmx  ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
-void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
-void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
-void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
-void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
-void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
-void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
-void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
-void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
-void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count );
-
-#define MC_CHROMA(cpu)\
-void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
-                           int dx, int dy, int i_width, int i_height );
-MC_CHROMA(mmx2)
-MC_CHROMA(sse2)
-MC_CHROMA(ssse3)
-MC_CHROMA(ssse3_cache64)
-MC_CHROMA(avx)
-MC_CHROMA(avx2)
-
-#define LOWRES(cpu)\
-void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
-                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
-LOWRES(mmx2)
-LOWRES(cache32_mmx2)
-LOWRES(sse2)
-LOWRES(ssse3)
-LOWRES(avx)
-LOWRES(xop)
-LOWRES(avx2)
-
-#define PIXEL_AVG_W(width,cpu)\
-void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
-/* This declares some functions that don't exist, but that isn't a problem. */
-#define PIXEL_AVG_WALL(cpu)\
-PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
-
-PIXEL_AVG_WALL(mmx2)
-PIXEL_AVG_WALL(cache32_mmx2)
-PIXEL_AVG_WALL(cache64_mmx2)
-PIXEL_AVG_WALL(cache64_sse2)
-PIXEL_AVG_WALL(sse2)
-PIXEL_AVG_WALL(cache64_ssse3)
-PIXEL_AVG_WALL(avx2)
-
-#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
-static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
-{\
-    NULL,\
-    x264_pixel_avg2_w4_##name1,\
-    x264_pixel_avg2_w8_##name2,\
-    x264_pixel_avg2_w12_##name3,\
-    x264_pixel_avg2_w16_##name4,\
-    x264_pixel_avg2_w20_##name5,\
-};
-
-#if HIGH_BIT_DEPTH
-/* we can replace w12/w20 with w10/w18 as only 9/17 pixels in fact are important */
-#define x264_pixel_avg2_w12_mmx2       x264_pixel_avg2_w10_mmx2
-#define x264_pixel_avg2_w20_mmx2       x264_pixel_avg2_w18_mmx2
-#define x264_pixel_avg2_w12_sse2         x264_pixel_avg2_w10_sse2
-#define x264_pixel_avg2_w20_sse2         x264_pixel_avg2_w18_sse2
-#define x264_pixel_avg2_w12_avx2         x264_pixel_avg2_w16_avx2
-#define x264_pixel_avg2_w20_avx2         x264_pixel_avg2_w18_avx2
-#else
-/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
-#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
-#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
-#define x264_pixel_avg2_w12_sse3         x264_pixel_avg2_w16_sse3
-#define x264_pixel_avg2_w12_sse2         x264_pixel_avg2_w16_sse2
-#endif // HIGH_BIT_DEPTH
-
-PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
-#if HIGH_BIT_DEPTH
-PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
-PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2)
-#else // !HIGH_BIT_DEPTH
-#if ARCH_X86
-PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
-PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
-#endif
-PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
-PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
-PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
-PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
-PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2)
-#endif // HIGH_BIT_DEPTH
-
-#define MC_COPY_WTAB(instr, name1, name2, name3)\
-static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
-{\
-    NULL,\
-    x264_mc_copy_w4_##name1,\
-    x264_mc_copy_w8_##name2,\
-    NULL,\
-    x264_mc_copy_w16_##name3,\
-};
-
-MC_COPY_WTAB(mmx,mmx,mmx,mmx)
-#if HIGH_BIT_DEPTH
-MC_COPY_WTAB(sse,mmx,sse,sse)
-MC_COPY_WTAB(avx,mmx,sse,avx)
-#else
-MC_COPY_WTAB(sse,mmx,mmx,sse)
-#endif
-
-#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
-    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
-{\
-    x264_mc_##function##_w4_##name1,\
-    x264_mc_##function##_w4_##name1,\
-    x264_mc_##function##_w8_##name2,\
-    x264_mc_##function##_w##w12version##_##instr,\
-    x264_mc_##function##_w16_##instr,\
-    x264_mc_##function##_w20_##instr,\
-};
-
-#if HIGH_BIT_DEPTH
-MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
-MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
-MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
-MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
-MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
-MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)
-
-static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
-{
-    if( w->i_scale == 1<<w->i_denom )
-    {
-        if( w->i_offset < 0 )
-            w->weightfn = h->mc.offsetsub;
-        else
-            w->weightfn = h->mc.offsetadd;
-        for( int i = 0; i < 8; i++ )
-            w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8));
-        return;
-    }
-    w->weightfn = h->mc.weight;
-    int den1 = 1<<w->i_denom;
-    int den2 = w->i_scale<<1;
-    int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
-    for( int i = 0; i < 8; i++ )
-    {
-        w->cachea[i] = den1;
-        w->cacheb[i] = i&1 ? den3 : den2;
-    }
-}
-#else
-MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
-MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
-MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
-MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
-MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
-MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
-MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
-MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)
-
-static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
-{
-    int i;
-    int16_t den1;
-
-    if( w->i_scale == 1<<w->i_denom )
-    {
-        if( w->i_offset < 0 )
-            w->weightfn = h->mc.offsetsub;
-        else
-            w->weightfn = h->mc.offsetadd;
-        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
-        return;
-    }
-    w->weightfn = h->mc.weight;
-    den1 = 1 << (w->i_denom - 1) | w->i_offset << w->i_denom;
-    for( i = 0; i < 8; i++ )
-    {
-        w->cachea[i] = w->i_scale;
-        w->cacheb[i] = den1;
-    }
-}
-
-static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
-{
-    int i, den1;
-    if( w->i_scale == 1<<w->i_denom )
-    {
-        if( w->i_offset < 0 )
-            w->weightfn = h->mc.offsetsub;
-        else
-            w->weightfn = h->mc.offsetadd;
-
-        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
-        return;
-    }
-    w->weightfn = h->mc.weight;
-    den1 = w->i_scale << (8 - w->i_denom);
-    for( i = 0; i < 8; i++ )
-    {
-        w->cachea[i] = den1;
-        w->cacheb[i] = w->i_offset;
-    }
-}
-#endif // !HIGH_BIT_DEPTH
-
-#define MC_LUMA(name,instr1,instr2)\
-static void mc_luma_##name( pixel *dst,    intptr_t i_dst_stride,\
-                            pixel *src[4], intptr_t i_src_stride,\
-                            int mvx, int mvy,\
-                            int i_width, int i_height, const x264_weight_t *weight )\
-{\
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
-    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
-    if( qpel_idx & 5 ) /* qpel interpolation needed */\
-    {\
-        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
-        x264_pixel_avg_wtab_##instr1[i_width>>2](\
-                dst, i_dst_stride, src1, i_src_stride,\
-                src2, i_height );\
-        if( weight->weightfn )\
-            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
-    }\
-    else if( weight->weightfn )\
-        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
-    else\
-        x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
-}
-
-MC_LUMA(mmx2,mmx2,mmx)
-MC_LUMA(sse2,sse2,sse)
-#if HIGH_BIT_DEPTH
-MC_LUMA(avx2,avx2,avx)
-#else
-#if ARCH_X86
-MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
-MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
-#endif
-MC_LUMA(cache64_sse2,cache64_sse2,sse)
-MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
-MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
-#endif // !HIGH_BIT_DEPTH
-
-#define GET_REF(name)\
-static pixel *get_ref_##name( pixel *dst,   intptr_t *i_dst_stride,\
-                              pixel *src[4], intptr_t i_src_stride,\
-                              int mvx, int mvy,\
-                              int i_width, int i_height, const x264_weight_t *weight )\
-{\
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
-    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
-    if( qpel_idx & 5 ) /* qpel interpolation needed */\
-    {\
-        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
-        x264_pixel_avg_wtab_##name[i_width>>2](\
-                dst, *i_dst_stride, src1, i_src_stride,\
-                src2, i_height );\
-        if( weight->weightfn )\
-            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
-        return dst;\
-    }\
-    else if( weight->weightfn )\
-    {\
-        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
-        return dst;\
-    }\
-    else\
-    {\
-        *i_dst_stride = i_src_stride;\
-        return src1;\
-    }\
-}
-
-GET_REF(mmx2)
-GET_REF(sse2)
-GET_REF(avx2)
-#if !HIGH_BIT_DEPTH
-#if ARCH_X86
-GET_REF(cache32_mmx2)
-GET_REF(cache64_mmx2)
-#endif
-GET_REF(cache64_sse2)
-GET_REF(cache64_ssse3)
-GET_REF(cache64_ssse3_atom)
-#endif // !HIGH_BIT_DEPTH
-
-#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
-void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\
-void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
-void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
-static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
-                                    intptr_t stride, int width, int height, int16_t *buf )\
-{\
-    intptr_t realign = (intptr_t)src & (align-1);\
-    src -= realign;\
-    dstv -= realign;\
-    dstc -= realign;\
-    dsth -= realign;\
-    width += realign;\
-    while( height-- )\
-    {\
-        x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\
-        x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\
-        x264_hpel_filter_h_##cpuh( dsth, src, width );\
-        dsth += stride;\
-        dstv += stride;\
-        dstc += stride;\
-        src  += stride;\
-    }\
-    x264_sfence();\
-}
-
-HPEL(8, mmx2, mmx2, mmx2, mmx2)
-#if HIGH_BIT_DEPTH
-HPEL(16, sse2, sse2, sse2, sse2)
-#else // !HIGH_BIT_DEPTH
-HPEL(16, sse2_amd, mmx2, mmx2, sse2)
-#if ARCH_X86_64
-void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_avx  ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
-#else
-HPEL(16, sse2, sse2, sse2, sse2)
-HPEL(16, ssse3, ssse3, ssse3, ssse3)
-HPEL(16, avx, avx, avx, avx)
-HPEL(32, avx2, avx2, avx2, avx2)
-#endif
-#endif // HIGH_BIT_DEPTH
-
-PLANE_COPY(16, sse)
-PLANE_COPY(32, avx)
-
-PLANE_COPY_SWAP(16, ssse3)
-PLANE_COPY_SWAP(32, avx2)
-
-PLANE_INTERLEAVE(mmx2)
-PLANE_INTERLEAVE(sse2)
-#if HIGH_BIT_DEPTH
-PLANE_INTERLEAVE(avx)
-#endif
-
-#if HAVE_X86_INLINE_ASM
-#undef MC_CLIP_ADD
-#define MC_CLIP_ADD(s,x)\
-do\
-{\
-    int temp;\
-    asm("movd       %0, %%xmm0     \n"\
-        "movd       %2, %%xmm1     \n"\
-        "paddsw %%xmm1, %%xmm0     \n"\
-        "movd   %%xmm0, %1         \n"\
-        :"+m"(s), "=&r"(temp)\
-        :"m"(x)\
-    );\
-    s = temp;\
-} while(0)
-
-#undef MC_CLIP_ADD2
-#define MC_CLIP_ADD2(s,x)\
-do\
-{\
-    asm("movd       %0, %%xmm0     \n"\
-        "movd       %1, %%xmm1     \n"\
-        "paddsw %%xmm1, %%xmm0     \n"\
-        "movd   %%xmm0, %0         \n"\
-        :"+m"(M32(s))\
-        :"m"(M32(x))\
-    );\
-} while(0)
-#endif
-
-PROPAGATE_LIST(ssse3)
-PROPAGATE_LIST(avx)
-PROPAGATE_LIST(avx2)
-
-void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
-{
-    if( !(cpu&X264_CPU_MMX) )
-        return;
-
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;
-
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
-
-    pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
-    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
-    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
-    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
-    pf->memcpy_aligned  = x264_memcpy_aligned_mmx;
-    pf->memzero_aligned = x264_memzero_aligned_mmx;
-    pf->integral_init4v = x264_integral_init4v_mmx;
-    pf->integral_init8v = x264_integral_init8v_mmx;
-
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-
-    pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
-    pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
-    pf->prefetch_ref  = x264_prefetch_ref_mmx2;
-
-    pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
-    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
-
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmx2;
-    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmx2;
-    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmx2;
-    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmx2;
-    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_mmx2;
-    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmx2;
-    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmx2;
-    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmx2;
-
-    pf->mc_luma = mc_luma_mmx2;
-    pf->get_ref = get_ref_mmx2;
-    pf->mc_chroma = x264_mc_chroma_mmx2;
-    pf->hpel_filter = x264_hpel_filter_mmx2;
-    pf->weight = x264_mc_weight_wtab_mmx2;
-    pf->weight_cache = x264_weight_cache_mmx2;
-    pf->offsetadd = x264_mc_offsetadd_wtab_mmx2;
-    pf->offsetsub = x264_mc_offsetsub_wtab_mmx2;
-
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;
-
-    if( cpu&X264_CPU_SSE )
-    {
-        pf->memcpy_aligned  = x264_memcpy_aligned_sse;
-        pf->memzero_aligned = x264_memzero_aligned_sse;
-        pf->plane_copy = x264_plane_copy_sse;
-    }
-
-#if HIGH_BIT_DEPTH
-#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
-    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
-#endif
-
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
-
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
-
-    pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
-
-    if( cpu&X264_CPU_SSE2_IS_FAST )
-    {
-        pf->get_ref = get_ref_sse2;
-        pf->mc_luma = mc_luma_sse2;
-        pf->hpel_filter = x264_hpel_filter_sse2;
-    }
-
-    pf->integral_init4v = x264_integral_init4v_sse2;
-    pf->integral_init8v = x264_integral_init8v_sse2;
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
-    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
-    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
-    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
-
-    if( cpu&X264_CPU_SSE2_IS_SLOW )
-        return;
-
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
-    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
-    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
-    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
-    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sse2;
-    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sse2;
-    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sse2;
-    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sse2;
-
-    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
-    pf->weight = x264_mc_weight_wtab_sse2;
-
-    if( !(cpu&X264_CPU_STACK_MOD4) )
-        pf->mc_chroma = x264_mc_chroma_sse2;
-
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
-    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
-    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
-    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
-    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_ssse3;
-    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_ssse3;
-
-    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
-        pf->integral_init4v = x264_integral_init4v_ssse3;
-
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
-    pf->plane_copy_interleave        = x264_plane_copy_interleave_avx;
-    pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_avx;
-    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
-    pf->store_interleave_chroma      = x264_store_interleave_chroma_avx;
-    pf->copy[PIXEL_16x16]            = x264_mc_copy_w16_aligned_avx;
-
-    if( !(cpu&X264_CPU_STACK_MOD4) )
-        pf->mc_chroma = x264_mc_chroma_avx;
-
-    if( cpu&X264_CPU_XOP )
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf->mc_luma = mc_luma_avx2;
-        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
-    }
-#else // !HIGH_BIT_DEPTH
-
-#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
-    if( cpu&X264_CPU_CACHELINE_32 )
-    {
-        pf->mc_luma = mc_luma_cache32_mmx2;
-        pf->get_ref = get_ref_cache32_mmx2;
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
-    }
-    else if( cpu&X264_CPU_CACHELINE_64 )
-    {
-        pf->mc_luma = mc_luma_cache64_mmx2;
-        pf->get_ref = get_ref_cache64_mmx2;
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
-    }
-#endif
-
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-
-    pf->integral_init4v = x264_integral_init4v_sse2;
-    pf->integral_init8v = x264_integral_init8v_sse2;
-    pf->hpel_filter = x264_hpel_filter_sse2_amd;
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
-    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
-
-    if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
-    {
-        pf->weight = x264_mc_weight_wtab_sse2;
-        if( !(cpu&X264_CPU_SLOW_ATOM) )
-        {
-            pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
-            pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
-        }
-
-        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
-        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
-        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
-        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
-        pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
-        pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
-        pf->hpel_filter = x264_hpel_filter_sse2;
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-            pf->mc_chroma = x264_mc_chroma_sse2;
-
-        if( cpu&X264_CPU_SSE2_IS_FAST )
-        {
-            pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
-            pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
-            pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
-            pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
-            pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
-            pf->mc_luma = mc_luma_sse2;
-            pf->get_ref = get_ref_sse2;
-            if( cpu&X264_CPU_CACHELINE_64 )
-            {
-                pf->mc_luma = mc_luma_cache64_sse2;
-                pf->get_ref = get_ref_cache64_sse2;
-            }
-        }
-    }
-
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_ssse3;
-    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_ssse3;
-    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_ssse3;
-    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_ssse3;
-    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_ssse3;
-    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
-    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
-    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
-    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
-    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
-    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
-    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_ssse3;
-    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_ssse3;
-
-    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
-    {
-        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
-        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
-        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
-    }
-
-    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
-    {
-#if ARCH_X86_64
-        if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
-#endif
-            pf->hpel_filter = x264_hpel_filter_ssse3;
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
-    }
-    if( !(cpu&X264_CPU_STACK_MOD4) )
-        pf->mc_chroma = x264_mc_chroma_ssse3;
-
-    if( cpu&X264_CPU_CACHELINE_64 )
-    {
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-            pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
-        pf->mc_luma = mc_luma_cache64_ssse3;
-        pf->get_ref = get_ref_cache64_ssse3;
-        if( cpu&X264_CPU_SLOW_ATOM )
-        {
-            pf->mc_luma = mc_luma_cache64_ssse3_atom;
-            pf->get_ref = get_ref_cache64_ssse3_atom;
-        }
-    }
-
-    pf->weight_cache = x264_weight_cache_ssse3;
-    pf->weight = x264_mc_weight_wtab_ssse3;
-
-    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
-        pf->integral_init4v = x264_integral_init4v_ssse3;
-
-    if( !(cpu&X264_CPU_SSE4) )
-        return;
-
-    pf->integral_init4h = x264_integral_init4h_sse4;
-    pf->integral_init8h = x264_integral_init8h_sse4;
-
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
-    pf->integral_init8h = x264_integral_init8h_avx;
-    pf->hpel_filter = x264_hpel_filter_avx;
-
-    if( !(cpu&X264_CPU_STACK_MOD4) )
-        pf->mc_chroma = x264_mc_chroma_avx;
-
-    if( cpu&X264_CPU_XOP )
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf->hpel_filter = x264_hpel_filter_avx2;
-        pf->mc_chroma = x264_mc_chroma_avx2;
-        pf->weight = x264_mc_weight_wtab_avx2;
-        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
-        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx2;
-        pf->integral_init8v = x264_integral_init8v_avx2;
-        pf->integral_init4v = x264_integral_init4v_avx2;
-        pf->integral_init8h = x264_integral_init8h_avx2;
-        pf->integral_init4h = x264_integral_init4h_avx2;
-        pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
-    }
-#endif // HIGH_BIT_DEPTH
-
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf->memzero_aligned = x264_memzero_aligned_avx;
-    pf->plane_copy = x264_plane_copy_avx;
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
-    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
-
-    if( cpu&X264_CPU_FMA4 )
-        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
-
-    if( !(cpu&X264_CPU_AVX2) )
-        return;
-    pf->plane_copy_swap = x264_plane_copy_swap_avx2;
-    pf->get_ref = get_ref_avx2;
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
-    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
-    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_avx2;
-    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_avx2;
-}
diff --git a/android/src/main/libenc/jni/libx264/common/x86/mc.h b/android/src/main/libenc/jni/libx264/common/x86/mc.h
deleted file mode 100755
index fc283ec..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/mc.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*****************************************************************************
- * mc.h: x86 motion compensation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_I386_MC_H
-#define X264_I386_MC_H
-
-void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/pixel-32.asm b/android/src/main/libenc/jni/libx264/common/x86/pixel-32.asm
deleted file mode 100755
index 7081edf..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/pixel-32.asm
+++ /dev/null
@@ -1,420 +0,0 @@
-;*****************************************************************************
-;* pixel-32.asm: x86_32 pixel metrics
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-cextern pw_ppmmppmm
-cextern pw_pmpmpmpm
-
-SECTION .text
-INIT_MMX mmx2
-
-%macro LOAD_DIFF_4x8P 1 ; dx
-    LOAD_DIFF  m0, m7, none, [r0+%1],      [r2+%1]
-    LOAD_DIFF  m1, m6, none, [r0+%1+r1],   [r2+%1+r3]
-    LOAD_DIFF  m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
-    LOAD_DIFF  m3, m6, none, [r0+%1+r4],   [r2+%1+r5]
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    LOAD_DIFF  m4, m7, none, [r0+%1],      [r2+%1]
-    LOAD_DIFF  m5, m6, none, [r0+%1+r1],   [r2+%1+r3]
-    LOAD_DIFF  m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
-    movq [spill], m5
-    LOAD_DIFF  m7, m5, none, [r0+%1+r4],   [r2+%1+r5]
-    movq m5, [spill]
-%endmacro
-
-%macro SUM4x8_MM 0
-    movq [spill],   m6
-    movq [spill+8], m7
-    ABSW2    m0, m1, m0, m1, m6, m7
-    ABSW2    m2, m3, m2, m3, m6, m7
-    paddw    m0, m2
-    paddw    m1, m3
-    movq     m6, [spill]
-    movq     m7, [spill+8]
-    ABSW2    m4, m5, m4, m5, m2, m3
-    ABSW2    m6, m7, m6, m7, m2, m3
-    paddw    m4, m6
-    paddw    m5, m7
-    paddw    m0, m4
-    paddw    m1, m5
-    paddw    m0, m1
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-cglobal pixel_sa8d_8x8_internal
-    push   r0
-    push   r2
-    sub    esp, 0x74
-%define args  esp+0x74
-%define spill esp+0x60 ; +16
-%define trans esp+0    ; +96
-    LOAD_DIFF_4x8P 0
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-
-    movq   [spill], m1
-    TRANSPOSE4x4W 4, 5, 6, 7, 1
-    movq   [trans+0x00], m4
-    movq   [trans+0x08], m5
-    movq   [trans+0x10], m6
-    movq   [trans+0x18], m7
-    movq   m1, [spill]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-    movq   [trans+0x20], m0
-    movq   [trans+0x28], m1
-    movq   [trans+0x30], m2
-    movq   [trans+0x38], m3
-
-    mov    r0, [args+4]
-    mov    r2, [args]
-    LOAD_DIFF_4x8P 4
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-
-    movq   [spill], m7
-    TRANSPOSE4x4W 0, 1, 2, 3, 7
-    movq   [trans+0x40], m0
-    movq   [trans+0x48], m1
-    movq   [trans+0x50], m2
-    movq   [trans+0x58], m3
-    movq   m7, [spill]
-    TRANSPOSE4x4W 4, 5, 6, 7, 1
-    movq   m0, [trans+0x00]
-    movq   m1, [trans+0x08]
-    movq   m2, [trans+0x10]
-    movq   m3, [trans+0x18]
-
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-    SUM4x8_MM
-    movq   [trans], m0
-
-    movq   m0, [trans+0x20]
-    movq   m1, [trans+0x28]
-    movq   m2, [trans+0x30]
-    movq   m3, [trans+0x38]
-    movq   m4, [trans+0x40]
-    movq   m5, [trans+0x48]
-    movq   m6, [trans+0x50]
-    movq   m7, [trans+0x58]
-
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-    SUM4x8_MM
-
-    pavgw  m0, [trans]
-    add   esp, 0x7c
-    ret
-%undef args
-%undef spill
-%undef trans
-
-%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
-    pxor        %7, %7
-    pshufw      %4, %1, q1032
-    pshufw      %5, %2, q1032
-    pshufw      %6, %3, q1032
-    paddusw     %1, %4
-    paddusw     %2, %5
-    paddusw     %3, %6
-    punpcklwd   %1, %7
-    punpcklwd   %2, %7
-    punpcklwd   %3, %7
-    pshufw      %4, %1, q1032
-    pshufw      %5, %2, q1032
-    pshufw      %6, %3, q1032
-    %8          %1, %4
-    %8          %2, %5
-    %8          %3, %6
-%endmacro
-
-%macro LOAD_4x8P 1 ; dx
-    pxor        m7, m7
-    movd        m6, [r0+%1+7*FENC_STRIDE]
-    movd        m0, [r0+%1+0*FENC_STRIDE]
-    movd        m1, [r0+%1+1*FENC_STRIDE]
-    movd        m2, [r0+%1+2*FENC_STRIDE]
-    movd        m3, [r0+%1+3*FENC_STRIDE]
-    movd        m4, [r0+%1+4*FENC_STRIDE]
-    movd        m5, [r0+%1+5*FENC_STRIDE]
-    punpcklbw   m6, m7
-    punpcklbw   m0, m7
-    punpcklbw   m1, m7
-    movq   [spill], m6
-    punpcklbw   m2, m7
-    punpcklbw   m3, m7
-    movd        m6, [r0+%1+6*FENC_STRIDE]
-    punpcklbw   m4, m7
-    punpcklbw   m5, m7
-    punpcklbw   m6, m7
-    movq        m7, [spill]
-%endmacro
-
-%macro HSUMSUB2 4
-    pshufw m4, %1, %3
-    pshufw m5, %2, %3
-    pmullw %1, %4
-    pmullw m5, %4
-    paddw  %1, m4
-    paddw  %2, m5
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
-;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8, 2,3
-    SUB    esp, 0x94
-%define edge  esp+0x70 ; +32
-%define spill esp+0x60 ; +16
-%define trans esp+0    ; +96
-%define sum   esp+0    ; +32
-
-    pxor      m7, m7
-    movq      m0, [r1+7]
-    movq      m2, [r1+16]
-    movq      m1, m0
-    movq      m3, m2
-    punpcklbw m0, m7
-    punpckhbw m1, m7
-    punpcklbw m2, m7
-    punpckhbw m3, m7
-    movq      m6, [pw_ppmmppmm]
-    HSUMSUB2  m0, m2, q1032, m6
-    HSUMSUB2  m1, m3, q1032, m6
-    movq      m6, [pw_pmpmpmpm]
-    HSUMSUB2  m0, m2, q2301, m6
-    HSUMSUB2  m1, m3, q2301, m6
-    movq      m4, m0
-    movq      m5, m2
-    paddw     m0, m1
-    paddw     m2, m3
-    psubw     m4, m1
-    psubw     m3, m5
-    movq [edge+0], m0
-    movq [edge+8], m4
-    movq [edge+16], m2
-    movq [edge+24], m3
-
-    LOAD_4x8P 0
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-
-    movq   [spill], m0
-    TRANSPOSE4x4W 4, 5, 6, 7, 0
-    movq   [trans+0x00], m4
-    movq   [trans+0x08], m5
-    movq   [trans+0x10], m6
-    movq   [trans+0x18], m7
-    movq   m0, [spill]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
-    movq   [trans+0x20], m0
-    movq   [trans+0x28], m1
-    movq   [trans+0x30], m2
-    movq   [trans+0x38], m3
-
-    LOAD_4x8P 4
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-
-    movq   [spill], m7
-    TRANSPOSE4x4W 0, 1, 2, 3, 7
-    movq   [trans+0x40], m0
-    movq   [trans+0x48], m1
-    movq   [trans+0x50], m2
-    movq   [trans+0x58], m3
-    movq   m7, [spill]
-    TRANSPOSE4x4W 4, 5, 6, 7, 0
-    movq   m0, [trans+0x00]
-    movq   m1, [trans+0x08]
-    movq   m2, [trans+0x10]
-    movq   m3, [trans+0x18]
-
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-
-    movq [spill+0], m0
-    movq [spill+8], m1
-    ABSW2    m2, m3, m2, m3, m0, m1
-    ABSW2    m4, m5, m4, m5, m0, m1
-    paddw    m2, m4
-    paddw    m3, m5
-    ABSW2    m6, m7, m6, m7, m4, m5
-    movq     m0, [spill+0]
-    movq     m1, [spill+8]
-    paddw    m2, m6
-    paddw    m3, m7
-    paddw    m2, m3
-    ABSW     m1, m1, m4
-    paddw    m2, m1 ; 7x4 sum
-    movq     m7, m0
-    movq     m1, [edge+8] ; left bottom
-    psllw    m1, 3
-    psubw    m7, m1
-    ABSW2    m0, m7, m0, m7, m5, m3
-    paddw    m0, m2
-    paddw    m7, m2
-    movq [sum+0], m0 ; dc
-    movq [sum+8], m7 ; left
-
-    movq   m0, [trans+0x20]
-    movq   m1, [trans+0x28]
-    movq   m2, [trans+0x30]
-    movq   m3, [trans+0x38]
-    movq   m4, [trans+0x40]
-    movq   m5, [trans+0x48]
-    movq   m6, [trans+0x50]
-    movq   m7, [trans+0x58]
-
-    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
-
-    movd   [sum+0x10], m0
-    movd   [sum+0x12], m1
-    movd   [sum+0x14], m2
-    movd   [sum+0x16], m3
-    movd   [sum+0x18], m4
-    movd   [sum+0x1a], m5
-    movd   [sum+0x1c], m6
-    movd   [sum+0x1e], m7
-
-    movq [spill],   m0
-    movq [spill+8], m1
-    ABSW2    m2, m3, m2, m3, m0, m1
-    ABSW2    m4, m5, m4, m5, m0, m1
-    paddw    m2, m4
-    paddw    m3, m5
-    paddw    m2, m3
-    movq     m0, [spill]
-    movq     m1, [spill+8]
-    ABSW2    m6, m7, m6, m7, m4, m5
-    ABSW     m1, m1, m3
-    paddw    m2, m7
-    paddw    m1, m6
-    paddw    m2, m1 ; 7x4 sum
-    movq     m1, m0
-
-    movq     m7, [edge+0]
-    psllw    m7, 3   ; left top
-
-    mov      r2, [edge+0]
-    add      r2, [edge+16]
-    lea      r2, [4*r2+32]
-    and      r2, 0xffc0
-    movd     m6, r2 ; dc
-
-    psubw    m1, m7
-    psubw    m0, m6
-    ABSW2    m0, m1, m0, m1, m5, m6
-    movq     m3, [sum+0] ; dc
-    paddw    m0, m2
-    paddw    m1, m2
-    movq     m2, m0
-    paddw    m0, m3
-    paddw    m1, [sum+8] ; h
-    psrlq    m2, 16
-    paddw    m2, m3
-
-    movq     m3, [edge+16] ; top left
-    movq     m4, [edge+24] ; top right
-    psllw    m3, 3
-    psllw    m4, 3
-    psubw    m3, [sum+16]
-    psubw    m4, [sum+24]
-    ABSW2    m3, m4, m3, m4, m5, m6
-    paddw    m2, m3
-    paddw    m2, m4 ; v
-
-    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
-    mov      r2, r2m
-    pxor      m7, m7
-    punpckldq m2, m1
-    pavgw     m0, m7
-    pavgw     m2, m7
-    movd  [r2+8], m0 ; dc
-    movq  [r2+0], m2 ; v, h
-    ADD     esp, 0x94
-    RET
-%undef edge
-%undef spill
-%undef trans
-%undef sum
-
-
-
-;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
-;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
-;-----------------------------------------------------------------------------
-cglobal pixel_ssim_4x4x2_core, 0,5
-    mov       r1, r1m
-    mov       r3, r3m
-    mov       r4, 4
-    pxor      m0, m0
-.loop:
-    mov       r0, r0m
-    mov       r2, r2m
-    add       r0, r4
-    add       r2, r4
-    pxor      m1, m1
-    pxor      m2, m2
-    pxor      m3, m3
-    pxor      m4, m4
-%rep 4
-    movd      m5, [r0]
-    movd      m6, [r2]
-    punpcklbw m5, m0
-    punpcklbw m6, m0
-    paddw     m1, m5
-    paddw     m2, m6
-    movq      m7, m5
-    pmaddwd   m5, m5
-    pmaddwd   m7, m6
-    pmaddwd   m6, m6
-    paddd     m3, m5
-    paddd     m4, m7
-    paddd     m3, m6
-    add       r0, r1
-    add       r2, r3
-%endrep
-    mov       r0, r4m
-    lea       r0, [r0+r4*4]
-    pshufw    m5, m1, q0032
-    pshufw    m6, m2, q0032
-    paddusw   m1, m5
-    paddusw   m2, m6
-    punpcklwd m1, m2
-    pshufw    m2, m1, q0032
-    pshufw    m5, m3, q0032
-    pshufw    m6, m4, q0032
-    paddusw   m1, m2
-    paddd     m3, m5
-    paddd     m4, m6
-    punpcklwd m1, m0
-    punpckldq m3, m4
-    movq  [r0+0], m1
-    movq  [r0+8], m3
-    sub       r4, 4
-    jge .loop
-    emms
-    RET
-
diff --git a/android/src/main/libenc/jni/libx264/common/x86/pixel-a.asm b/android/src/main/libenc/jni/libx264/common/x86/pixel-a.asm
deleted file mode 100755
index d55a41b..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/pixel-a.asm
+++ /dev/null
@@ -1,5219 +0,0 @@
-;*****************************************************************************
-;* pixel.asm: x86 pixel metrics
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Holger Lubitz <holger@lubitz.org>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*          Alex Izvorski <aizvorksi@gmail.com>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Oskar Arvidsson <oskar@irock.se>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-hmul_16p:  times 16 db 1
-           times 8 db 1, -1
-hmul_8p:   times 8 db 1
-           times 4 db 1, -1
-           times 8 db 1
-           times 4 db 1, -1
-mask_ff:   times 16 db 0xff
-           times 16 db 0
-mask_ac4:  times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
-mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
-mask_ac8:  times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
-%if HIGH_BIT_DEPTH
-ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
-%endif
-%if BIT_DEPTH == 10
-ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
-ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
-pf_64:     times 4 dd 64.0
-pf_128:    times 4 dd 128.0
-%elif BIT_DEPTH == 9
-ssim_c1:   times 4 dd 1671         ; .01*.01*511*511*64
-ssim_c2:   times 4 dd 947556       ; .03*.03*511*511*64*63
-%else ; 8-bit
-ssim_c1:   times 4 dd 416          ; .01*.01*255*255*64
-ssim_c2:   times 4 dd 235963       ; .03*.03*255*255*64*63
-%endif
-hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-mask_10:   times 4 dw 0, -1
-mask_1100: times 2 dd 0, -1
-pb_pppm:   times 4 db 1,1,1,-1
-deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
-intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
-
-intrax9a_ddlr1: db  6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
-intrax9a_ddlr2: db  8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
-intrax9a_hdu1:  db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
-intrax9a_hdu2:  db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
-intrax9a_vrl1:  db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
-intrax9a_vrl2:  db  2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
-intrax9a_vh1:   db  6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
-intrax9a_vh2:   db  6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
-intrax9a_dc:    db  1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
-intrax9a_lut:   db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
-pw_s01234567:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
-pw_s01234657:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
-intrax9_edge:   db  0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
-
-intrax9b_ddlr1: db  6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
-intrax9b_ddlr2: db  8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
-intrax9b_hdu1:  db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
-intrax9b_hdu2:  db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
-intrax9b_vrl1:  db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
-intrax9b_vrl2:  db  2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
-intrax9b_vh1:   db  6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
-intrax9b_vh2:   db  6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
-intrax9b_edge2: db  6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
-intrax9b_v1:    db  0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
-intrax9b_v2:    db  2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
-intrax9b_lut:   db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
-
-ALIGN 32
-intra8x9_h1:   db  7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
-intra8x9_h2:   db  6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
-intra8x9_h3:   db  3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
-intra8x9_h4:   db  2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
-intra8x9_ddl1: db  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
-intra8x9_ddl2: db  2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
-intra8x9_ddl3: db  5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
-intra8x9_ddl4: db  6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
-intra8x9_vl1:  db  0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
-intra8x9_vl2:  db  1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
-intra8x9_vl3:  db  2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
-intra8x9_vl4:  db  3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
-intra8x9_ddr1: db  8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
-intra8x9_ddr2: db  7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
-intra8x9_ddr3: db  4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
-intra8x9_ddr4: db  3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
-intra8x9_vr1:  db  8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
-intra8x9_vr2:  db  8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
-intra8x9_vr3:  db  5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
-intra8x9_vr4:  db  4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
-intra8x9_hd1:  db  3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
-intra8x9_hd2:  db  2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
-intra8x9_hd3:  db  7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
-intra8x9_hd4:  db  5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
-intra8x9_hu1:  db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
-intra8x9_hu2:  db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
-intra8x9_hu3:  db  5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
-intra8x9_hu4:  db  3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
-pw_s00112233:  dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
-pw_s00001111:  dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001
-
-transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
-transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
-
-sw_f0:     dq 0xfff0, 0
-pd_f0:     times 4 dd 0xffff0000
-
-pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
-
-ads_mvs_shuffle:
-%macro ADS_MVS_SHUFFLE 8
-    %assign y x
-    %rep 8
-        %rep 7
-            %rotate (~y)&1
-            %assign y y>>((~y)&1)
-        %endrep
-        db %1*2, %1*2+1
-        %rotate 1
-        %assign y y>>1
-    %endrep
-%endmacro
-%assign x 0
-%rep 256
-    ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
-%assign x x+1
-%endrep
-
-SECTION .text
-
-cextern pb_0
-cextern pb_1
-cextern pw_1
-cextern pw_8
-cextern pw_16
-cextern pw_32
-cextern pw_00ff
-cextern pw_ppppmmmm
-cextern pw_ppmmppmm
-cextern pw_pmpmpmpm
-cextern pw_pmmpzzzz
-cextern pd_1
-cextern hsub_mul
-cextern popcnt_table
-
-;=============================================================================
-; SSD
-;=============================================================================
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
-%macro SSD_ONE 2
-cglobal pixel_ssd_%1x%2, 4,7,6
-    FIX_STRIDES r1, r3
-%if mmsize == %1*2
-    %define offset0_1 r1
-    %define offset0_2 r1*2
-    %define offset0_3 r5
-    %define offset1_1 r3
-    %define offset1_2 r3*2
-    %define offset1_3 r6
-    lea     r5, [3*r1]
-    lea     r6, [3*r3]
-%elif mmsize == %1
-    %define offset0_1 mmsize
-    %define offset0_2 r1
-    %define offset0_3 r1+mmsize
-    %define offset1_1 mmsize
-    %define offset1_2 r3
-    %define offset1_3 r3+mmsize
-%elif mmsize == %1/2
-    %define offset0_1 mmsize
-    %define offset0_2 mmsize*2
-    %define offset0_3 mmsize*3
-    %define offset1_1 mmsize
-    %define offset1_2 mmsize*2
-    %define offset1_3 mmsize*3
-%endif
-    %assign %%n %2/(2*mmsize/%1)
-%if %%n > 1
-    mov    r4d, %%n
-%endif
-    pxor    m0, m0
-.loop:
-    mova    m1, [r0]
-    mova    m2, [r0+offset0_1]
-    mova    m3, [r0+offset0_2]
-    mova    m4, [r0+offset0_3]
-    psubw   m1, [r2]
-    psubw   m2, [r2+offset1_1]
-    psubw   m3, [r2+offset1_2]
-    psubw   m4, [r2+offset1_3]
-%if %%n > 1
-    lea     r0, [r0+r1*(%2/%%n)]
-    lea     r2, [r2+r3*(%2/%%n)]
-%endif
-    pmaddwd m1, m1
-    pmaddwd m2, m2
-    pmaddwd m3, m3
-    pmaddwd m4, m4
-    paddd   m1, m2
-    paddd   m3, m4
-    paddd   m0, m1
-    paddd   m0, m3
-%if %%n > 1
-    dec    r4d
-    jg .loop
-%endif
-    HADDD   m0, m5
-    movd   eax, xm0
-    RET
-%endmacro
-
-INIT_MMX mmx2
-SSD_ONE     4,  4
-SSD_ONE     4,  8
-SSD_ONE     4, 16
-SSD_ONE     8,  4
-SSD_ONE     8,  8
-SSD_ONE     8, 16
-SSD_ONE    16,  8
-SSD_ONE    16, 16
-INIT_XMM sse2
-SSD_ONE     8,  4
-SSD_ONE     8,  8
-SSD_ONE     8, 16
-SSD_ONE    16,  8
-SSD_ONE    16, 16
-INIT_YMM avx2
-SSD_ONE    16,  8
-SSD_ONE    16, 16
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-%macro SSD_LOAD_FULL 5
-    mova      m1, [t0+%1]
-    mova      m2, [t2+%2]
-    mova      m3, [t0+%3]
-    mova      m4, [t2+%4]
-%if %5==1
-    add       t0, t1
-    add       t2, t3
-%elif %5==2
-    lea       t0, [t0+2*t1]
-    lea       t2, [t2+2*t3]
-%endif
-%endmacro
-
-%macro LOAD 5
-    movh      m%1, %3
-    movh      m%2, %4
-%if %5
-    lea       t0, [t0+2*t1]
-%endif
-%endmacro
-
-%macro JOIN 7
-    movh      m%3, %5
-    movh      m%4, %6
-%if %7
-    lea       t2, [t2+2*t3]
-%endif
-    punpcklbw m%1, m7
-    punpcklbw m%3, m7
-    psubw     m%1, m%3
-    punpcklbw m%2, m7
-    punpcklbw m%4, m7
-    psubw     m%2, m%4
-%endmacro
-
-%macro JOIN_SSE2 7
-    movh      m%3, %5
-    movh      m%4, %6
-%if %7
-    lea       t2, [t2+2*t3]
-%endif
-    punpcklqdq m%1, m%2
-    punpcklqdq m%3, m%4
-    DEINTB %2, %1, %4, %3, 7
-    psubw m%2, m%4
-    psubw m%1, m%3
-%endmacro
-
-%macro JOIN_SSSE3 7
-    movh      m%3, %5
-    movh      m%4, %6
-%if %7
-    lea       t2, [t2+2*t3]
-%endif
-    punpcklbw m%1, m%3
-    punpcklbw m%2, m%4
-%endmacro
-
-%macro LOAD_AVX2 5
-    mova     xm%1, %3
-    vinserti128 m%1, m%1, %4, 1
-%if %5
-    lea       t0, [t0+2*t1]
-%endif
-%endmacro
-
-%macro JOIN_AVX2 7
-    mova     xm%2, %5
-    vinserti128 m%2, m%2, %6, 1
-%if %7
-    lea       t2, [t2+2*t3]
-%endif
-    SBUTTERFLY bw, %1, %2, %3
-%endmacro
-
-%macro SSD_LOAD_HALF 5
-    LOAD      1, 2, [t0+%1], [t0+%3], 1
-    JOIN      1, 2, 3, 4, [t2+%2], [t2+%4], 1
-    LOAD      3, 4, [t0+%1], [t0+%3], %5
-    JOIN      3, 4, 5, 6, [t2+%2], [t2+%4], %5
-%endmacro
-
-%macro SSD_CORE 7-8
-%ifidn %8, FULL
-    mova      m%6, m%2
-    mova      m%7, m%4
-    psubusb   m%2, m%1
-    psubusb   m%4, m%3
-    psubusb   m%1, m%6
-    psubusb   m%3, m%7
-    por       m%1, m%2
-    por       m%3, m%4
-    punpcklbw m%2, m%1, m%5
-    punpckhbw m%1, m%5
-    punpcklbw m%4, m%3, m%5
-    punpckhbw m%3, m%5
-%endif
-    pmaddwd   m%1, m%1
-    pmaddwd   m%2, m%2
-    pmaddwd   m%3, m%3
-    pmaddwd   m%4, m%4
-%endmacro
-
-%macro SSD_CORE_SSE2 7-8
-%ifidn %8, FULL
-    DEINTB %6, %1, %7, %2, %5
-    psubw m%6, m%7
-    psubw m%1, m%2
-    SWAP %6, %2, %1
-    DEINTB %6, %3, %7, %4, %5
-    psubw m%6, m%7
-    psubw m%3, m%4
-    SWAP %6, %4, %3
-%endif
-    pmaddwd   m%1, m%1
-    pmaddwd   m%2, m%2
-    pmaddwd   m%3, m%3
-    pmaddwd   m%4, m%4
-%endmacro
-
-%macro SSD_CORE_SSSE3 7-8
-%ifidn %8, FULL
-    punpckhbw m%6, m%1, m%2
-    punpckhbw m%7, m%3, m%4
-    punpcklbw m%1, m%2
-    punpcklbw m%3, m%4
-    SWAP %6, %2, %3
-    SWAP %7, %4
-%endif
-    pmaddubsw m%1, m%5
-    pmaddubsw m%2, m%5
-    pmaddubsw m%3, m%5
-    pmaddubsw m%4, m%5
-    pmaddwd   m%1, m%1
-    pmaddwd   m%2, m%2
-    pmaddwd   m%3, m%3
-    pmaddwd   m%4, m%4
-%endmacro
-
-%macro SSD_ITER 6
-    SSD_LOAD_%1 %2,%3,%4,%5,%6
-    SSD_CORE  1, 2, 3, 4, 7, 5, 6, %1
-    paddd     m1, m2
-    paddd     m3, m4
-    paddd     m0, m1
-    paddd     m0, m3
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-%macro SSD 2
-%if %1 != %2
-    %assign function_align 8
-%else
-    %assign function_align 16
-%endif
-cglobal pixel_ssd_%1x%2, 0,0,0
-    mov     al, %1*%2/mmsize/2
-
-%if %1 != %2
-    jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
-%else
-
-.startloop:
-%if ARCH_X86_64
-    DECLARE_REG_TMP 0,1,2,3
-    PROLOGUE 0,0,8
-%else
-    PROLOGUE 0,5
-    DECLARE_REG_TMP 1,2,3,4
-    mov t0, r0m
-    mov t1, r1m
-    mov t2, r2m
-    mov t3, r3m
-%endif
-
-%if cpuflag(ssse3)
-    mova    m7, [hsub_mul]
-%elifidn cpuname, sse2
-    mova    m7, [pw_00ff]
-%elif %1 >= mmsize
-    pxor    m7, m7
-%endif
-    pxor    m0, m0
-
-ALIGN 16
-.loop:
-%if %1 > mmsize
-    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
-%elif %1 == mmsize
-    SSD_ITER FULL, 0, 0, t1, t3, 2
-%else
-    SSD_ITER HALF, 0, 0, t1, t3, 2
-%endif
-    dec     al
-    jg .loop
-%if mmsize==32
-    vextracti128 xm1, m0, 1
-    paddd  xm0, xm1
-    HADDD  xm0, xm1
-    movd   eax, xm0
-%else
-    HADDD   m0, m1
-    movd   eax, m0
-%endif
-    RET
-%endif
-%endmacro
-
-INIT_MMX mmx
-SSD 16, 16
-SSD 16,  8
-SSD  8,  8
-SSD  8, 16
-SSD  4,  4
-SSD  8,  4
-SSD  4,  8
-SSD  4, 16
-INIT_XMM sse2slow
-SSD 16, 16
-SSD  8,  8
-SSD 16,  8
-SSD  8, 16
-SSD  8,  4
-INIT_XMM sse2
-%define SSD_CORE SSD_CORE_SSE2
-%define JOIN JOIN_SSE2
-SSD 16, 16
-SSD  8,  8
-SSD 16,  8
-SSD  8, 16
-SSD  8,  4
-INIT_XMM ssse3
-%define SSD_CORE SSD_CORE_SSSE3
-%define JOIN JOIN_SSSE3
-SSD 16, 16
-SSD  8,  8
-SSD 16,  8
-SSD  8, 16
-SSD  8,  4
-INIT_XMM avx
-SSD 16, 16
-SSD  8,  8
-SSD 16,  8
-SSD  8, 16
-SSD  8,  4
-INIT_MMX ssse3
-SSD  4,  4
-SSD  4,  8
-SSD  4, 16
-INIT_XMM xop
-SSD 16, 16
-SSD  8,  8
-SSD 16,  8
-SSD  8, 16
-SSD  8,  4
-%define LOAD LOAD_AVX2
-%define JOIN JOIN_AVX2
-INIT_YMM avx2
-SSD 16, 16
-SSD 16,  8
-%assign function_align 16
-%endif ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
-;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-;
-; The maximum width this function can handle without risk of overflow is given
-; in the following equation: (mmsize in bits)
-;
-;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
-;
-; For 10-bit XMM this means width >= 32832. At sane distortion levels
-; it will take much more than that though.
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7,7
-    shl        r4d, 2
-    FIX_STRIDES r1, r3
-    add         r0, r4
-    add         r2, r4
-    neg         r4
-    pxor        m4, m4
-    pxor        m5, m5
-%if mmsize == 32
-    vbroadcasti128 m6, [ssd_nv12_shuf]
-%endif
-.loopy:
-    mov         r6, r4
-    pxor        m2, m2
-    pxor        m3, m3
-.loopx:
-    mova        m0, [r0+r6]
-    mova        m1, [r0+r6+mmsize]
-    psubw       m0, [r2+r6]
-    psubw       m1, [r2+r6+mmsize]
-%if mmsize == 32
-    pshufb      m0, m6
-    pshufb      m1, m6
-%else
-    SBUTTERFLY wd, 0, 1, 6
-%endif
-%if cpuflag(xop)
-    pmadcswd    m2, m0, m0, m2
-    pmadcswd    m3, m1, m1, m3
-%else
-    pmaddwd     m0, m0
-    pmaddwd     m1, m1
-    paddd       m2, m0
-    paddd       m3, m1
-%endif
-    add         r6, 2*mmsize
-    jl .loopx
-%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
-    jz .no_overread
-    psubd       m3, m1
-.no_overread:
-%endif
-    punpckhdq   m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the
-    punpckhdq   m1, m3, m5 ; equation above, putting the width limit at 8208
-    punpckldq   m2, m5
-    punpckldq   m3, m5
-    paddq       m0, m1
-    paddq       m2, m3
-    paddq       m4, m0
-    paddq       m4, m2
-    add         r0, r1
-    add         r2, r3
-    dec        r5d
-    jg .loopy
-    mov         r0, r6m
-    mov         r1, r7m
-%if mmsize == 32
-    vextracti128 xm0, m4, 1
-    paddq      xm4, xm0
-%endif
-    movq      [r0], xm4
-    movhps    [r1], xm4
-    RET
-%endmacro ; SSD_NV12
-
-%else ; !HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
-;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-;
-; This implementation can potentially overflow on image widths >= 11008 (or
-; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
-; 20). At sane distortion levels it will take much more than that though.
-;-----------------------------------------------------------------------------
-%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7
-    add    r4d, r4d
-    add     r0, r4
-    add     r2, r4
-    neg     r4
-    pxor    m3, m3
-    pxor    m4, m4
-    mova    m5, [pw_00ff]
-.loopy:
-    mov     r6, r4
-.loopx:
-%if mmsize == 32 ; only 16-byte alignment is guaranteed
-    movu    m2, [r0+r6]
-    movu    m1, [r2+r6]
-%else
-    mova    m2, [r0+r6]
-    mova    m1, [r2+r6]
-%endif
-    psubusb m0, m2, m1
-    psubusb m1, m2
-    por     m0, m1
-    psrlw   m2, m0, 8
-    pand    m0, m5
-%if cpuflag(xop)
-    pmadcswd m4, m2, m2, m4
-    pmadcswd m3, m0, m0, m3
-%else
-    pmaddwd m2, m2
-    pmaddwd m0, m0
-    paddd   m4, m2
-    paddd   m3, m0
-%endif
-    add     r6, mmsize
-    jl .loopx
-%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
-    jz .no_overread
-    pcmpeqb xm1, xm1
-    pandn   m0, m1, m0 ; zero the lower half
-    pandn   m2, m1, m2
-    psubd   m3, m0
-    psubd   m4, m2
-.no_overread:
-%endif
-    add     r0, r1
-    add     r2, r3
-    dec    r5d
-    jg .loopy
-    mov     r0, r6m
-    mov     r1, r7m
-%if cpuflag(ssse3)
-    phaddd  m3, m4
-%else
-    SBUTTERFLY qdq, 3, 4, 0
-    paddd   m3, m4
-%endif
-%if mmsize == 32
-    vextracti128 xm4, m3, 1
-    paddd  xm3, xm4
-%endif
-    psllq  xm4, xm3, 32
-    paddd  xm3, xm4
-    psrlq  xm3, 32
-    movq  [r0], xm3
-    movhps [r1], xm3
-    RET
-%endmacro ; SSD_NV12
-%endif ; !HIGH_BIT_DEPTH
-
-INIT_XMM sse2
-SSD_NV12
-INIT_XMM avx
-SSD_NV12
-INIT_XMM xop
-SSD_NV12
-INIT_YMM avx2
-SSD_NV12
-
-;=============================================================================
-; variance
-;=============================================================================
-
-%macro VAR_START 1
-    pxor  m5, m5    ; sum
-    pxor  m6, m6    ; sum squared
-%if HIGH_BIT_DEPTH == 0
-%if %1
-    mova  m7, [pw_00ff]
-%elif mmsize < 32
-    pxor  m7, m7    ; zero
-%endif
-%endif ; !HIGH_BIT_DEPTH
-%endmacro
-
-%macro VAR_END 2
-%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
-    HADDUW  m5, m2
-%else
-    HADDW   m5, m2
-%endif
-    HADDD   m6, m1
-%if ARCH_X86_64
-    punpckldq m5, m6
-    movq   rax, m5
-%else
-    movd   eax, m5
-    movd   edx, m6
-%endif
-    RET
-%endmacro
-
-%macro VAR_CORE 0
-    paddw     m5, m0
-    paddw     m5, m3
-    paddw     m5, m1
-    paddw     m5, m4
-    pmaddwd   m0, m0
-    pmaddwd   m3, m3
-    pmaddwd   m1, m1
-    pmaddwd   m4, m4
-    paddd     m6, m0
-    paddd     m6, m3
-    paddd     m6, m1
-    paddd     m6, m4
-%endmacro
-
-%macro VAR_2ROW 2
-    mov      r2d, %2
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+mmsize]
-    mova      m3, [r0+%1]
-    mova      m4, [r0+%1+mmsize]
-%else ; !HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m3, [r0+%1]
-    punpckhbw m1, m0, m7
-    punpcklbw m0, m7
-    punpckhbw m4, m3, m7
-    punpcklbw m3, m7
-%endif ; HIGH_BIT_DEPTH
-%ifidn %1, r1
-    lea       r0, [r0+%1*2]
-%else
-    add       r0, r1
-%endif
-    VAR_CORE
-    dec r2d
-    jg .loop
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_var_wxh( uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_var_16x16, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW 8*SIZEOF_PIXEL, 16
-    VAR_END 16, 16
-
-cglobal pixel_var_8x16, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 8
-    VAR_END 8, 16
-
-cglobal pixel_var_8x8, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 4
-    VAR_END 8, 8
-
-%if HIGH_BIT_DEPTH
-%macro VAR 0
-cglobal pixel_var_16x16, 2,3,8
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 8
-    VAR_END 16, 16
-
-cglobal pixel_var_8x8, 2,3,8
-    lea       r2, [r1*3]
-    VAR_START 0
-    mova      m0, [r0]
-    mova      m1, [r0+r1*2]
-    mova      m3, [r0+r1*4]
-    mova      m4, [r0+r2*2]
-    lea       r0, [r0+r1*8]
-    VAR_CORE
-    mova      m0, [r0]
-    mova      m1, [r0+r1*2]
-    mova      m3, [r0+r1*4]
-    mova      m4, [r0+r2*2]
-    VAR_CORE
-    VAR_END 8, 8
-%endmacro ; VAR
-
-INIT_XMM sse2
-VAR
-INIT_XMM avx
-VAR
-INIT_XMM xop
-VAR
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-%macro VAR 0
-cglobal pixel_var_16x16, 2,3,8
-    VAR_START 1
-    mov      r2d, 8
-.loop:
-    mova      m0, [r0]
-    mova      m3, [r0+r1]
-    DEINTB    1, 0, 4, 3, 7
-    lea       r0, [r0+r1*2]
-    VAR_CORE
-    dec r2d
-    jg .loop
-    VAR_END 16, 16
-
-cglobal pixel_var_8x8, 2,4,8
-    VAR_START 1
-    mov      r2d, 2
-    lea       r3, [r1*3]
-.loop:
-    movh      m0, [r0]
-    movh      m3, [r0+r1]
-    movhps    m0, [r0+r1*2]
-    movhps    m3, [r0+r3]
-    DEINTB    1, 0, 4, 3, 7
-    lea       r0, [r0+r1*4]
-    VAR_CORE
-    dec r2d
-    jg .loop
-    VAR_END 8, 8
-
-cglobal pixel_var_8x16, 2,4,8
-    VAR_START 1
-    mov      r2d, 4
-    lea       r3, [r1*3]
-.loop:
-    movh      m0, [r0]
-    movh      m3, [r0+r1]
-    movhps    m0, [r0+r1*2]
-    movhps    m3, [r0+r3]
-    DEINTB    1, 0, 4, 3, 7
-    lea       r0, [r0+r1*4]
-    VAR_CORE
-    dec r2d
-    jg .loop
-    VAR_END 8, 16
-%endmacro ; VAR
-
-INIT_XMM sse2
-VAR
-INIT_XMM avx
-VAR
-INIT_XMM xop
-VAR
-%endif ; !HIGH_BIT_DEPTH
-
-INIT_YMM avx2
-cglobal pixel_var_16x16, 2,4,7
-    FIX_STRIDES r1
-    VAR_START 0
-    mov      r2d, 4
-    lea       r3, [r1*3]
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m3, [r0+r1]
-    mova      m1, [r0+r1*2]
-    mova      m4, [r0+r3]
-%else
-    pmovzxbw  m0, [r0]
-    pmovzxbw  m3, [r0+r1]
-    pmovzxbw  m1, [r0+r1*2]
-    pmovzxbw  m4, [r0+r3]
-%endif
-    lea       r0, [r0+r1*4]
-    VAR_CORE
-    dec r2d
-    jg .loop
-    vextracti128 xm0, m5, 1
-    vextracti128 xm1, m6, 1
-    paddw  xm5, xm0
-    paddd  xm6, xm1
-    HADDW  xm5, xm2
-    HADDD  xm6, xm1
-%if ARCH_X86_64
-    punpckldq xm5, xm6
-    movq   rax, xm5
-%else
-    movd   eax, xm5
-    movd   edx, xm6
-%endif
-    RET
-
-%macro VAR2_END 3
-    HADDW   %2, xm1
-    movd   r1d, %2
-    imul   r1d, r1d
-    HADDD   %3, xm1
-    shr    r1d, %1
-    movd   eax, %3
-    movd  [r4], %3
-    sub    eax, r1d  ; sqr - (sum * sum >> shift)
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
-;-----------------------------------------------------------------------------
-%macro VAR2_8x8_MMX 2
-cglobal pixel_var2_8x%1, 5,6
-    FIX_STRIDES r1, r3
-    VAR_START 0
-    mov      r5d, %1
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+mmsize]
-    psubw     m0, [r2]
-    psubw     m1, [r2+mmsize]
-%else ; !HIGH_BIT_DEPTH
-    movq      m0, [r0]
-    movq      m1, m0
-    movq      m2, [r2]
-    movq      m3, m2
-    punpcklbw m0, m7
-    punpckhbw m1, m7
-    punpcklbw m2, m7
-    punpckhbw m3, m7
-    psubw     m0, m2
-    psubw     m1, m3
-%endif ; HIGH_BIT_DEPTH
-    paddw     m5, m0
-    paddw     m5, m1
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    paddd     m6, m0
-    paddd     m6, m1
-    add       r0, r1
-    add       r2, r3
-    dec       r5d
-    jg .loop
-    VAR2_END %2, m5, m6
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-VAR2_8x8_MMX  8, 6
-VAR2_8x8_MMX 16, 7
-%endif
-
-%macro VAR2_8x8_SSE2 2
-cglobal pixel_var2_8x%1, 5,6,8
-    VAR_START 1
-    mov      r5d, %1/2
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+r1*2]
-    mova      m2, [r2]
-    mova      m3, [r2+r3*2]
-%else ; !HIGH_BIT_DEPTH
-    movq      m1, [r0]
-    movhps    m1, [r0+r1]
-    movq      m3, [r2]
-    movhps    m3, [r2+r3]
-    DEINTB    0, 1, 2, 3, 7
-%endif ; HIGH_BIT_DEPTH
-    psubw     m0, m2
-    psubw     m1, m3
-    paddw     m5, m0
-    paddw     m5, m1
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    paddd     m6, m0
-    paddd     m6, m1
-    lea       r0, [r0+r1*2*SIZEOF_PIXEL]
-    lea       r2, [r2+r3*2*SIZEOF_PIXEL]
-    dec      r5d
-    jg .loop
-    VAR2_END %2, m5, m6
-%endmacro
-
-INIT_XMM sse2
-VAR2_8x8_SSE2  8, 6
-VAR2_8x8_SSE2 16, 7
-
-%if HIGH_BIT_DEPTH == 0
-%macro VAR2_8x8_SSSE3 2
-cglobal pixel_var2_8x%1, 5,6,8
-    pxor      m5, m5    ; sum
-    pxor      m6, m6    ; sum squared
-    mova      m7, [hsub_mul]
-    mov      r5d, %1/4
-.loop:
-    movq      m0, [r0]
-    movq      m2, [r2]
-    movq      m1, [r0+r1]
-    movq      m3, [r2+r3]
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    punpcklbw m0, m2
-    punpcklbw m1, m3
-    movq      m2, [r0]
-    movq      m3, [r2]
-    punpcklbw m2, m3
-    movq      m3, [r0+r1]
-    movq      m4, [r2+r3]
-    punpcklbw m3, m4
-    pmaddubsw m0, m7
-    pmaddubsw m1, m7
-    pmaddubsw m2, m7
-    pmaddubsw m3, m7
-    paddw     m5, m0
-    paddw     m5, m1
-    paddw     m5, m2
-    paddw     m5, m3
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    pmaddwd   m2, m2
-    pmaddwd   m3, m3
-    paddd     m6, m0
-    paddd     m6, m1
-    paddd     m6, m2
-    paddd     m6, m3
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    dec      r5d
-    jg .loop
-    VAR2_END %2, m5, m6
-%endmacro
-
-INIT_XMM ssse3
-VAR2_8x8_SSSE3  8, 6
-VAR2_8x8_SSSE3 16, 7
-INIT_XMM xop
-VAR2_8x8_SSSE3  8, 6
-VAR2_8x8_SSSE3 16, 7
-
-%macro VAR2_8x8_AVX2 2
-cglobal pixel_var2_8x%1, 5,6,6
-    pxor      m3, m3    ; sum
-    pxor      m4, m4    ; sum squared
-    mova      m5, [hsub_mul]
-    mov      r5d, %1/4
-.loop:
-    movq     xm0, [r0]
-    movq     xm1, [r2]
-    vinserti128 m0, m0, [r0+r1], 1
-    vinserti128 m1, m1, [r2+r3], 1
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    punpcklbw m0, m1
-    movq     xm1, [r0]
-    movq     xm2, [r2]
-    vinserti128 m1, m1, [r0+r1], 1
-    vinserti128 m2, m2, [r2+r3], 1
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    punpcklbw m1, m2
-    pmaddubsw m0, m5
-    pmaddubsw m1, m5
-    paddw     m3, m0
-    paddw     m3, m1
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    paddd     m4, m0
-    paddd     m4, m1
-    dec      r5d
-    jg .loop
-    vextracti128 xm0, m3, 1
-    vextracti128 xm1, m4, 1
-    paddw    xm3, xm0
-    paddd    xm4, xm1
-    VAR2_END %2, xm3, xm4
-%endmacro
-
-INIT_YMM avx2
-VAR2_8x8_AVX2  8, 6
-VAR2_8x8_AVX2 16, 7
-
-%endif ; !HIGH_BIT_DEPTH
-
-;=============================================================================
-; SATD
-;=============================================================================
-
-%macro JDUP 2
-%if cpuflag(sse4)
-    ; just use shufps on anything post conroe
-    shufps %1, %2, 0
-%elif cpuflag(ssse3) && notcpuflag(atom)
-    ; join 2x 32 bit and duplicate them
-    ; emulating shufps is faster on conroe
-    punpcklqdq %1, %2
-    movsldup %1, %1
-%else
-    ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
-    punpckldq %1, %2
-%endif
-%endmacro
-
-%macro HSUMSUB 5
-    pmaddubsw m%2, m%5
-    pmaddubsw m%1, m%5
-    pmaddubsw m%4, m%5
-    pmaddubsw m%3, m%5
-%endmacro
-
-%macro DIFF_UNPACK_SSE2 5
-    punpcklbw m%1, m%5
-    punpcklbw m%2, m%5
-    punpcklbw m%3, m%5
-    punpcklbw m%4, m%5
-    psubw m%1, m%2
-    psubw m%3, m%4
-%endmacro
-
-%macro DIFF_SUMSUB_SSSE3 5
-    HSUMSUB %1, %2, %3, %4, %5
-    psubw m%1, m%2
-    psubw m%3, m%4
-%endmacro
-
-%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
-    movd %1, %3
-    movd %2, %4
-    JDUP %1, %2
-%endmacro
-
-%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
-    movddup m%3, %6
-    movddup m%4, %8
-    movddup m%1, %5
-    movddup m%2, %7
-%endmacro
-
-%macro LOAD_DUP_4x8P_PENRYN 8
-    ; penryn and nehalem run punpcklqdq and movddup in different units
-    movh m%3, %6
-    movh m%4, %8
-    punpcklqdq m%3, m%3
-    movddup m%1, %5
-    punpcklqdq m%4, m%4
-    movddup m%2, %7
-%endmacro
-
-%macro LOAD_SUMSUB_8x2P 9
-    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
-    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
-%endmacro
-
-%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
-; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
-    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
-    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
-%if %10
-    lea %8, [%8+4*r1]
-    lea %9, [%9+4*r3]
-%endif
-%endmacro
-
-%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
-    movddup m%1, [%7]
-    movddup m%2, [%7+8]
-    mova m%4, [%6]
-    movddup m%3, m%4
-    punpckhqdq m%4, m%4
-    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
-%endmacro
-
-%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
-    movu  m%4, [%7]
-    mova  m%2, [%6]
-    DEINTB %1, %2, %3, %4, %5
-    psubw m%1, m%3
-    psubw m%2, m%4
-    SUMSUB_BA w, %1, %2, %3
-%endmacro
-
-%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
-; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
-    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
-    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
-    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
-    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
-%endmacro
-
-%macro LOAD_SUMSUB_16x2P_AVX2 9
-; 2*dst, 2*tmp, mul, 4*ptr
-    vbroadcasti128 m%1, [%6]
-    vbroadcasti128 m%3, [%7]
-    vbroadcasti128 m%2, [%8]
-    vbroadcasti128 m%4, [%9]
-    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
-%endmacro
-
-%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
-; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
-    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
-    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
-%if %10
-    lea  %8, [%8+4*r1]
-    lea  %9, [%9+4*r3]
-%endif
-%endmacro
-
-%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
-    mova  xm%3, %6
-    mova  xm%4, %8
-    mova  xm%1, %5
-    mova  xm%2, %7
-    vpermq m%3, m%3, q0011
-    vpermq m%4, m%4, q0011
-    vpermq m%1, m%1, q0011
-    vpermq m%2, m%2, q0011
-%endmacro
-
-%macro LOAD_SUMSUB8_16x2P_AVX2 9
-; 2*dst, 2*tmp, mul, 4*ptr
-    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
-    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
-%endmacro
-
-%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
-; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
-    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
-    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
-%if %10
-    lea  %8, [%8+4*r1]
-    lea  %9, [%9+4*r3]
-%endif
-%endmacro
-
-; in: r4=3*stride1, r5=3*stride2
-; in: %2 = horizontal offset
-; in: %3 = whether we need to increment pix1 and pix2
-; clobber: m3..m7
-; out: %1 = satd
-%macro SATD_4x4_MMX 3
-    %xdefine %%n nn%1
-    %assign offset %2*SIZEOF_PIXEL
-    LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
-    LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
-    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
-    LOAD_DIFF m7, m3, none, [r0+  r4+offset], [r2+  r5+offset]
-%if %3
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-%endif
-    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
-    paddw m4, m6
-    SWAP %%n, 4
-%endmacro
-
-; in: %1 = horizontal if 0, vertical if 1
-%macro SATD_8x4_SSE 8-9
-%if %1
-    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
-%else
-    HADAMARD4_V %2, %3, %4, %5, %6
-    ; doing the abs first is a slight advantage
-    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
-    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
-    HADAMARD 1, max, %2, %4, %6, %7
-%endif
-%ifnidn %9, swap
-    paddw m%8, m%2
-%else
-    SWAP %8, %2
-%endif
-%if %1
-    paddw m%8, m%4
-%else
-    HADAMARD 1, max, %3, %5, %6, %7
-    paddw m%8, m%3
-%endif
-%endmacro
-
-%macro SATD_START_MMX 0
-    FIX_STRIDES r1, r3
-    lea  r4, [3*r1] ; 3*stride1
-    lea  r5, [3*r3] ; 3*stride2
-%endmacro
-
-%macro SATD_END_MMX 0
-%if HIGH_BIT_DEPTH
-    HADDUW      m0, m1
-    movd       eax, m0
-%else ; !HIGH_BIT_DEPTH
-    pshufw      m1, m0, q1032
-    paddw       m0, m1
-    pshufw      m1, m0, q2301
-    paddw       m0, m1
-    movd       eax, m0
-    and        eax, 0xffff
-%endif ; HIGH_BIT_DEPTH
-    RET
-%endmacro
-
-; FIXME avoid the spilling of regs to hold 3*stride.
-; for small blocks on x86_32, modify pixel pointer instead.
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    SATD_4x4_MMX m2,  8, 0
-    paddw        m0, m1
-    SATD_4x4_MMX m1, 12, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-cglobal pixel_satd_8x8_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 1
-    paddw        m0, m2
-    paddw        m0, m1
-pixel_satd_8x4_internal_mmx2:
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_%1x%3_internal_mmx2
-    HADDUW m0, m1
-    movd  r6d, m0
-%rep %2/%3-1
-    pxor   m0, m0
-    lea    r0, [r0+4*r1]
-    lea    r2, [r2+4*r3]
-    call pixel_satd_%1x%3_internal_mmx2
-    movd   m2, r4
-    HADDUW m0, m1
-    movd   r4, m0
-    add    r6, r4
-    movd   r4, m2
-%endrep
-    movifnidn eax, r6d
-    RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4
-SATD_MxN_MMX 16,  8, 4
-SATD_MxN_MMX  8, 16, 8
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-cglobal pixel_satd_16x16, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-%rep 3
-    call pixel_satd_16x4_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-%endrep
-    call pixel_satd_16x4_internal_mmx2
-    HADDUW m0, m1
-    movd  eax, m0
-    RET
-
-cglobal pixel_satd_16x8, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_16x4_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_16x4_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_8x16, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x8_internal_mmx2
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_8x8_internal_mmx2
-    SATD_END_MMX
-%endif ; !HIGH_BIT_DEPTH
-
-cglobal pixel_satd_8x8, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x8_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_8x4, 4,6
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_8x4_internal_mmx2
-    SATD_END_MMX
-
-cglobal pixel_satd_4x16, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 1
-    SATD_4x4_MMX m1, 0, 1
-    paddw  m0, m1
-    SATD_4x4_MMX m1, 0, 1
-    paddw  m0, m1
-    SATD_4x4_MMX m1, 0, 0
-    paddw  m0, m1
-    SATD_END_MMX
-
-cglobal pixel_satd_4x8, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 1
-    SATD_4x4_MMX m1, 0, 0
-    paddw  m0, m1
-    SATD_END_MMX
-
-cglobal pixel_satd_4x4, 4,6
-    SATD_START_MMX
-    SATD_4x4_MMX m0, 0, 0
-    SATD_END_MMX
-
-%macro SATD_START_SSE2 2-3 0
-    FIX_STRIDES r1, r3
-%if HIGH_BIT_DEPTH && %3
-    pxor    %2, %2
-%elif cpuflag(ssse3) && notcpuflag(atom)
-%if mmsize==32
-    mova    %2, [hmul_16p]
-%else
-    mova    %2, [hmul_8p]
-%endif
-%endif
-    lea     r4, [3*r1]
-    lea     r5, [3*r3]
-    pxor    %1, %1
-%endmacro
-
-%macro SATD_END_SSE2 1-2
-%if HIGH_BIT_DEPTH
-    HADDUW  %1, xm0
-%if %0 == 2
-    paddd   %1, %2
-%endif
-%else
-    HADDW   %1, xm7
-%endif
-    movd   eax, %1
-    RET
-%endmacro
-
-%macro SATD_ACCUM 3
-%if HIGH_BIT_DEPTH
-    HADDUW %1, %2
-    paddd  %3, %1
-    pxor   %1, %1
-%endif
-%endmacro
-
-%macro BACKUP_POINTERS 0
-%if ARCH_X86_64
-%if WIN64
-    PUSH r7
-%endif
-    mov     r6, r0
-    mov     r7, r2
-%endif
-%endmacro
-
-%macro RESTORE_AND_INC_POINTERS 0
-%if ARCH_X86_64
-    lea     r0, [r6+8*SIZEOF_PIXEL]
-    lea     r2, [r7+8*SIZEOF_PIXEL]
-%if WIN64
-    POP r7
-%endif
-%else
-    mov     r0, r0mp
-    mov     r2, r2mp
-    add     r0, 8*SIZEOF_PIXEL
-    add     r2, 8*SIZEOF_PIXEL
-%endif
-%endmacro
-
-%macro SATD_4x8_SSE 3
-%if HIGH_BIT_DEPTH
-    movh    m0, [r0+0*r1]
-    movh    m4, [r2+0*r3]
-    movh    m1, [r0+1*r1]
-    movh    m5, [r2+1*r3]
-    movhps  m0, [r0+4*r1]
-    movhps  m4, [r2+4*r3]
-    movh    m2, [r0+2*r1]
-    movh    m6, [r2+2*r3]
-    psubw   m0, m4
-    movh    m3, [r0+r4]
-    movh    m4, [r2+r5]
-    lea     r0, [r0+4*r1]
-    lea     r2, [r2+4*r3]
-    movhps  m1, [r0+1*r1]
-    movhps  m5, [r2+1*r3]
-    movhps  m2, [r0+2*r1]
-    movhps  m6, [r2+2*r3]
-    psubw   m1, m5
-    movhps  m3, [r0+r4]
-    movhps  m4, [r2+r5]
-    psubw   m2, m6
-    psubw   m3, m4
-%else ; !HIGH_BIT_DEPTH
-    movd m4, [r2]
-    movd m5, [r2+r3]
-    movd m6, [r2+2*r3]
-    add r2, r5
-    movd m0, [r0]
-    movd m1, [r0+r1]
-    movd m2, [r0+2*r1]
-    add r0, r4
-    movd m3, [r2+r3]
-    JDUP m4, m3
-    movd m3, [r0+r1]
-    JDUP m0, m3
-    movd m3, [r2+2*r3]
-    JDUP m5, m3
-    movd m3, [r0+2*r1]
-    JDUP m1, m3
-%if %1==0 && %2==1
-    mova m3, [hmul_4p]
-    DIFFOP 0, 4, 1, 5, 3
-%else
-    DIFFOP 0, 4, 1, 5, 7
-%endif
-    movd m5, [r2]
-    add r2, r5
-    movd m3, [r0]
-    add r0, r4
-    movd m4, [r2]
-    JDUP m6, m4
-    movd m4, [r0]
-    JDUP m2, m4
-    movd m4, [r2+r3]
-    JDUP m5, m4
-    movd m4, [r0+r1]
-    JDUP m3, m4
-%if %1==0 && %2==1
-    mova m4, [hmul_4p]
-    DIFFOP 2, 6, 3, 5, 4
-%else
-    DIFFOP 2, 6, 3, 5, 7
-%endif
-%endif ; HIGH_BIT_DEPTH
-    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-%macro SATDS_SSE2 0
-%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
-
-%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
-cglobal pixel_satd_4x4, 4, 6, 6
-    SATD_START_MMX
-    mova m4, [hmul_4p]
-    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
-    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
-    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
-    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
-    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
-    HADAMARD 0, sumsub, 0, 1, 2, 3
-    HADAMARD 4, sumsub, 0, 1, 2, 3
-    HADAMARD 1, amax, 0, 1, 2, 3
-    HADDW m0, m1
-    movd eax, m0
-    RET
-%endif
-
-cglobal pixel_satd_4x8, 4, 6, 8
-    SATD_START_MMX
-%if vertical==0
-    mova m7, [hmul_4p]
-%endif
-    SATD_4x8_SSE vertical, 0, swap
-    HADDW m7, m1
-    movd eax, m7
-    RET
-
-cglobal pixel_satd_4x16, 4, 6, 8
-    SATD_START_MMX
-%if vertical==0
-    mova m7, [hmul_4p]
-%endif
-    SATD_4x8_SSE vertical, 0, swap
-    lea r0, [r0+r1*2*SIZEOF_PIXEL]
-    lea r2, [r2+r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
-    HADDW m7, m1
-    movd eax, m7
-    RET
-
-cglobal pixel_satd_8x8_internal
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
-    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
-%%pixel_satd_8x4_internal:
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
-    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
-    ret
-
-; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
-; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
-%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
-cglobal pixel_satd_16x4_internal
-    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
-    lea  r2, [r2+4*r3]
-    lea  r0, [r0+4*r1]
-    ; always use horizontal mode here
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
-    SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
-    ret
-
-cglobal pixel_satd_16x8, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    jmp %%pixel_satd_16x8_internal
-
-cglobal pixel_satd_16x16, 4,6,12
-    SATD_START_SSE2 m10, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-%%pixel_satd_16x8_internal:
-    call pixel_satd_16x4_internal
-    call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
-%else
-cglobal pixel_satd_16x8, 4,6,8
-    SATD_START_SSE2 m6, m7
-    BACKUP_POINTERS
-    call pixel_satd_8x8_internal
-    RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
-cglobal pixel_satd_16x16, 4,6,8
-    SATD_START_SSE2 m6, m7, 1
-    BACKUP_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
-    RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
-%endif
-
-cglobal pixel_satd_8x16, 4,6,8
-    SATD_START_SSE2 m6, m7
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
-cglobal pixel_satd_8x8, 4,6,8
-    SATD_START_SSE2 m6, m7
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
-cglobal pixel_satd_8x4, 4,6,8
-    SATD_START_SSE2 m6, m7
-    call %%pixel_satd_8x4_internal
-    SATD_END_SSE2 m6
-%endmacro ; SATDS_SSE2
-
-%macro SA8D_INTER 0
-%if ARCH_X86_64
-    %define lh m10
-    %define rh m0
-%else
-    %define lh m0
-    %define rh [esp+48]
-%endif
-%if HIGH_BIT_DEPTH
-    HADDUW  m0, m1
-    paddd   lh, rh
-%else
-    paddusw lh, rh
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro SA8D 0
-; sse2 doesn't seem to like the horizontal way of doing things
-%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-cglobal pixel_sa8d_8x8_internal
-    lea  r6, [r0+4*r1]
-    lea  r7, [r2+4*r3]
-    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
-    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
-%if vertical
-    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
-%else ; non-sse2
-    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
-%endif
-    paddw m0, m1
-    paddw m0, m2
-    paddw m0, m8
-    SAVE_MM_PERMUTATION
-    ret
-
-cglobal pixel_sa8d_8x8, 4,8,12
-    FIX_STRIDES r1, r3
-    lea  r4, [3*r1]
-    lea  r5, [3*r3]
-%if vertical == 0
-    mova m7, [hmul_8p]
-%endif
-    call pixel_sa8d_8x8_internal
-%if HIGH_BIT_DEPTH
-    HADDUW m0, m1
-%else
-    HADDW m0, m1
-%endif ; HIGH_BIT_DEPTH
-    movd eax, m0
-    add eax, 1
-    shr eax, 1
-    RET
-
-cglobal pixel_sa8d_16x16, 4,8,12
-    FIX_STRIDES r1, r3
-    lea  r4, [3*r1]
-    lea  r5, [3*r3]
-%if vertical == 0
-    mova m7, [hmul_8p]
-%endif
-    call pixel_sa8d_8x8_internal ; pix[0]
-    add  r2, 8*SIZEOF_PIXEL
-    add  r0, 8*SIZEOF_PIXEL
-%if HIGH_BIT_DEPTH
-    HADDUW m0, m1
-%endif
-    mova m10, m0
-    call pixel_sa8d_8x8_internal ; pix[8]
-    lea  r2, [r2+8*r3]
-    lea  r0, [r0+8*r1]
-    SA8D_INTER
-    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
-    sub  r2, 8*SIZEOF_PIXEL
-    sub  r0, 8*SIZEOF_PIXEL
-    SA8D_INTER
-    call pixel_sa8d_8x8_internal ; pix[8*stride]
-    SA8D_INTER
-    SWAP 0, 10
-%if HIGH_BIT_DEPTH == 0
-    HADDUW m0, m1
-%endif
-    movd eax, m0
-    add  eax, 1
-    shr  eax, 1
-    RET
-
-%else ; ARCH_X86_32
-%if mmsize == 16
-cglobal pixel_sa8d_8x8_internal
-    %define spill0 [esp+4]
-    %define spill1 [esp+20]
-    %define spill2 [esp+36]
-%if vertical
-    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
-    HADAMARD4_2D 0, 1, 2, 3, 4
-    movdqa spill0, m3
-    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
-    HADAMARD4_2D 4, 5, 6, 7, 3
-    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
-    movdqa m3, spill0
-    paddw m0, m1
-    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
-%else ; mmsize == 8
-    mova m7, [hmul_8p]
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
-    ; could do first HADAMARD4_V here to save spilling later
-    ; surprisingly, not a win on conroe or even p4
-    mova spill0, m2
-    mova spill1, m3
-    mova spill2, m1
-    SWAP 1, 7
-    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
-    HADAMARD4_V 4, 5, 6, 7, 3
-    mova m1, spill2
-    mova m2, spill0
-    mova m3, spill1
-    mova spill0, m6
-    mova spill1, m7
-    HADAMARD4_V 0, 1, 2, 3, 7
-    SUMSUB_BADC w, 0, 4, 1, 5, 7
-    HADAMARD 2, sumsub, 0, 4, 7, 6
-    HADAMARD 2, sumsub, 1, 5, 7, 6
-    HADAMARD 1, amax, 0, 4, 7, 6
-    HADAMARD 1, amax, 1, 5, 7, 6
-    mova m6, spill0
-    mova m7, spill1
-    paddw m0, m1
-    SUMSUB_BADC w, 2, 6, 3, 7, 4
-    HADAMARD 2, sumsub, 2, 6, 4, 5
-    HADAMARD 2, sumsub, 3, 7, 4, 5
-    HADAMARD 1, amax, 2, 6, 4, 5
-    HADAMARD 1, amax, 3, 7, 4, 5
-%endif ; sse2/non-sse2
-    paddw m0, m2
-    paddw m0, m3
-    SAVE_MM_PERMUTATION
-    ret
-%endif ; ifndef mmx2
-
-cglobal pixel_sa8d_8x8, 4,7
-    FIX_STRIDES r1, r3
-    mov    r6, esp
-    and   esp, ~15
-    sub   esp, 48
-    lea    r4, [3*r1]
-    lea    r5, [3*r3]
-    call pixel_sa8d_8x8_internal
-%if HIGH_BIT_DEPTH
-    HADDUW m0, m1
-%else
-    HADDW  m0, m1
-%endif ; HIGH_BIT_DEPTH
-    movd  eax, m0
-    add   eax, 1
-    shr   eax, 1
-    mov   esp, r6
-    RET
-
-cglobal pixel_sa8d_16x16, 4,7
-    FIX_STRIDES r1, r3
-    mov  r6, esp
-    and  esp, ~15
-    sub  esp, 64
-    lea  r4, [3*r1]
-    lea  r5, [3*r3]
-    call pixel_sa8d_8x8_internal
-%if mmsize == 8
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-%endif
-%if HIGH_BIT_DEPTH
-    HADDUW m0, m1
-%endif
-    mova [esp+48], m0
-    call pixel_sa8d_8x8_internal
-    mov  r0, [r6+20]
-    mov  r2, [r6+28]
-    add  r0, 8*SIZEOF_PIXEL
-    add  r2, 8*SIZEOF_PIXEL
-    SA8D_INTER
-    mova [esp+48], m0
-    call pixel_sa8d_8x8_internal
-%if mmsize == 8
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-%else
-    SA8D_INTER
-%endif
-    mova [esp+64-mmsize], m0
-    call pixel_sa8d_8x8_internal
-%if HIGH_BIT_DEPTH
-    SA8D_INTER
-%else ; !HIGH_BIT_DEPTH
-    paddusw m0, [esp+64-mmsize]
-%if mmsize == 16
-    HADDUW m0, m1
-%else
-    mova m2, [esp+48]
-    pxor m7, m7
-    mova m1, m0
-    mova m3, m2
-    punpcklwd m0, m7
-    punpckhwd m1, m7
-    punpcklwd m2, m7
-    punpckhwd m3, m7
-    paddd m0, m1
-    paddd m2, m3
-    paddd m0, m2
-    HADDD m0, m1
-%endif
-%endif ; HIGH_BIT_DEPTH
-    movd eax, m0
-    add  eax, 1
-    shr  eax, 1
-    mov  esp, r6
-    RET
-%endif ; !ARCH_X86_64
-%endmacro ; SA8D
-
-;=============================================================================
-; SA8D_SATD
-;=============================================================================
-
-; %1: vertical/horizontal mode
-; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
-; m10: satd result
-; m6, m11-15: tmp regs
-%macro SA8D_SATD_8x4 5
-%if %1
-    LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
-    HADAMARD   0, sumsub, %2, %3, 6
-    HADAMARD   0, sumsub, %4, %5, 6
-    SBUTTERFLY        wd, %2, %3, 6
-    SBUTTERFLY        wd, %4, %5, 6
-    HADAMARD2_2D  %2, %4, %3, %5, 6, dq
-
-    mova   m12, m%2
-    mova   m13, m%3
-    mova   m14, m%4
-    mova   m15, m%5
-    HADAMARD 0, sumsub, %2, %3, 6
-    HADAMARD 0, sumsub, %4, %5, 6
-    SBUTTERFLY     qdq, 12, 13, 6
-    HADAMARD   0, amax, 12, 13, 6
-    SBUTTERFLY     qdq, 14, 15, 6
-    paddw m10, m12
-    HADAMARD   0, amax, 14, 15, 6
-    paddw m10, m14
-%else
-    LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
-    HADAMARD4_V %2, %3, %4, %5, 6
-
-    pabsw    m12, m%2 ; doing the abs first is a slight advantage
-    pabsw    m14, m%4
-    pabsw    m13, m%3
-    pabsw    m15, m%5
-    HADAMARD 1, max, 12, 14, 6, 11
-    paddw    m10, m12
-    HADAMARD 1, max, 13, 15, 6, 11
-    paddw    m10, m13
-%endif
-%endmacro ; SA8D_SATD_8x4
-
-; %1: add spilled regs?
-; %2: spill regs?
-%macro SA8D_SATD_ACCUM 2
-%if HIGH_BIT_DEPTH
-    pmaddwd m10, [pw_1]
-    HADDUWD  m0, m1
-%if %1
-    paddd   m10, temp1
-    paddd    m0, temp0
-%endif
-%if %2
-    mova  temp1, m10
-    pxor    m10, m10
-%endif
-%elif %1
-    paddw    m0, temp0
-%endif
-%if %2
-    mova  temp0, m0
-%endif
-%endmacro
-
-%macro SA8D_SATD 0
-%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
-cglobal pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_8x4 vertical, 0, 1, 2, 3
-    SA8D_SATD_8x4 vertical, 4, 5, 8, 9
-
-%if vertical ; sse2-style
-    HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
-    HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
-%else        ; complete sa8d
-    SUMSUB_BADC w, 0, 4, 1, 5, 12
-    HADAMARD 2, sumsub, 0, 4, 12, 11
-    HADAMARD 2, sumsub, 1, 5, 12, 11
-    SUMSUB_BADC w, 2, 8, 3, 9, 12
-    HADAMARD 2, sumsub, 2, 8, 12, 11
-    HADAMARD 2, sumsub, 3, 9, 12, 11
-    HADAMARD 1, amax, 0, 4, 12, 11
-    HADAMARD 1, amax, 1, 5, 12, 4
-    HADAMARD 1, amax, 2, 8, 12, 4
-    HADAMARD 1, amax, 3, 9, 12, 4
-%endif
-
-    ; create sa8d sub results
-    paddw    m1, m2
-    paddw    m0, m3
-    paddw    m0, m1
-
-    SAVE_MM_PERMUTATION
-    ret
-
-;-------------------------------------------------------------------------------
-; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
-;-------------------------------------------------------------------------------
-cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
-    %define temp0 [rsp+0*mmsize]
-    %define temp1 [rsp+1*mmsize]
-    FIX_STRIDES r1, r3
-%if vertical==0
-    mova     m7, [hmul_8p]
-%endif
-    lea      r4, [3*r1]
-    lea      r5, [3*r3]
-    pxor    m10, m10
-
-%if mmsize==32
-    call pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_ACCUM 0, 1
-    call pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_ACCUM 1, 0
-    vextracti128 xm1, m0, 1
-    vextracti128 xm2, m10, 1
-    paddw   xm0, xm1
-    paddw  xm10, xm2
-%else
-    lea      r6, [r2+8*SIZEOF_PIXEL]
-    lea      r7, [r0+8*SIZEOF_PIXEL]
-
-    call pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_ACCUM 0, 1
-    call pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_ACCUM 1, 1
-
-    mov      r0, r7
-    mov      r2, r6
-
-    call pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_ACCUM 1, 1
-    call pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_ACCUM 1, 0
-%endif
-
-; xop already has fast horizontal sums
-%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
-    pmaddwd xm10, [pw_1]
-    HADDUWD xm0, xm1
-    phaddd  xm0, xm10       ;  sa8d1  sa8d2  satd1  satd2
-    pshufd  xm1, xm0, q2301 ;  sa8d2  sa8d1  satd2  satd1
-    paddd   xm0, xm1        ;   sa8d   sa8d   satd   satd
-    movd    r0d, xm0
-    pextrd  eax, xm0, 2
-%else
-%if HIGH_BIT_DEPTH
-    HADDD   xm0, xm1
-    HADDD  xm10, xm2
-%else
-    HADDUW  xm0, xm1
-    HADDW  xm10, xm2
-%endif
-    movd    r0d, xm0
-    movd    eax, xm10
-%endif
-    add     r0d, 1
-    shl     rax, 32
-    shr     r0d, 1
-    or      rax, r0
-    RET
-%endmacro ; SA8D_SATD
-
-;=============================================================================
-; INTRA SATD
-;=============================================================================
-
-%macro HSUMSUB2 8
-    pshufd %4, %2, %7
-    pshufd %5, %3, %7
-    %1     %2, %8
-    %1     %6, %8
-    paddw  %2, %4
-    paddw  %3, %5
-%endmacro
-
-; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
-; and are only retained for old cpus.
-%macro INTRA_SA8D_SSE2 0
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
-;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8, 3,3,14
-    ; 8x8 hadamard
-    pxor        m8, m8
-    movq        m0, [r0+0*FENC_STRIDE]
-    movq        m1, [r0+1*FENC_STRIDE]
-    movq        m2, [r0+2*FENC_STRIDE]
-    movq        m3, [r0+3*FENC_STRIDE]
-    movq        m4, [r0+4*FENC_STRIDE]
-    movq        m5, [r0+5*FENC_STRIDE]
-    movq        m6, [r0+6*FENC_STRIDE]
-    movq        m7, [r0+7*FENC_STRIDE]
-    punpcklbw   m0, m8
-    punpcklbw   m1, m8
-    punpcklbw   m2, m8
-    punpcklbw   m3, m8
-    punpcklbw   m4, m8
-    punpcklbw   m5, m8
-    punpcklbw   m6, m8
-    punpcklbw   m7, m8
-
-    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
-
-    ABSW2       m8,  m9,  m2, m3, m2, m3
-    ABSW2       m10, m11, m4, m5, m4, m5
-    paddusw     m8,  m10
-    paddusw     m9,  m11
-    ABSW2       m10, m11, m6, m7, m6, m7
-    ABSW        m13, m1,  m1
-    paddusw     m10, m11
-    paddusw     m8,  m9
-    paddusw     m13, m10
-    paddusw     m13, m8
-
-    ; 1D hadamard of edges
-    movq        m8,  [r1+7]
-    movq        m9,  [r1+16]
-    pxor        m10, m10
-    punpcklbw   m8,  m10
-    punpcklbw   m9,  m10
-    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
-    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
-    pshuflw     m10, m8,  q2301
-    pshuflw     m11, m9,  q2301
-    pshufhw     m10, m10, q2301
-    pshufhw     m11, m11, q2301
-    pmullw      m8,  [pw_pmpmpmpm]
-    pmullw      m11, [pw_pmpmpmpm]
-    paddw       m8,  m10
-    paddw       m9,  m11
-
-    ; differences
-    paddw       m10, m8, m9
-    paddw       m10, [pw_8]
-    pand        m10, [sw_f0]
-    psllw       m10, 2 ; dc
-
-    psllw       m8,  3 ; left edge
-    psubw       m8,  m0
-    psubw       m10, m0
-    ABSW2       m8, m10, m8, m10, m11, m12 ; 1x8 sum
-    paddusw     m8,  m13
-    paddusw     m13, m10
-    punpcklwd   m0,  m1
-    punpcklwd   m2,  m3
-    punpcklwd   m4,  m5
-    punpcklwd   m6,  m7
-    punpckldq   m0,  m2
-    punpckldq   m4,  m6
-    punpcklqdq  m0,  m4 ; transpose
-    psllw       m9,  3 ; top edge
-    psrldq      m2,  m13, 2 ; 8x7 sum
-    psubw       m0,  m9  ; 8x1 sum
-    ABSW        m0,  m0,  m9
-    paddusw     m2,  m0
-
-    ; 3x HADDW
-    movdqa      m7,  [pw_1]
-    pmaddwd     m2,  m7
-    pmaddwd     m8,  m7
-    pmaddwd     m13, m7
-    punpckhdq   m3,  m2, m8
-    punpckldq   m2,  m8
-    pshufd      m5,  m13, q3311
-    paddd       m2,  m3
-    paddd       m5,  m13
-    punpckhqdq  m0,  m2, m5
-    punpcklqdq  m2,  m5
-    pavgw       m0,  m2
-    pxor        m1,  m1
-    pavgw       m0,  m1
-    movq      [r2], m0 ; i8x8_v, i8x8_h
-    psrldq      m0, 8
-    movd    [r2+8], m0 ; i8x8_dc
-    RET
-%endif ; ARCH_X86_64
-%endmacro ; INTRA_SA8D_SSE2
-
-; in: r0 = fenc
-; out: m0..m3 = hadamard coefs
-INIT_MMX
-cglobal hadamard_load
-; not really a global, but otherwise cycles get attributed to the wrong function in profiling
-%if HIGH_BIT_DEPTH
-    mova        m0, [r0+0*FENC_STRIDEB]
-    mova        m1, [r0+1*FENC_STRIDEB]
-    mova        m2, [r0+2*FENC_STRIDEB]
-    mova        m3, [r0+3*FENC_STRIDEB]
-%else
-    pxor        m7, m7
-    movd        m0, [r0+0*FENC_STRIDE]
-    movd        m1, [r0+1*FENC_STRIDE]
-    movd        m2, [r0+2*FENC_STRIDE]
-    movd        m3, [r0+3*FENC_STRIDE]
-    punpcklbw   m0, m7
-    punpcklbw   m1, m7
-    punpcklbw   m2, m7
-    punpcklbw   m3, m7
-%endif
-    HADAMARD4_2D 0, 1, 2, 3, 4
-    SAVE_MM_PERMUTATION
-    ret
-
-%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
-%ifidn %1, top
-%if HIGH_BIT_DEPTH
-    mova        %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
-%else
-    movd        %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
-    pxor        %5, %5
-    punpcklbw   %3, %5
-%endif
-%else ; left
-%ifnidn %2, 0
-    shl         %2d, 5 ; log(FDEC_STRIDEB)
-%endif
-    movd        %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
-    pinsrw      %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
-    pinsrw      %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
-    pinsrw      %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
-%if HIGH_BIT_DEPTH == 0
-    psrlw       %3, 8
-%endif
-%ifnidn %2, 0
-    shr         %2d, 5
-%endif
-%endif ; direction
-%if cpuflag(ssse3)
-    %define %%sign psignw
-%else
-    %define %%sign pmullw
-%endif
-    pshufw      %4, %3, q1032
-    %%sign      %4, [pw_ppmmppmm]
-    paddw       %3, %4
-    pshufw      %4, %3, q2301
-    %%sign      %4, [pw_pmpmpmpm]
-    paddw       %3, %4
-    psllw       %3, 2
-    mova        [%1_1d+2*%2], %3
-%endmacro
-
-%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
-    pxor        %7, %7
-    pshufw      %4, %1, q1032
-    pshufw      %5, %2, q1032
-    pshufw      %6, %3, q1032
-    paddw       %1, %4
-    paddw       %2, %5
-    paddw       %3, %6
-    punpcklwd   %1, %7
-    punpcklwd   %2, %7
-    punpcklwd   %3, %7
-    pshufw      %4, %1, q1032
-    pshufw      %5, %2, q1032
-    pshufw      %6, %3, q1032
-    %8          %1, %4
-    %8          %2, %5
-    %8          %3, %6
-%endmacro
-
-; in: m1..m3
-; out: m7
-; clobber: m4..m6
-%macro SUM3x4 0
-    ABSW2       m4, m5, m1, m2, m1, m2
-    ABSW        m7, m3, m3
-    paddw       m4, m5
-    paddw       m7, m4
-%endmacro
-
-; in: m0..m3 (4x4)
-; out: m0 v, m4 h, m5 dc
-; clobber: m1..m3
-%macro SUM4x3 3 ; dc, left, top
-    movq        m4, %2
-%ifnum sizeof%1
-    movq        m5, %1
-%else
-    movd        m5, %1
-%endif
-    psubw       m4, m0
-    psubw       m5, m0
-    punpcklwd   m0, m1
-    punpcklwd   m2, m3
-    punpckldq   m0, m2 ; transpose
-    psubw       m0, %3
-    ABSW2       m4, m5, m4, m5, m2, m3 ; 1x4 sum
-    ABSW        m0, m0, m1 ; 4x1 sum
-%endmacro
-
-%macro INTRA_X3_MMX 0
-;-----------------------------------------------------------------------------
-; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal intra_satd_x3_4x4, 3,3
-%if UNIX64
-    ; stack is 16 byte aligned because abi says so
-    %define  top_1d  rsp-8  ; size 8
-    %define  left_1d rsp-16 ; size 8
-%else
-    ; WIN64:  stack is 16 byte aligned because abi says so
-    ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
-    SUB         rsp, 16
-    %define  top_1d  rsp+8
-    %define  left_1d rsp
-%endif
-
-    call hadamard_load
-    SCALAR_HADAMARD left, 0, m4, m5
-    SCALAR_HADAMARD top,  0, m6, m5, m7
-    paddw       m6, m4
-    pavgw       m6, [pw_16]
-    pand        m6, [sw_f0] ; dc
-
-    SUM3x4
-    SUM4x3 m6, [left_1d], [top_1d]
-    paddw       m4, m7
-    paddw       m5, m7
-    movq        m1, m5
-    psrlq       m1, 16  ; 4x3 sum
-    paddw       m0, m1
-
-    SUM_MM_X3   m0, m4, m5, m1, m2, m3, m6, pavgw
-    movd        [r2+0], m0 ; i4x4_v satd
-    movd        [r2+4], m4 ; i4x4_h satd
-    movd        [r2+8], m5 ; i4x4_dc satd
-%if UNIX64 == 0
-    ADD         rsp, 16
-%endif
-    RET
-
-;-----------------------------------------------------------------------------
-; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal intra_satd_x3_16x16, 0,5
-    %assign  stack_pad  120 + ((stack_offset+120+gprsize)&15)
-    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
-    SUB         rsp, stack_pad
-%define sums    rsp+64 ; size 56
-%define top_1d  rsp+32 ; size 32
-%define left_1d rsp    ; size 32
-    movifnidn   r1,  r1mp
-
-    pxor        m7, m7
-    mova [sums+ 0], m7
-    mova [sums+ 8], m7
-    mova [sums+16], m7
-%if HIGH_BIT_DEPTH
-    mova [sums+24], m7
-    mova [sums+32], m7
-    mova [sums+40], m7
-    mova [sums+48], m7
-%endif
-
-    ; 1D hadamards
-    mov        r3d, 12
-    movd        m6, [pw_32]
-.loop_edge:
-    SCALAR_HADAMARD left, r3, m0, m1
-    SCALAR_HADAMARD top,  r3, m1, m2, m3
-    pavgw       m0, m1
-    paddw       m6, m0
-    sub        r3d, 4
-    jge .loop_edge
-    psrlw       m6, 2
-    pand        m6, [sw_f0] ; dc
-
-    ; 2D hadamards
-    movifnidn   r0, r0mp
-    mov         r3, -4
-.loop_y:
-    mov         r4, -4
-.loop_x:
-    call hadamard_load
-
-    SUM3x4
-    SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
-    pavgw       m4, m7
-    pavgw       m5, m7
-    paddw       m0, [sums+ 0] ; i16x16_v satd
-    paddw       m4, [sums+ 8] ; i16x16_h satd
-    paddw       m5, [sums+16] ; i16x16_dc satd
-    mova [sums+ 0], m0
-    mova [sums+ 8], m4
-    mova [sums+16], m5
-
-    add         r0, 4*SIZEOF_PIXEL
-    inc         r4
-    jl  .loop_x
-%if HIGH_BIT_DEPTH
-    psrld       m7, m4, 16
-    pslld       m4, 16
-    psrld       m4, 16
-    paddd       m4, m7
-    psrld       m7, m0, 16
-    pslld       m0, 16
-    psrld       m0, 16
-    paddd       m0, m7
-    paddd       m4, [sums+32]
-    paddd       m0, [sums+24]
-    mova [sums+32], m4
-    mova [sums+24], m0
-    pxor        m7, m7
-    punpckhwd   m3, m5, m7
-    punpcklwd   m5, m7
-    paddd       m3, [sums+48]
-    paddd       m5, [sums+40]
-    mova [sums+48], m3
-    mova [sums+40], m5
-    mova [sums+ 0], m7
-    mova [sums+ 8], m7
-    mova [sums+16], m7
-%endif
-    add         r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
-    inc         r3
-    jl  .loop_y
-
-; horizontal sum
-    movifnidn   r2, r2mp
-%if HIGH_BIT_DEPTH
-    mova        m1, m5
-    paddd       m5, m3
-    HADDD       m5, m7 ; DC satd
-    HADDD       m4, m7 ; H satd
-    HADDD       m0, m7 ; the part of V satd that doesn't overlap with DC
-    psrld       m0, 1
-    psrlq       m1, 32 ; DC[1]
-    paddd       m0, m3 ; DC[2]
-    psrlq       m3, 32 ; DC[3]
-    paddd       m0, m1
-    paddd       m0, m3
-%else
-    mova        m7, m5
-    SUM_MM_X3   m0, m4, m5, m3, m1, m2, m6, paddd
-    psrld       m0, 1
-    pslld       m7, 16
-    psrld       m7, 16
-    paddd       m0, m5
-    psubd       m0, m7
-%endif
-    movd    [r2+8], m5 ; i16x16_dc satd
-    movd    [r2+4], m4 ; i16x16_h satd
-    movd    [r2+0], m0 ; i16x16_v satd
-    ADD        rsp, stack_pad
-    RET
-
-%if ARCH_X86_64
-    %define  t0 r6
-%else
-    %define  t0 r2
-%endif
-
-;-----------------------------------------------------------------------------
-; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal intra_satd_x3_8x8c, 0,6
-    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
-    SUB          rsp, 72
-%define  sums    rsp+48 ; size 24
-%define  dc_1d   rsp+32 ; size 16
-%define  top_1d  rsp+16 ; size 16
-%define  left_1d rsp    ; size 16
-    movifnidn   r1,  r1mp
-    pxor        m7, m7
-    mova [sums+ 0], m7
-    mova [sums+ 8], m7
-    mova [sums+16], m7
-
-    ; 1D hadamards
-    mov         r3d, 4
-.loop_edge:
-    SCALAR_HADAMARD left, r3, m0, m1
-    SCALAR_HADAMARD top,  r3, m0, m1, m2
-    sub         r3d, 4
-    jge .loop_edge
-
-    ; dc
-    movzx       t0d, word [left_1d+0]
-    movzx       r3d, word [top_1d+0]
-    movzx       r4d, word [left_1d+8]
-    movzx       r5d, word [top_1d+8]
-    lea         t0d, [t0 + r3 + 16]
-    lea         r3d, [r4 + r5 + 16]
-    shr         t0d, 1
-    shr         r3d, 1
-    add         r4d, 8
-    add         r5d, 8
-    and         t0d, -16 ; tl
-    and         r3d, -16 ; br
-    and         r4d, -16 ; bl
-    and         r5d, -16 ; tr
-    mov         [dc_1d+ 0], t0d ; tl
-    mov         [dc_1d+ 4], r5d ; tr
-    mov         [dc_1d+ 8], r4d ; bl
-    mov         [dc_1d+12], r3d ; br
-    lea         r5, [dc_1d]
-
-    ; 2D hadamards
-    movifnidn   r0,  r0mp
-    movifnidn   r2,  r2mp
-    mov         r3,  -2
-.loop_y:
-    mov         r4,  -2
-.loop_x:
-    call hadamard_load
-
-    SUM3x4
-    SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
-    pavgw       m4, m7
-    pavgw       m5, m7
-    paddw       m0, [sums+16] ; i4x4_v satd
-    paddw       m4, [sums+8]  ; i4x4_h satd
-    paddw       m5, [sums+0]  ; i4x4_dc satd
-    movq        [sums+16], m0
-    movq        [sums+8], m4
-    movq        [sums+0], m5
-
-    add         r0, 4*SIZEOF_PIXEL
-    inc         r4
-    jl  .loop_x
-    add         r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
-    add         r5, 8
-    inc         r3
-    jl  .loop_y
-
-; horizontal sum
-    movq        m0, [sums+0]
-    movq        m1, [sums+8]
-    movq        m2, [sums+16]
-    movq        m7, m0
-%if HIGH_BIT_DEPTH
-    psrlq       m7, 16
-    HADDW       m7, m3
-    SUM_MM_X3   m0, m1, m2, m3, m4, m5, m6, paddd
-    psrld       m2, 1
-    paddd       m2, m7
-%else
-    psrlq       m7, 15
-    paddw       m2, m7
-    SUM_MM_X3   m0, m1, m2, m3, m4, m5, m6, paddd
-    psrld       m2, 1
-%endif
-    movd        [r2+0], m0 ; i8x8c_dc satd
-    movd        [r2+4], m1 ; i8x8c_h satd
-    movd        [r2+8], m2 ; i8x8c_v satd
-    ADD         rsp, 72
-    RET
-%endmacro ; INTRA_X3_MMX
-
-
-
-%macro PRED4x4_LOWPASS 5
-%ifnum sizeof%5
-    pavgb       %5, %2, %3
-    pxor        %3, %2
-    pand        %3, [pb_1]
-    psubusb     %5, %3
-    pavgb       %1, %4, %5
-%else
-    mova        %5, %2
-    pavgb       %2, %3
-    pxor        %3, %5
-    pand        %3, [pb_1]
-    psubusb     %2, %3
-    pavgb       %1, %4, %2
-%endif
-%endmacro
-
-%macro INTRA_X9_PRED 2
-%if cpuflag(sse4)
-    movu       m1, [r1-1*FDEC_STRIDE-8]
-    pinsrb     m1, [r1+3*FDEC_STRIDE-1], 0
-    pinsrb     m1, [r1+2*FDEC_STRIDE-1], 1
-    pinsrb     m1, [r1+1*FDEC_STRIDE-1], 2
-    pinsrb     m1, [r1+0*FDEC_STRIDE-1], 3
-%else
-    movd      mm0, [r1+3*FDEC_STRIDE-4]
-    punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
-    movd      mm1, [r1+1*FDEC_STRIDE-4]
-    punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
-    punpckhwd mm0, mm1
-    psrlq     mm0, 32
-    movq2dq    m0, mm0
-    movu       m1, [r1-1*FDEC_STRIDE-8]
-    movss      m1, m0                  ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
-%endif ; cpuflag
-    pshufb     m1, [intrax9_edge]      ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
-    psrldq     m0, m1, 1               ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
-    psrldq     m2, m1, 2               ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
-    pavgb      m5, m0, m1              ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5  __  __ __ __ __
-    mova       %2, m1
-    PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
-    ; ddl               ddr
-    ; Ft1 Ft2 Ft3 Ft4   Flt Ft0 Ft1 Ft2
-    ; Ft2 Ft3 Ft4 Ft5   Fl0 Flt Ft0 Ft1
-    ; Ft3 Ft4 Ft5 Ft6   Fl1 Fl0 Flt Ft0
-    ; Ft4 Ft5 Ft6 Ft7   Fl2 Fl1 Fl0 Flt
-    pshufb     m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
-    pshufb     m3, m0, [%1_ddlr2] ; rows 2,3
-    ; hd                hu
-    ; Glt Flt Ft0 Ft1   Gl0 Fl1 Gl1 Fl2
-    ; Gl0 Fl0 Glt Flt   Gl1 Fl2 Gl2 Fl3
-    ; Gl1 Fl1 Gl0 Fl0   Gl2 Fl3 Gl3 Gl3
-    ; Gl2 Fl2 Gl1 Fl1   Gl3 Gl3 Gl3 Gl3
-    pslldq     m0, 5                   ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
-    palignr    m7, m5, m0, 5           ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
-    pshufb     m6, m7, [%1_hdu1]
-    pshufb     m7, m7, [%1_hdu2]
-    ; vr                vl
-    ; Gt0 Gt1 Gt2 Gt3   Gt1 Gt2 Gt3 Gt4
-    ; Flt Ft0 Ft1 Ft2   Ft1 Ft2 Ft3 Ft4
-    ; Fl0 Gt0 Gt1 Gt2   Gt2 Gt3 Gt4 Gt5
-    ; Fl1 Flt Ft0 Ft1   Ft2 Ft3 Ft4 Ft5
-    psrldq     m5, 5                   ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
-    palignr    m5, m0, 6               ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
-    pshufb     m4, m5, [%1_vrl1]
-    pshufb     m5, m5, [%1_vrl2]
-%endmacro ; INTRA_X9_PRED
-
-%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
-    pshufb     m2, m%1, [intrax9b_vh1]
-    pshufb     m3, m%1, [intrax9b_vh2]
-    mova      [pred_buf+0x60], m2
-    mova      [pred_buf+0x70], m3
-    pshufb    m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
-    pmaddubsw m%1, [hmul_4p]
-    pshufhw    m0, m%1, q2301
-    pshuflw    m0, m0,  q2301
-    psignw    m%1, [pw_pmpmpmpm]
-    paddw      m0, m%1
-    psllw      m0, 2 ; hadamard(top), hadamard(left)
-    MOVHL      m3, m0
-    pshufb     m1, m0, [intrax9b_v1]
-    pshufb     m2, m0, [intrax9b_v2]
-    paddw      m0, m3
-    psignw     m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
-    pavgw      m0, [pw_16]
-    pand       m0, [sw_f0] ; dc
-    ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
-    ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
-    ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
-    HADAMARD 0, sumsub, %2, %3, %4, %5
-    HADAMARD 1, sumsub, %2, %3, %4, %5
-    movd      r3d, m0
-    shr       r3d, 4
-    imul      r3d, 0x01010101
-    mov       [pred_buf+0x80], r3d
-    mov       [pred_buf+0x88], r3d
-    mov       [pred_buf+0x90], r3d
-    mov       [pred_buf+0x98], r3d
-    psubw      m3, m%2
-    psubw      m0, m%2
-    psubw      m1, m%2
-    psubw      m2, m%3
-    pabsw     m%3, m%3
-    pabsw      m3, m3
-    pabsw      m0, m0
-    pabsw      m1, m1
-    pabsw      m2, m2
-    pavgw      m3, m%3
-    pavgw      m0, m%3
-    pavgw      m1, m2
-%if cpuflag(sse4)
-    phaddw     m3, m0
-%else
-    SBUTTERFLY qdq, 3, 0, 2
-    paddw      m3, m0
-%endif
-    MOVHL      m2, m1
-    paddw      m1, m2
-%if cpuflag(xop)
-    vphaddwq   m3, m3
-    vphaddwq   m1, m1
-    packssdw   m1, m3
-%else
-    phaddw     m1, m3
-    pmaddwd    m1, [pw_1] ; v, _, h, dc
-%endif
-%endmacro ; INTRA_X9_VHDC
-
-%macro INTRA_X9_END 2
-%if cpuflag(sse4)
-    phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
-    movd      eax, m0
-    add       eax, 1<<16
-    cmp        ax, r3w
-    cmovge    eax, r3d
-%else
-%if %1
-    ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
-    psllw      m0, 3
-    paddw      m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
-%else
-    ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
-    psllw      m0, 2
-    paddusw    m0, m0
-    paddw      m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
-%endif
-    movhlps    m1, m0
-    pminsw     m0, m1
-    pshuflw    m1, m0, q0032
-    pminsw     m0, m1
-    pshuflw    m1, m0, q0001
-    pminsw     m0, m1
-    movd      eax, m0
-    movsx     r2d, ax
-    and       eax, 7
-    sar       r2d, 3
-    shl       eax, 16
-    ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
-    ; 1<<12: undo sign manipulation
-    lea       eax, [rax+r2+(1<<16)+(1<<12)]
-    cmp        ax, r3w
-    cmovge    eax, r3d
-%endif ; cpuflag
-
-    ; output the predicted samples
-    mov       r3d, eax
-    shr       r3d, 16
-%ifdef PIC
-    lea        r2, [%2_lut]
-    movzx     r2d, byte [r2+r3]
-%else
-    movzx     r2d, byte [%2_lut+r3]
-%endif
-%if %1 ; sad
-    movq      mm0, [pred_buf+r2]
-    movq      mm1, [pred_buf+r2+16]
-    movd     [r1+0*FDEC_STRIDE], mm0
-    movd     [r1+2*FDEC_STRIDE], mm1
-    psrlq     mm0, 32
-    psrlq     mm1, 32
-    movd     [r1+1*FDEC_STRIDE], mm0
-    movd     [r1+3*FDEC_STRIDE], mm1
-%else ; satd
-%assign i 0
-%rep 4
-    mov       r3d, [pred_buf+r2+8*i]
-    mov      [r1+i*FDEC_STRIDE], r3d
-%assign i i+1
-%endrep
-%endif
-%endmacro ; INTRA_X9_END
-
-%macro INTRA_X9 0
-;-----------------------------------------------------------------------------
-; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
-;-----------------------------------------------------------------------------
-%if notcpuflag(xop)
-cglobal intra_sad_x9_4x4, 3,4,9
-    %assign pad 0xc0-gprsize-(stack_offset&15)
-    %define pred_buf rsp
-    sub       rsp, pad
-%if ARCH_X86_64
-    INTRA_X9_PRED intrax9a, m8
-%else
-    INTRA_X9_PRED intrax9a, [rsp+0xa0]
-%endif
-    mova [rsp+0x00], m2
-    mova [rsp+0x10], m3
-    mova [rsp+0x20], m4
-    mova [rsp+0x30], m5
-    mova [rsp+0x40], m6
-    mova [rsp+0x50], m7
-%if cpuflag(sse4)
-    movd       m0, [r0+0*FENC_STRIDE]
-    pinsrd     m0, [r0+1*FENC_STRIDE], 1
-    movd       m1, [r0+2*FENC_STRIDE]
-    pinsrd     m1, [r0+3*FENC_STRIDE], 1
-%else
-    movd      mm0, [r0+0*FENC_STRIDE]
-    punpckldq mm0, [r0+1*FENC_STRIDE]
-    movd      mm1, [r0+2*FENC_STRIDE]
-    punpckldq mm1, [r0+3*FENC_STRIDE]
-    movq2dq    m0, mm0
-    movq2dq    m1, mm1
-%endif
-    punpcklqdq m0, m0
-    punpcklqdq m1, m1
-    psadbw     m2, m0
-    psadbw     m3, m1
-    psadbw     m4, m0
-    psadbw     m5, m1
-    psadbw     m6, m0
-    psadbw     m7, m1
-    paddd      m2, m3
-    paddd      m4, m5
-    paddd      m6, m7
-%if ARCH_X86_64
-    SWAP        7, 8
-    pxor       m8, m8
-    %define %%zero m8
-%else
-    mova       m7, [rsp+0xa0]
-    %define %%zero [pb_0]
-%endif
-    pshufb     m3, m7, [intrax9a_vh1]
-    pshufb     m5, m7, [intrax9a_vh2]
-    pshufb     m7, [intrax9a_dc]
-    psadbw     m7, %%zero
-    psrlw      m7, 2
-    mova [rsp+0x60], m3
-    mova [rsp+0x70], m5
-    psadbw     m3, m0
-    pavgw      m7, %%zero
-    pshufb     m7, %%zero
-    psadbw     m5, m1
-    movq [rsp+0x80], m7
-    movq [rsp+0x90], m7
-    psadbw     m0, m7
-    paddd      m3, m5
-    psadbw     m1, m7
-    paddd      m0, m1
-    movzx     r3d, word [r2]
-    movd      r0d, m3 ; v
-    add       r3d, r0d
-    punpckhqdq m3, m0 ; h, dc
-    shufps     m3, m2, q2020
-    psllq      m6, 32
-    por        m4, m6
-    movu       m0, [r2+2]
-    packssdw   m3, m4
-    paddw      m0, m3
-    INTRA_X9_END 1, intrax9a
-    add       rsp, pad
-    RET
-%endif ; cpuflag
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
-;-----------------------------------------------------------------------------
-cglobal intra_satd_x9_4x4, 3,4,16
-    %assign pad 0xb0-gprsize-(stack_offset&15)
-    %define pred_buf rsp
-    sub       rsp, pad
-    INTRA_X9_PRED intrax9b, m15
-    mova [rsp+0x00], m2
-    mova [rsp+0x10], m3
-    mova [rsp+0x20], m4
-    mova [rsp+0x30], m5
-    mova [rsp+0x40], m6
-    mova [rsp+0x50], m7
-    movd       m8, [r0+0*FENC_STRIDE]
-    movd       m9, [r0+1*FENC_STRIDE]
-    movd      m10, [r0+2*FENC_STRIDE]
-    movd      m11, [r0+3*FENC_STRIDE]
-    mova      m12, [hmul_8p]
-    pshufd     m8, m8, 0
-    pshufd     m9, m9, 0
-    pshufd    m10, m10, 0
-    pshufd    m11, m11, 0
-    pmaddubsw  m8, m12
-    pmaddubsw  m9, m12
-    pmaddubsw m10, m12
-    pmaddubsw m11, m12
-    movddup    m0, m2
-    pshufd     m1, m2, q3232
-    movddup    m2, m3
-    punpckhqdq m3, m3
-    call .satd_8x4 ; ddr, ddl
-    movddup    m2, m5
-    pshufd     m3, m5, q3232
-    mova       m5, m0
-    movddup    m0, m4
-    pshufd     m1, m4, q3232
-    call .satd_8x4 ; vr, vl
-    movddup    m2, m7
-    pshufd     m3, m7, q3232
-    mova       m4, m0
-    movddup    m0, m6
-    pshufd     m1, m6, q3232
-    call .satd_8x4 ; hd, hu
-%if cpuflag(sse4)
-    punpckldq  m4, m0
-%else
-    punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
-%endif
-    mova       m1, [pw_ppmmppmm]
-    psignw     m8, m1
-    psignw    m10, m1
-    paddw      m8, m9
-    paddw     m10, m11
-    INTRA_X9_VHDC 15, 8, 10, 6, 7
-    ; find minimum
-    movu       m0, [r2+2]
-    movd      r3d, m1
-    palignr    m5, m1, 8
-%if notcpuflag(sse4)
-    pshufhw    m0, m0, q3120 ; compensate for different order in unpack
-%endif
-    packssdw   m5, m4
-    paddw      m0, m5
-    movzx     r0d, word [r2]
-    add       r3d, r0d
-    INTRA_X9_END 0, intrax9b
-    add       rsp, pad
-    RET
-RESET_MM_PERMUTATION
-ALIGN 16
-.satd_8x4:
-    pmaddubsw  m0, m12
-    pmaddubsw  m1, m12
-    pmaddubsw  m2, m12
-    pmaddubsw  m3, m12
-    psubw      m0, m8
-    psubw      m1, m9
-    psubw      m2, m10
-    psubw      m3, m11
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
-    pmaddwd    m0, [pw_1]
-    MOVHL      m1, m0
-    paddd    xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
-    ret
-
-%else ; !ARCH_X86_64
-cglobal intra_satd_x9_4x4, 3,4,8
-    %assign pad 0x120-gprsize-(stack_offset&15)
-    %define fenc_buf rsp
-    %define pred_buf rsp+0x40
-    %define spill    rsp+0xe0
-    sub       rsp, pad
-    INTRA_X9_PRED intrax9b, [spill+0x20]
-    mova [pred_buf+0x00], m2
-    mova [pred_buf+0x10], m3
-    mova [pred_buf+0x20], m4
-    mova [pred_buf+0x30], m5
-    mova [pred_buf+0x40], m6
-    mova [pred_buf+0x50], m7
-    movd       m4, [r0+0*FENC_STRIDE]
-    movd       m5, [r0+1*FENC_STRIDE]
-    movd       m6, [r0+2*FENC_STRIDE]
-    movd       m0, [r0+3*FENC_STRIDE]
-    mova       m7, [hmul_8p]
-    pshufd     m4, m4, 0
-    pshufd     m5, m5, 0
-    pshufd     m6, m6, 0
-    pshufd     m0, m0, 0
-    pmaddubsw  m4, m7
-    pmaddubsw  m5, m7
-    pmaddubsw  m6, m7
-    pmaddubsw  m0, m7
-    mova [fenc_buf+0x00], m4
-    mova [fenc_buf+0x10], m5
-    mova [fenc_buf+0x20], m6
-    mova [fenc_buf+0x30], m0
-    movddup    m0, m2
-    pshufd     m1, m2, q3232
-    movddup    m2, m3
-    punpckhqdq m3, m3
-    pmaddubsw  m0, m7
-    pmaddubsw  m1, m7
-    pmaddubsw  m2, m7
-    pmaddubsw  m3, m7
-    psubw      m0, m4
-    psubw      m1, m5
-    psubw      m2, m6
-    call .satd_8x4b ; ddr, ddl
-    mova       m3, [pred_buf+0x30]
-    mova       m1, [pred_buf+0x20]
-    movddup    m2, m3
-    punpckhqdq m3, m3
-    movq [spill+0x08], m0
-    movddup    m0, m1
-    punpckhqdq m1, m1
-    call .satd_8x4 ; vr, vl
-    mova       m3, [pred_buf+0x50]
-    mova       m1, [pred_buf+0x40]
-    movddup    m2, m3
-    punpckhqdq m3, m3
-    movq [spill+0x10], m0
-    movddup    m0, m1
-    punpckhqdq m1, m1
-    call .satd_8x4 ; hd, hu
-    movq [spill+0x18], m0
-    mova       m1, [spill+0x20]
-    mova       m4, [fenc_buf+0x00]
-    mova       m5, [fenc_buf+0x20]
-    mova       m2, [pw_ppmmppmm]
-    psignw     m4, m2
-    psignw     m5, m2
-    paddw      m4, [fenc_buf+0x10]
-    paddw      m5, [fenc_buf+0x30]
-    INTRA_X9_VHDC 1, 4, 5, 6, 7
-    ; find minimum
-    movu       m0, [r2+2]
-    movd      r3d, m1
-    punpckhqdq m1, [spill+0x00]
-    packssdw   m1, [spill+0x10]
-%if cpuflag(sse4)
-    pshufhw    m1, m1, q3120
-%else
-    pshufhw    m0, m0, q3120
-%endif
-    paddw      m0, m1
-    movzx     r0d, word [r2]
-    add       r3d, r0d
-    INTRA_X9_END 0, intrax9b
-    add       rsp, pad
-    RET
-RESET_MM_PERMUTATION
-ALIGN 16
-.satd_8x4:
-    pmaddubsw  m0, m7
-    pmaddubsw  m1, m7
-    pmaddubsw  m2, m7
-    pmaddubsw  m3, m7
-    %xdefine fenc_buf fenc_buf+gprsize
-    psubw      m0, [fenc_buf+0x00]
-    psubw      m1, [fenc_buf+0x10]
-    psubw      m2, [fenc_buf+0x20]
-.satd_8x4b:
-    psubw      m3, [fenc_buf+0x30]
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
-    pmaddwd    m0, [pw_1]
-    MOVHL      m1, m0
-    paddd    xmm0, m0, m1
-    ret
-%endif ; ARCH
-%endmacro ; INTRA_X9
-
-%macro INTRA8_X9 0
-;-----------------------------------------------------------------------------
-; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
-;-----------------------------------------------------------------------------
-cglobal intra_sad_x9_8x8, 5,6,9
-    %define fenc02 m4
-    %define fenc13 m5
-    %define fenc46 m6
-    %define fenc57 m7
-%if ARCH_X86_64
-    %define tmp m8
-    %assign padbase 0x0
-%else
-    %define tmp [rsp]
-    %assign padbase 0x10
-%endif
-    %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
-    %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
-
-    SUB        rsp, pad
-    movq    fenc02, [r0+FENC_STRIDE* 0]
-    movq    fenc13, [r0+FENC_STRIDE* 1]
-    movq    fenc46, [r0+FENC_STRIDE* 4]
-    movq    fenc57, [r0+FENC_STRIDE* 5]
-    movhps  fenc02, [r0+FENC_STRIDE* 2]
-    movhps  fenc13, [r0+FENC_STRIDE* 3]
-    movhps  fenc46, [r0+FENC_STRIDE* 6]
-    movhps  fenc57, [r0+FENC_STRIDE* 7]
-
-    ; save instruction size: avoid 4-byte memory offsets
-    lea         r0, [intra8x9_h1+128]
-    %define off(m) (r0+m-(intra8x9_h1+128))
-
-; v
-    movddup     m0, [r2+16]
-    mova pred(0,0), m0
-    psadbw      m1, m0, fenc02
-    mova pred(0,1), m0
-    psadbw      m2, m0, fenc13
-    mova pred(0,2), m0
-    psadbw      m3, m0, fenc46
-    mova pred(0,3), m0
-    psadbw      m0, m0, fenc57
-    paddw       m1, m2
-    paddw       m0, m3
-    paddw       m0, m1
-    MOVHL       m1, m0
-    paddw       m0, m1
-    movd    [r4+0], m0
-
-; h
-    movq        m0, [r2+7]
-    pshufb      m1, m0, [off(intra8x9_h1)]
-    pshufb      m2, m0, [off(intra8x9_h2)]
-    mova pred(1,0), m1
-    psadbw      m1, fenc02
-    mova pred(1,1), m2
-    psadbw      m2, fenc13
-    paddw       m1, m2
-    pshufb      m3, m0, [off(intra8x9_h3)]
-    pshufb      m2, m0, [off(intra8x9_h4)]
-    mova pred(1,2), m3
-    psadbw      m3, fenc46
-    mova pred(1,3), m2
-    psadbw      m2, fenc57
-    paddw       m1, m3
-    paddw       m1, m2
-    MOVHL       m2, m1
-    paddw       m1, m2
-    movd    [r4+2], m1
-
-    lea         r5, [rsp+padbase+0x100]
-    %define pred(i,j) [r5+i*0x40+j*0x10-0x100]
-
-; dc
-    movhps      m0, [r2+16]
-    pxor        m2, m2
-    psadbw      m0, m2
-    MOVHL       m1, m0
-    paddw       m0, m1
-    psrlw       m0, 3
-    pavgw       m0, m2
-    pshufb      m0, m2
-    mova pred(2,0), m0
-    psadbw      m1, m0, fenc02
-    mova pred(2,1), m0
-    psadbw      m2, m0, fenc13
-    mova pred(2,2), m0
-    psadbw      m3, m0, fenc46
-    mova pred(2,3), m0
-    psadbw      m0, m0, fenc57
-    paddw       m1, m2
-    paddw       m0, m3
-    paddw       m0, m1
-    MOVHL       m1, m0
-    paddw       m0, m1
-    movd    [r4+4], m0
-
-; ddl
-; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
-; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
-; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
-; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
-; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
-; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
-; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
-; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
-    mova        m0, [r2+16]
-    movu        m2, [r2+17]
-    pslldq      m1, m0, 1
-    pavgb       m3, m0, m2              ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
-    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
-    pshufb      m1, m0, [off(intra8x9_ddl1)]
-    pshufb      m2, m0, [off(intra8x9_ddl2)]
-    mova pred(3,0), m1
-    psadbw      m1, fenc02
-    mova pred(3,1), m2
-    psadbw      m2, fenc13
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_ddl3)]
-    mova pred(3,2), m2
-    psadbw      m2, fenc46
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_ddl4)]
-    mova pred(3,3), m2
-    psadbw      m2, fenc57
-    paddw       m1, m2
-    MOVHL       m2, m1
-    paddw       m1, m2
-    movd    [r4+6], m1
-
-; vl
-; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
-; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
-; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
-; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
-; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
-; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
-; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
-; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
-    pshufb      m1, m3, [off(intra8x9_vl1)]
-    pshufb      m2, m0, [off(intra8x9_vl2)]
-    pshufb      m3, m3, [off(intra8x9_vl3)]
-    pshufb      m0, m0, [off(intra8x9_vl4)]
-    mova pred(7,0), m1
-    psadbw      m1, fenc02
-    mova pred(7,1), m2
-    psadbw      m2, fenc13
-    mova pred(7,2), m3
-    psadbw      m3, fenc46
-    mova pred(7,3), m0
-    psadbw      m0, fenc57
-    paddw       m1, m2
-    paddw       m0, m3
-    paddw       m0, m1
-    MOVHL       m1, m0
-    paddw       m0, m1
-%if cpuflag(sse4)
-    pextrw [r4+14], m0, 0
-%else
-    movd       r5d, m0
-    mov    [r4+14], r5w
-    lea         r5, [rsp+padbase+0x100]
-%endif
-
-; ddr
-; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
-; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
-; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
-; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
-; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
-; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
-; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
-; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
-    movu        m2, [r2+8]
-    movu        m0, [r2+7]
-    movu        m1, [r2+6]
-    pavgb       m3, m2, m0              ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
-    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
-    pshufb      m1, m0, [off(intra8x9_ddr1)]
-    pshufb      m2, m0, [off(intra8x9_ddr2)]
-    mova pred(4,0), m1
-    psadbw      m1, fenc02
-    mova pred(4,1), m2
-    psadbw      m2, fenc13
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_ddr3)]
-    mova pred(4,2), m2
-    psadbw      m2, fenc46
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_ddr4)]
-    mova pred(4,3), m2
-    psadbw      m2, fenc57
-    paddw       m1, m2
-    MOVHL       m2, m1
-    paddw       m1, m2
-    movd    [r4+8], m1
-
-    add         r0, 256
-    add         r5, 0xC0
-    %define off(m) (r0+m-(intra8x9_h1+256+128))
-    %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
-
-; vr
-; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
-; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
-; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
-; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
-; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
-; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
-; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
-; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
-    movsd       m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
-    pshufb      m1, m2, [off(intra8x9_vr1)]
-    pshufb      m2, m2, [off(intra8x9_vr3)]
-    mova pred(5,0), m1
-    psadbw      m1, fenc02
-    mova pred(5,2), m2
-    psadbw      m2, fenc46
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_vr2)]
-    mova pred(5,1), m2
-    psadbw      m2, fenc13
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_vr4)]
-    mova pred(5,3), m2
-    psadbw      m2, fenc57
-    paddw       m1, m2
-    MOVHL       m2, m1
-    paddw       m1, m2
-    movd   [r4+10], m1
-
-; hd
-; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
-; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
-; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
-; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
-; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
-; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
-; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
-; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
-    pshufd      m2, m3, q0001
-%if cpuflag(sse4)
-    pblendw     m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
-%else
-    movss       m1, m0, m2
-    SWAP        1, 2
-%endif
-    punpcklbw   m0, m3        ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
-    pshufb      m1, m2, [off(intra8x9_hd1)]
-    pshufb      m2, m2, [off(intra8x9_hd2)]
-    mova pred(6,0), m1
-    psadbw      m1, fenc02
-    mova pred(6,1), m2
-    psadbw      m2, fenc13
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_hd3)]
-    pshufb      m3, m0, [off(intra8x9_hd4)]
-    mova pred(6,2), m2
-    psadbw      m2, fenc46
-    mova pred(6,3), m3
-    psadbw      m3, fenc57
-    paddw       m1, m2
-    paddw       m1, m3
-    MOVHL       m2, m1
-    paddw       m1, m2
-    ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
-    pslldq      m1, 12
-    SWAP        3, 1
-
-; hu
-; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
-; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
-; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
-; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
-; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
-; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
-; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
-; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
-%if cpuflag(sse4)
-    pinsrb      m0, [r2+7], 15 ; Gl7
-%else
-    movd        m1, [r2+7]
-    pslldq      m0, 1
-    palignr     m1, m0, 1
-    SWAP        0, 1
-%endif
-    pshufb      m1, m0, [off(intra8x9_hu1)]
-    pshufb      m2, m0, [off(intra8x9_hu2)]
-    mova pred(8,0), m1
-    psadbw      m1, fenc02
-    mova pred(8,1), m2
-    psadbw      m2, fenc13
-    paddw       m1, m2
-    pshufb      m2, m0, [off(intra8x9_hu3)]
-    pshufb      m0, m0, [off(intra8x9_hu4)]
-    mova pred(8,2), m2
-    psadbw      m2, fenc46
-    mova pred(8,3), m0
-    psadbw      m0, fenc57
-    paddw       m1, m2
-    paddw       m1, m0
-    MOVHL       m2, m1
-    paddw       m1, m2
-    movd       r2d, m1
-
-    movu        m0, [r3]
-    por         m3, [r4]
-    paddw       m0, m3
-    mova      [r4], m0
-    movzx      r5d, word [r3+16]
-    add        r2d, r5d
-    mov    [r4+16], r2w
-
-%if cpuflag(sse4)
-    phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
-    movd      eax, m0
-%else
-    ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
-    paddusw    m0, m0
-    paddusw    m0, m0
-    paddw      m0, [off(pw_s00112233)]
-    MOVHL      m1, m0
-    pminsw     m0, m1
-    pshuflw    m1, m0, q0032
-    pminsw     m0, m1
-    movd      eax, m0
-    ; repack with 3 bit index
-    xor       eax, 0x80008000
-    movzx     r3d, ax
-    shr       eax, 15
-    add       r3d, r3d
-    or        eax, 1
-    cmp       eax, r3d
-    cmovg     eax, r3d
-    ; reverse to phminposuw order
-    mov       r3d, eax
-    and       eax, 7
-    shr       r3d, 3
-    shl       eax, 16
-    or        eax, r3d
-%endif
-    add       r2d, 8<<16
-    cmp        ax, r2w
-    cmovg     eax, r2d
-
-    mov       r2d, eax
-    shr       r2d, 16
-    shl       r2d, 6
-    add        r1, 4*FDEC_STRIDE
-    mova       m0, [rsp+padbase+r2+0x00]
-    mova       m1, [rsp+padbase+r2+0x10]
-    mova       m2, [rsp+padbase+r2+0x20]
-    mova       m3, [rsp+padbase+r2+0x30]
-    movq   [r1+FDEC_STRIDE*-4], m0
-    movhps [r1+FDEC_STRIDE*-2], m0
-    movq   [r1+FDEC_STRIDE*-3], m1
-    movhps [r1+FDEC_STRIDE*-1], m1
-    movq   [r1+FDEC_STRIDE* 0], m2
-    movhps [r1+FDEC_STRIDE* 2], m2
-    movq   [r1+FDEC_STRIDE* 1], m3
-    movhps [r1+FDEC_STRIDE* 3], m3
-    ADD       rsp, pad
-    RET
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
-;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x9_8x8, 5,6,16
-    %assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
-    %define fenc_buf rsp
-    %define pred_buf rsp+0x80
-    SUB        rsp, pad
-    mova       m15, [hmul_8p]
-    pxor        m8, m8
-%assign %%i 0
-%rep 8
-    movddup     m %+ %%i, [r0+%%i*FENC_STRIDE]
-    pmaddubsw   m9, m %+ %%i, m15
-    punpcklbw   m %+ %%i, m8
-    mova [fenc_buf+%%i*0x10], m9
-%assign %%i %%i+1
-%endrep
-
-    ; save instruction size: avoid 4-byte memory offsets
-    lea         r0, [intra8x9_h1+0x80]
-    %define off(m) (r0+m-(intra8x9_h1+0x80))
-    lea         r5, [pred_buf+0x80]
-
-; v, h, dc
-    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
-    pabsw      m11, m1
-%assign %%i 2
-%rep 6
-    pabsw       m8, m %+ %%i
-    paddw      m11, m8
-%assign %%i %%i+1
-%endrep
-
-    ; 1D hadamard of edges
-    movq        m8, [r2+7]
-    movddup     m9, [r2+16]
-    mova [r5-0x80], m9
-    mova [r5-0x70], m9
-    mova [r5-0x60], m9
-    mova [r5-0x50], m9
-    punpcklwd   m8, m8
-    pshufb      m9, [intrax3_shuf]
-    pmaddubsw   m8, [pb_pppm]
-    pmaddubsw   m9, [pb_pppm]
-    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
-    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
-
-    ; dc
-    paddw      m10, m8, m9
-    paddw      m10, [pw_8]
-    pand       m10, [sw_f0]
-    psrlw      m12, m10, 4
-    psllw      m10, 2
-    pxor       m13, m13
-    pshufb     m12, m13
-    mova [r5+0x00], m12
-    mova [r5+0x10], m12
-    mova [r5+0x20], m12
-    mova [r5+0x30], m12
-
-    ; differences
-    psllw       m8, 3 ; left edge
-    psubw       m8, m0
-    psubw      m10, m0
-    pabsw       m8, m8 ; 1x8 sum
-    pabsw      m10, m10
-    paddw       m8, m11
-    paddw      m11, m10
-    punpcklwd   m0, m1
-    punpcklwd   m2, m3
-    punpcklwd   m4, m5
-    punpcklwd   m6, m7
-    punpckldq   m0, m2
-    punpckldq   m4, m6
-    punpcklqdq  m0, m4 ; transpose
-    psllw       m9, 3  ; top edge
-    psrldq     m10, m11, 2 ; 8x7 sum
-    psubw       m0, m9 ; 8x1 sum
-    pabsw       m0, m0
-    paddw      m10, m0
-
-    phaddd     m10, m8 ; logically phaddw, but this is faster and it won't overflow
-    psrlw      m11, 1
-    psrlw      m10, 1
-
-; store h
-    movq        m3, [r2+7]
-    pshufb      m0, m3, [off(intra8x9_h1)]
-    pshufb      m1, m3, [off(intra8x9_h2)]
-    pshufb      m2, m3, [off(intra8x9_h3)]
-    pshufb      m3, m3, [off(intra8x9_h4)]
-    mova [r5-0x40], m0
-    mova [r5-0x30], m1
-    mova [r5-0x20], m2
-    mova [r5-0x10], m3
-
-; ddl
-    mova        m8, [r2+16]
-    movu        m2, [r2+17]
-    pslldq      m1, m8, 1
-    pavgb       m9, m8, m2
-    PRED4x4_LOWPASS m8, m1, m2, m8, m3
-    pshufb      m0, m8, [off(intra8x9_ddl1)]
-    pshufb      m1, m8, [off(intra8x9_ddl2)]
-    pshufb      m2, m8, [off(intra8x9_ddl3)]
-    pshufb      m3, m8, [off(intra8x9_ddl4)]
-    add         r5, 0x40
-    call .sa8d
-    phaddd     m11, m0
-
-; vl
-    pshufb      m0, m9, [off(intra8x9_vl1)]
-    pshufb      m1, m8, [off(intra8x9_vl2)]
-    pshufb      m2, m9, [off(intra8x9_vl3)]
-    pshufb      m3, m8, [off(intra8x9_vl4)]
-    add         r5, 0x100
-    call .sa8d
-    phaddd     m10, m11
-    mova       m12, m0
-
-; ddr
-    movu        m2, [r2+8]
-    movu        m8, [r2+7]
-    movu        m1, [r2+6]
-    pavgb       m9, m2, m8
-    PRED4x4_LOWPASS m8, m1, m2, m8, m3
-    pshufb      m0, m8, [off(intra8x9_ddr1)]
-    pshufb      m1, m8, [off(intra8x9_ddr2)]
-    pshufb      m2, m8, [off(intra8x9_ddr3)]
-    pshufb      m3, m8, [off(intra8x9_ddr4)]
-    sub         r5, 0xc0
-    call .sa8d
-    mova       m11, m0
-
-    add         r0, 0x100
-    %define off(m) (r0+m-(intra8x9_h1+0x180))
-
-; vr
-    movsd       m2, m9, m8
-    pshufb      m0, m2, [off(intra8x9_vr1)]
-    pshufb      m1, m8, [off(intra8x9_vr2)]
-    pshufb      m2, m2, [off(intra8x9_vr3)]
-    pshufb      m3, m8, [off(intra8x9_vr4)]
-    add         r5, 0x40
-    call .sa8d
-    phaddd     m11, m0
-
-; hd
-%if cpuflag(sse4)
-    pshufd      m1, m9, q0001
-    pblendw     m1, m8, q3330
-%else
-    pshufd      m2, m9, q0001
-    movss       m1, m8, m2
-%endif
-    punpcklbw   m8, m9
-    pshufb      m0, m1, [off(intra8x9_hd1)]
-    pshufb      m1, m1, [off(intra8x9_hd2)]
-    pshufb      m2, m8, [off(intra8x9_hd3)]
-    pshufb      m3, m8, [off(intra8x9_hd4)]
-    add         r5, 0x40
-    call .sa8d
-    phaddd      m0, m12
-    phaddd     m11, m0
-
-; hu
-%if cpuflag(sse4)
-    pinsrb      m8, [r2+7], 15
-%else
-    movd        m9, [r2+7]
-    pslldq      m8, 1
-    palignr     m9, m8, 1
-    SWAP        8, 9
-%endif
-    pshufb      m0, m8, [off(intra8x9_hu1)]
-    pshufb      m1, m8, [off(intra8x9_hu2)]
-    pshufb      m2, m8, [off(intra8x9_hu3)]
-    pshufb      m3, m8, [off(intra8x9_hu4)]
-    add         r5, 0x80
-    call .sa8d
-
-    pmaddwd     m0, [pw_1]
-    phaddw     m10, m11
-    MOVHL       m1, m0
-    paddw       m0, m1
-    pshuflw     m1, m0, q0032
-    pavgw       m0, m1
-    pxor        m2, m2
-    pavgw      m10, m2
-    movd       r2d, m0
-
-    movu        m0, [r3]
-    paddw       m0, m10
-    mova      [r4], m0
-    movzx      r5d, word [r3+16]
-    add        r2d, r5d
-    mov    [r4+16], r2w
-
-%if cpuflag(sse4)
-    phminposuw m0, m0
-    movd      eax, m0
-%else
-    ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
-    paddusw    m0, m0
-    paddw      m0, [off(pw_s00001111)]
-    MOVHL      m1, m0
-    pminsw     m0, m1
-    pshuflw    m1, m0, q0032
-    mova       m2, m0
-    pminsw     m0, m1
-    pcmpgtw    m2, m1 ; 2nd index bit
-    movd      r3d, m0
-    movd      r4d, m2
-    ; repack with 3 bit index
-    xor       r3d, 0x80008000
-    and       r4d, 0x00020002
-    movzx     eax, r3w
-    movzx     r5d, r4w
-    shr       r3d, 16
-    shr       r4d, 16
-    lea       eax, [rax*4+r5]
-    lea       r3d, [ r3*4+r4+1]
-    cmp       eax, r3d
-    cmovg     eax, r3d
-    ; reverse to phminposuw order
-    mov       r3d, eax
-    and       eax, 7
-    shr       r3d, 3
-    shl       eax, 16
-    or        eax, r3d
-%endif
-    add       r2d, 8<<16
-    cmp        ax, r2w
-    cmovg     eax, r2d
-
-    mov       r2d, eax
-    shr       r2d, 16
-    shl       r2d, 6
-    add        r1, 4*FDEC_STRIDE
-    mova       m0, [pred_buf+r2+0x00]
-    mova       m1, [pred_buf+r2+0x10]
-    mova       m2, [pred_buf+r2+0x20]
-    mova       m3, [pred_buf+r2+0x30]
-    movq   [r1+FDEC_STRIDE*-4], m0
-    movhps [r1+FDEC_STRIDE*-2], m0
-    movq   [r1+FDEC_STRIDE*-3], m1
-    movhps [r1+FDEC_STRIDE*-1], m1
-    movq   [r1+FDEC_STRIDE* 0], m2
-    movhps [r1+FDEC_STRIDE* 2], m2
-    movq   [r1+FDEC_STRIDE* 1], m3
-    movhps [r1+FDEC_STRIDE* 3], m3
-    ADD       rsp, pad
-    RET
-
-ALIGN 16
-.sa8d:
-    %xdefine mret m0
-    %xdefine fenc_buf fenc_buf+gprsize
-    mova [r5+0x00], m0
-    mova [r5+0x10], m1
-    mova [r5+0x20], m2
-    mova [r5+0x30], m3
-    movddup     m4, m0
-    movddup     m5, m1
-    movddup     m6, m2
-    movddup     m7, m3
-    punpckhqdq  m0, m0
-    punpckhqdq  m1, m1
-    punpckhqdq  m2, m2
-    punpckhqdq  m3, m3
-    PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
-    pmaddubsw   m0, m15
-    pmaddubsw   m1, m15
-    psubw       m0, [fenc_buf+0x00]
-    psubw       m1, [fenc_buf+0x10]
-    pmaddubsw   m2, m15
-    pmaddubsw   m3, m15
-    psubw       m2, [fenc_buf+0x20]
-    psubw       m3, [fenc_buf+0x30]
-    pmaddubsw   m4, m15
-    pmaddubsw   m5, m15
-    psubw       m4, [fenc_buf+0x40]
-    psubw       m5, [fenc_buf+0x50]
-    pmaddubsw   m6, m15
-    pmaddubsw   m7, m15
-    psubw       m6, [fenc_buf+0x60]
-    psubw       m7, [fenc_buf+0x70]
-    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
-    paddw       m0, m1
-    paddw       m0, m2
-    paddw mret, m0, m3
-    ret
-%endif ; ARCH_X86_64
-%endmacro ; INTRA8_X9
-
-; in:  r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
-; out: [tmp]=hadamard4, m0=satd
-INIT_MMX mmx2
-cglobal hadamard_ac_4x4
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+r1]
-    mova      m2, [r0+r1*2]
-    mova      m3, [r0+r2]
-%else ; !HIGH_BIT_DEPTH
-    movh      m0, [r0]
-    movh      m1, [r0+r1]
-    movh      m2, [r0+r1*2]
-    movh      m3, [r0+r2]
-    punpcklbw m0, m7
-    punpcklbw m1, m7
-    punpcklbw m2, m7
-    punpcklbw m3, m7
-%endif ; HIGH_BIT_DEPTH
-    HADAMARD4_2D 0, 1, 2, 3, 4
-    mova [r3],    m0
-    mova [r3+8],  m1
-    mova [r3+16], m2
-    mova [r3+24], m3
-    ABSW      m0, m0, m4
-    ABSW      m1, m1, m4
-    pand      m0, m6
-    ABSW      m2, m2, m4
-    ABSW      m3, m3, m4
-    paddw     m0, m1
-    paddw     m2, m3
-    paddw     m0, m2
-    SAVE_MM_PERMUTATION
-    ret
-
-cglobal hadamard_ac_2x2max
-    mova      m0, [r3+0x00]
-    mova      m1, [r3+0x20]
-    mova      m2, [r3+0x40]
-    mova      m3, [r3+0x60]
-    sub       r3, 8
-    SUMSUB_BADC w, 0, 1, 2, 3, 4
-    ABSW2 m0, m2, m0, m2, m4, m5
-    ABSW2 m1, m3, m1, m3, m4, m5
-    HADAMARD 0, max, 0, 2, 4, 5
-    HADAMARD 0, max, 1, 3, 4, 5
-%if HIGH_BIT_DEPTH
-    pmaddwd   m0, m7
-    pmaddwd   m1, m7
-    paddd     m6, m0
-    paddd     m6, m1
-%else ; !HIGH_BIT_DEPTH
-    paddw     m7, m0
-    paddw     m7, m1
-%endif ; HIGH_BIT_DEPTH
-    SAVE_MM_PERMUTATION
-    ret
-
-%macro AC_PREP 2
-%if HIGH_BIT_DEPTH
-    pmaddwd %1, %2
-%endif
-%endmacro
-
-%macro AC_PADD 3
-%if HIGH_BIT_DEPTH
-    AC_PREP %2, %3
-    paddd   %1, %2
-%else
-    paddw   %1, %2
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-cglobal hadamard_ac_8x8
-    mova      m6, [mask_ac4]
-%if HIGH_BIT_DEPTH
-    mova      m7, [pw_1]
-%else
-    pxor      m7, m7
-%endif ; HIGH_BIT_DEPTH
-    call hadamard_ac_4x4_mmx2
-    add       r0, 4*SIZEOF_PIXEL
-    add       r3, 32
-    mova      m5, m0
-    AC_PREP   m5, m7
-    call hadamard_ac_4x4_mmx2
-    lea       r0, [r0+4*r1]
-    add       r3, 64
-    AC_PADD   m5, m0, m7
-    call hadamard_ac_4x4_mmx2
-    sub       r0, 4*SIZEOF_PIXEL
-    sub       r3, 32
-    AC_PADD   m5, m0, m7
-    call hadamard_ac_4x4_mmx2
-    AC_PADD   m5, m0, m7
-    sub       r3, 40
-    mova [rsp+gprsize+8], m5 ; save satd
-%if HIGH_BIT_DEPTH
-    pxor      m6, m6
-%endif
-%rep 3
-    call hadamard_ac_2x2max_mmx2
-%endrep
-    mova      m0, [r3+0x00]
-    mova      m1, [r3+0x20]
-    mova      m2, [r3+0x40]
-    mova      m3, [r3+0x60]
-    SUMSUB_BADC w, 0, 1, 2, 3, 4
-    HADAMARD 0, sumsub, 0, 2, 4, 5
-    ABSW2 m1, m3, m1, m3, m4, m5
-    ABSW2 m0, m2, m0, m2, m4, m5
-    HADAMARD 0, max, 1, 3, 4, 5
-%if HIGH_BIT_DEPTH
-    pand      m0, [mask_ac4]
-    pmaddwd   m1, m7
-    pmaddwd   m0, m7
-    pmaddwd   m2, m7
-    paddd     m6, m1
-    paddd     m0, m2
-    paddd     m6, m6
-    paddd     m0, m6
-    SWAP       0,  6
-%else ; !HIGH_BIT_DEPTH
-    pand      m6, m0
-    paddw     m7, m1
-    paddw     m6, m2
-    paddw     m7, m7
-    paddw     m6, m7
-%endif ; HIGH_BIT_DEPTH
-    mova [rsp+gprsize], m6 ; save sa8d
-    SWAP       0,  6
-    SAVE_MM_PERMUTATION
-    ret
-
-%macro HADAMARD_AC_WXH_SUM_MMX 2
-    mova    m1, [rsp+1*mmsize]
-%if HIGH_BIT_DEPTH
-%if %1*%2 >= 128
-    paddd   m0, [rsp+2*mmsize]
-    paddd   m1, [rsp+3*mmsize]
-%endif
-%if %1*%2 == 256
-    mova    m2, [rsp+4*mmsize]
-    paddd   m1, [rsp+5*mmsize]
-    paddd   m2, [rsp+6*mmsize]
-    mova    m3, m0
-    paddd   m1, [rsp+7*mmsize]
-    paddd   m0, m2
-%endif
-    psrld   m0, 1
-    HADDD   m0, m2
-    psrld   m1, 1
-    HADDD   m1, m3
-%else ; !HIGH_BIT_DEPTH
-%if %1*%2 >= 128
-    paddusw m0, [rsp+2*mmsize]
-    paddusw m1, [rsp+3*mmsize]
-%endif
-%if %1*%2 == 256
-    mova    m2, [rsp+4*mmsize]
-    paddusw m1, [rsp+5*mmsize]
-    paddusw m2, [rsp+6*mmsize]
-    mova    m3, m0
-    paddusw m1, [rsp+7*mmsize]
-    pxor    m3, m2
-    pand    m3, [pw_1]
-    pavgw   m0, m2
-    psubusw m0, m3
-    HADDUW  m0, m2
-%else
-    psrlw   m0, 1
-    HADDW   m0, m2
-%endif
-    psrlw   m1, 1
-    HADDW   m1, m3
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro HADAMARD_AC_WXH_MMX 2
-cglobal pixel_hadamard_ac_%1x%2, 2,4
-    %assign pad 16-gprsize-(stack_offset&15)
-    %define ysub r1
-    FIX_STRIDES r1
-    sub  rsp, 16+128+pad
-    lea  r2, [r1*3]
-    lea  r3, [rsp+16]
-    call hadamard_ac_8x8_mmx2
-%if %2==16
-    %define ysub r2
-    lea  r0, [r0+r1*4]
-    sub  rsp, 16
-    call hadamard_ac_8x8_mmx2
-%endif
-%if %1==16
-    neg  ysub
-    sub  rsp, 16
-    lea  r0, [r0+ysub*4+8*SIZEOF_PIXEL]
-    neg  ysub
-    call hadamard_ac_8x8_mmx2
-%if %2==16
-    lea  r0, [r0+r1*4]
-    sub  rsp, 16
-    call hadamard_ac_8x8_mmx2
-%endif
-%endif
-    HADAMARD_AC_WXH_SUM_MMX %1, %2
-    movd edx, m0
-    movd eax, m1
-    shr  edx, 1
-%if ARCH_X86_64
-    shl  rdx, 32
-    add  rax, rdx
-%endif
-    add  rsp, 128+%1*%2/4+pad
-    RET
-%endmacro ; HADAMARD_AC_WXH_MMX
-
-HADAMARD_AC_WXH_MMX 16, 16
-HADAMARD_AC_WXH_MMX  8, 16
-HADAMARD_AC_WXH_MMX 16,  8
-HADAMARD_AC_WXH_MMX  8,  8
-
-%macro LOAD_INC_8x4W_SSE2 5
-%if HIGH_BIT_DEPTH
-    movu      m%1, [r0]
-    movu      m%2, [r0+r1]
-    movu      m%3, [r0+r1*2]
-    movu      m%4, [r0+r2]
-%ifidn %1, 0
-    lea       r0, [r0+r1*4]
-%endif
-%else ; !HIGH_BIT_DEPTH
-    movh      m%1, [r0]
-    movh      m%2, [r0+r1]
-    movh      m%3, [r0+r1*2]
-    movh      m%4, [r0+r2]
-%ifidn %1, 0
-    lea       r0, [r0+r1*4]
-%endif
-    punpcklbw m%1, m%5
-    punpcklbw m%2, m%5
-    punpcklbw m%3, m%5
-    punpcklbw m%4, m%5
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro LOAD_INC_8x4W_SSSE3 5
-    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
-%ifidn %1, 0
-    lea       r0, [r0+r1*4]
-%endif
-    HSUMSUB %1, %2, %3, %4, %5
-%endmacro
-
-%macro HADAMARD_AC_SSE2 0
-; in:  r0=pix, r1=stride, r2=stride*3
-; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
-cglobal hadamard_ac_8x8
-%if ARCH_X86_64
-    %define spill0 m8
-    %define spill1 m9
-    %define spill2 m10
-%else
-    %define spill0 [rsp+gprsize]
-    %define spill1 [rsp+gprsize+mmsize]
-    %define spill2 [rsp+gprsize+mmsize*2]
-%endif
-%if HIGH_BIT_DEPTH
-    %define vertical 1
-%elif cpuflag(ssse3) && notcpuflag(atom)
-    %define vertical 0
-    ;LOAD_INC loads sumsubs
-    mova      m7, [hmul_8p]
-%else
-    %define vertical 1
-    ;LOAD_INC only unpacks to words
-    pxor      m7, m7
-%endif
-    LOAD_INC_8x4W 0, 1, 2, 3, 7
-%if vertical
-    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
-%else
-    HADAMARD4_V 0, 1, 2, 3, 4
-%endif
-    mova  spill0, m1
-    SWAP 1, 7
-    LOAD_INC_8x4W 4, 5, 6, 7, 1
-%if vertical
-    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
-%else
-    HADAMARD4_V 4, 5, 6, 7, 1
-    ; FIXME SWAP
-    mova      m1, spill0
-    mova      spill0, m6
-    mova      spill1, m7
-    HADAMARD 1, sumsub, 0, 1, 6, 7
-    HADAMARD 1, sumsub, 2, 3, 6, 7
-    mova      m6, spill0
-    mova      m7, spill1
-    mova      spill0, m1
-    mova      spill1, m0
-    HADAMARD 1, sumsub, 4, 5, 1, 0
-    HADAMARD 1, sumsub, 6, 7, 1, 0
-    mova      m0, spill1
-%endif
-    mova  spill1, m2
-    mova  spill2, m3
-    ABSW      m1, m0, m0
-    ABSW      m2, m4, m4
-    ABSW      m3, m5, m5
-    paddw     m1, m2
-    SUMSUB_BA w, 0, 4
-%if vertical
-    pand      m1, [mask_ac4]
-%else
-    pand      m1, [mask_ac4b]
-%endif
-    AC_PREP   m1, [pw_1]
-    ABSW      m2, spill0
-    AC_PADD   m1, m3, [pw_1]
-    ABSW      m3, spill1
-    AC_PADD   m1, m2, [pw_1]
-    ABSW      m2, spill2
-    AC_PADD   m1, m3, [pw_1]
-    ABSW      m3, m6, m6
-    AC_PADD   m1, m2, [pw_1]
-    ABSW      m2, m7, m7
-    AC_PADD   m1, m3, [pw_1]
-    AC_PADD   m1, m2, [pw_1]
-    paddw     m3, m7, spill2
-    psubw     m7, spill2
-    mova  [rsp+gprsize+mmsize*2], m1 ; save satd
-    paddw     m2, m6, spill1
-    psubw     m6, spill1
-    paddw     m1, m5, spill0
-    psubw     m5, spill0
-    %assign %%x 2
-%if vertical
-    %assign %%x 4
-%endif
-    mova  spill1, m4
-    HADAMARD %%x, amax, 3, 7, 4
-    HADAMARD %%x, amax, 2, 6, 7, 4
-    mova      m4, spill1
-    HADAMARD %%x, amax, 1, 5, 6, 7
-    HADAMARD %%x, sumsub, 0, 4, 5, 6
-    AC_PREP   m2, [pw_1]
-    AC_PADD   m2, m3, [pw_1]
-    AC_PADD   m2, m1, [pw_1]
-%if HIGH_BIT_DEPTH
-    paddd     m2, m2
-%else
-    paddw     m2, m2
-%endif ; HIGH_BIT_DEPTH
-    ABSW      m4, m4, m7
-    pand      m0, [mask_ac8]
-    ABSW      m0, m0, m7
-    AC_PADD   m2, m4, [pw_1]
-    AC_PADD   m2, m0, [pw_1]
-    mova [rsp+gprsize+mmsize], m2 ; save sa8d
-    SWAP       0, 2
-    SAVE_MM_PERMUTATION
-    ret
-
-HADAMARD_AC_WXH_SSE2 16, 16
-HADAMARD_AC_WXH_SSE2 16,  8
-%if mmsize <= 16
-HADAMARD_AC_WXH_SSE2  8, 16
-HADAMARD_AC_WXH_SSE2  8,  8
-%endif
-%endmacro ; HADAMARD_AC_SSE2
-
-%macro HADAMARD_AC_WXH_SUM_SSE2 2
-    mova    m1, [rsp+2*mmsize]
-%if HIGH_BIT_DEPTH
-%if %1*%2 >= 128
-    paddd   m0, [rsp+3*mmsize]
-    paddd   m1, [rsp+4*mmsize]
-%endif
-%if %1*%2 == 256
-    paddd   m0, [rsp+5*mmsize]
-    paddd   m1, [rsp+6*mmsize]
-    paddd   m0, [rsp+7*mmsize]
-    paddd   m1, [rsp+8*mmsize]
-    psrld   m0, 1
-%endif
-    HADDD  xm0, xm2
-    HADDD  xm1, xm3
-%else ; !HIGH_BIT_DEPTH
-%if %1*%2*16/mmsize >= 128
-    paddusw m0, [rsp+3*mmsize]
-    paddusw m1, [rsp+4*mmsize]
-%endif
-%if %1*%2*16/mmsize == 256
-    paddusw m0, [rsp+5*mmsize]
-    paddusw m1, [rsp+6*mmsize]
-    paddusw m0, [rsp+7*mmsize]
-    paddusw m1, [rsp+8*mmsize]
-    psrlw   m0, 1
-%endif
-%if mmsize==32
-    vextracti128 xm2, m0, 1
-    vextracti128 xm3, m1, 1
-    paddusw xm0, xm2
-    paddusw xm1, xm3
-%endif
-    HADDUW xm0, xm2
-    HADDW  xm1, xm3
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
-%macro HADAMARD_AC_WXH_SSE2 2
-cglobal pixel_hadamard_ac_%1x%2, 2,4,11
-    %define ysub r1
-    FIX_STRIDES r1
-    mov   r3, rsp
-    and  rsp, ~(mmsize-1)
-    sub  rsp, mmsize*3
-    lea   r2, [r1*3]
-    call hadamard_ac_8x8
-%if %2==16
-    %define ysub r2
-    lea   r0, [r0+r1*4]
-    sub  rsp, mmsize*2
-    call hadamard_ac_8x8
-%endif
-%if %1==16 && mmsize <= 16
-    neg  ysub
-    sub  rsp, mmsize*2
-    lea   r0, [r0+ysub*4+8*SIZEOF_PIXEL]
-    neg  ysub
-    call hadamard_ac_8x8
-%if %2==16
-    lea   r0, [r0+r1*4]
-    sub  rsp, mmsize*2
-    call hadamard_ac_8x8
-%endif
-%endif
-    HADAMARD_AC_WXH_SUM_SSE2 %1, %2
-    movd edx, xm0
-    movd eax, xm1
-    shr  edx, 2 - (%1*%2*16/mmsize >> 8)
-    shr  eax, 1
-%if ARCH_X86_64
-    shl  rdx, 32
-    add  rax, rdx
-%endif
-    mov  rsp, r3
-    RET
-%endmacro ; HADAMARD_AC_WXH_SSE2
-
-; instantiate satds
-
-%if ARCH_X86_64 == 0
-cextern pixel_sa8d_8x8_internal_mmx2
-INIT_MMX mmx2
-SA8D
-%endif
-
-%define TRANS TRANS_SSE2
-%define DIFFOP DIFF_UNPACK_SSE2
-%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
-%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
-%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSE2
-%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
-%define movdqu movups
-%define punpcklqdq movlhps
-INIT_XMM sse2
-SA8D
-SATDS_SSE2
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-%if HIGH_BIT_DEPTH == 0
-INTRA_SA8D_SSE2
-%endif
-INIT_MMX mmx2
-INTRA_X3_MMX
-INIT_XMM sse2
-HADAMARD_AC_SSE2
-
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM ssse3,atom
-SATDS_SSE2
-SA8D
-HADAMARD_AC_SSE2
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-%endif
-
-%define DIFFOP DIFF_SUMSUB_SSSE3
-%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
-%if HIGH_BIT_DEPTH == 0
-%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
-%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
-%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSSE3
-%endif
-INIT_XMM ssse3
-SATDS_SSE2
-SA8D
-HADAMARD_AC_SSE2
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-%if HIGH_BIT_DEPTH == 0
-INTRA_X9
-INTRA8_X9
-%endif
-%undef movdqa ; nehalem doesn't like movaps
-%undef movdqu ; movups
-%undef punpcklqdq ; or movlhps
-%if HIGH_BIT_DEPTH == 0
-INIT_MMX ssse3
-INTRA_X3_MMX
-%endif
-
-%define TRANS TRANS_SSE4
-%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
-INIT_XMM sse4
-SATDS_SSE2
-SA8D
-HADAMARD_AC_SSE2
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-%if HIGH_BIT_DEPTH == 0
-INTRA_X9
-INTRA8_X9
-%endif
-
-; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
-; it's effectively free.
-%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
-INIT_XMM avx
-SATDS_SSE2
-SA8D
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-%if HIGH_BIT_DEPTH == 0
-INTRA_X9
-INTRA8_X9
-%endif
-HADAMARD_AC_SSE2
-
-%define TRANS TRANS_XOP
-INIT_XMM xop
-SATDS_SSE2
-SA8D
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-%if HIGH_BIT_DEPTH == 0
-INTRA_X9
-; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
-%endif
-HADAMARD_AC_SSE2
-
-
-%if HIGH_BIT_DEPTH == 0
-%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
-%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
-%define TRANS TRANS_SSE4
-INIT_YMM avx2
-HADAMARD_AC_SSE2
-%if ARCH_X86_64
-SA8D_SATD
-%endif
-
-%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul]
-    movq   xm%1, [r0]
-    movq   xm%3, [r2]
-    movq   xm%2, [r0+r1]
-    movq   xm%4, [r2+r3]
-    vinserti128 m%1, m%1, [r0+4*r1], 1
-    vinserti128 m%3, m%3, [r2+4*r3], 1
-    vinserti128 m%2, m%2, [r0+r4], 1
-    vinserti128 m%4, m%4, [r2+r5], 1
-    punpcklqdq m%1, m%1
-    punpcklqdq m%3, m%3
-    punpcklqdq m%2, m%2
-    punpcklqdq m%4, m%4
-    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
-    lea      r0, [r0+2*r1]
-    lea      r2, [r2+2*r3]
-
-    movq   xm%3, [r0]
-    movq   xm%5, [r2]
-    movq   xm%4, [r0+r1]
-    movq   xm%6, [r2+r3]
-    vinserti128 m%3, m%3, [r0+4*r1], 1
-    vinserti128 m%5, m%5, [r2+4*r3], 1
-    vinserti128 m%4, m%4, [r0+r4], 1
-    vinserti128 m%6, m%6, [r2+r5], 1
-    punpcklqdq m%3, m%3
-    punpcklqdq m%5, m%5
-    punpcklqdq m%4, m%4
-    punpcklqdq m%6, m%6
-    DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
-%endmacro
-
-%macro SATD_START_AVX2 2-3 0
-    FIX_STRIDES r1, r3
-%if %3
-    mova    %2, [hmul_8p]
-    lea     r4, [5*r1]
-    lea     r5, [5*r3]
-%else
-    mova    %2, [hmul_16p]
-    lea     r4, [3*r1]
-    lea     r5, [3*r3]
-%endif
-    pxor    %1, %1
-%endmacro
-
-%define TRANS TRANS_SSE4
-INIT_YMM avx2
-cglobal pixel_satd_16x8_internal
-    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
-    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
-    ret
-
-cglobal pixel_satd_16x16, 4,6,8
-    SATD_START_AVX2 m6, m7
-    call pixel_satd_16x8_internal
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-pixel_satd_16x8_internal:
-    call pixel_satd_16x8_internal
-    vextracti128 xm0, m6, 1
-    paddw        xm0, xm6
-    SATD_END_SSE2 xm0
-    RET
-
-cglobal pixel_satd_16x8, 4,6,8
-    SATD_START_AVX2 m6, m7
-    jmp pixel_satd_16x8_internal
-
-cglobal pixel_satd_8x8_internal
-    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
-    ret
-
-cglobal pixel_satd_8x16, 4,6,8
-    SATD_START_AVX2 m6, m7, 1
-    call pixel_satd_8x8_internal
-    lea  r0, [r0+2*r1]
-    lea  r2, [r2+2*r3]
-    lea  r0, [r0+4*r1]
-    lea  r2, [r2+4*r3]
-    call pixel_satd_8x8_internal
-    vextracti128 xm0, m6, 1
-    paddw        xm0, xm6
-    SATD_END_SSE2 xm0
-    RET
-
-cglobal pixel_satd_8x8, 4,6,8
-    SATD_START_AVX2 m6, m7, 1
-    call pixel_satd_8x8_internal
-    vextracti128 xm0, m6, 1
-    paddw        xm0, xm6
-    SATD_END_SSE2 xm0
-    RET
-
-cglobal pixel_sa8d_8x8_internal
-    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
-    HADAMARD4_V 0, 1, 2, 3, 4
-    HADAMARD 8, sumsub, 0, 1, 4, 5
-    HADAMARD 8, sumsub, 2, 3, 4, 5
-    HADAMARD 2, sumsub, 0, 1, 4, 5
-    HADAMARD 2, sumsub, 2, 3, 4, 5
-    HADAMARD 1, amax, 0, 1, 4, 5
-    HADAMARD 1, amax, 2, 3, 4, 5
-    paddw  m6, m0
-    paddw  m6, m2
-    ret
-
-cglobal pixel_sa8d_8x8, 4,6,8
-    SATD_START_AVX2 m6, m7, 1
-    call pixel_sa8d_8x8_internal
-    vextracti128 xm1, m6, 1
-    paddw xm6, xm1
-    HADDW xm6, xm1
-    movd  eax, xm6
-    add   eax, 1
-    shr   eax, 1
-    RET
-
-cglobal intra_sad_x9_8x8, 5,7,8
-    %define pred(i,j) [rsp+i*0x40+j*0x20]
-
-    mov         r6, rsp
-    and        rsp, ~31
-    sub        rsp, 0x240
-    movu        m5, [r0+0*FENC_STRIDE]
-    movu        m6, [r0+4*FENC_STRIDE]
-    punpcklqdq  m5, [r0+2*FENC_STRIDE]
-    punpcklqdq  m6, [r0+6*FENC_STRIDE]
-
-    ; save instruction size: avoid 4-byte memory offsets
-    lea         r0, [intra8x9_h1+128]
-    %define off(m) (r0+m-(intra8x9_h1+128))
-
-    vpbroadcastq m0, [r2+16]
-    psadbw      m4, m0, m5
-    psadbw      m2, m0, m6
-    mova pred(0,0), m0
-    mova pred(0,1), m0
-    paddw       m4, m2
-
-    vpbroadcastq m1, [r2+7]
-    pshufb      m3, m1, [off(intra8x9_h1)]
-    pshufb      m2, m1, [off(intra8x9_h3)]
-    mova pred(1,0), m3
-    mova pred(1,1), m2
-    psadbw      m3, m5
-    psadbw      m2, m6
-    paddw       m3, m2
-
-    lea         r5, [rsp+0x100]
-    %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
-
-    ; combine the first two
-    pslldq      m3, 2
-    por         m4, m3
-
-    pxor        m2, m2
-    psadbw      m0, m2
-    psadbw      m1, m2
-    paddw       m0, m1
-    psrlw       m0, 3
-    pavgw       m0, m2
-    pshufb      m0, m2
-    mova pred(2,0), m0
-    mova pred(2,1), m0
-    psadbw      m3, m0, m5
-    psadbw      m2, m0, m6
-    paddw       m3, m2
-
-    pslldq      m3, 4
-    por         m4, m3
-
-    vbroadcasti128 m0, [r2+16]
-    vbroadcasti128 m2, [r2+17]
-    pslldq      m1, m0, 1
-    pavgb       m3, m0, m2
-    PRED4x4_LOWPASS m0, m1, m2, m0, m7
-    pshufb      m1, m0, [off(intra8x9_ddl1)]
-    pshufb      m2, m0, [off(intra8x9_ddl3)]
-    mova pred(3,0), m1
-    mova pred(3,1), m2
-    psadbw      m1, m5
-    psadbw      m2, m6
-    paddw       m1, m2
-
-    pslldq      m1, 6
-    por         m4, m1
-    vextracti128 xm1, m4, 1
-    paddw      xm4, xm1
-    mova      [r4], xm4
-
-    ; for later
-    vinserti128 m7, m3, xm0, 1
-
-    vbroadcasti128 m2, [r2+8]
-    vbroadcasti128 m0, [r2+7]
-    vbroadcasti128 m1, [r2+6]
-    pavgb       m3, m2, m0
-    PRED4x4_LOWPASS m0, m1, m2, m0, m4
-    pshufb      m1, m0, [off(intra8x9_ddr1)]
-    pshufb      m2, m0, [off(intra8x9_ddr3)]
-    mova pred(4,0), m1
-    mova pred(4,1), m2
-    psadbw      m4, m1, m5
-    psadbw      m2, m6
-    paddw       m4, m2
-
-    add         r0, 256
-    add         r5, 0xC0
-    %define off(m) (r0+m-(intra8x9_h1+256+128))
-    %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
-
-    vpblendd    m2, m3, m0, 11110011b
-    pshufb      m1, m2, [off(intra8x9_vr1)]
-    pshufb      m2, m2, [off(intra8x9_vr3)]
-    mova pred(5,0), m1
-    mova pred(5,1), m2
-    psadbw      m1, m5
-    psadbw      m2, m6
-    paddw       m1, m2
-
-    pslldq      m1, 2
-    por         m4, m1
-
-    psrldq      m2, m3, 4
-    pblendw     m2, m0, q3330
-    punpcklbw   m0, m3
-    pshufb      m1, m2, [off(intra8x9_hd1)]
-    pshufb      m2, m0, [off(intra8x9_hd3)]
-    mova pred(6,0), m1
-    mova pred(6,1), m2
-    psadbw      m1, m5
-    psadbw      m2, m6
-    paddw       m1, m2
-
-    pslldq      m1, 4
-    por         m4, m1
-
-    pshufb      m1, m7, [off(intra8x9_vl1)]
-    pshufb      m2, m7, [off(intra8x9_vl3)]
-    mova pred(7,0), m1
-    mova pred(7,1), m2
-    psadbw      m1, m5
-    psadbw      m2, m6
-    paddw       m1, m2
-
-    pslldq      m1, 6
-    por         m4, m1
-    vextracti128 xm1, m4, 1
-    paddw      xm4, xm1
-    mova       xm3, [r4]
-    SBUTTERFLY qdq, 3, 4, 7
-    paddw      xm3, xm4
-
-    pslldq      m1, m0, 1
-    vpbroadcastd m0, [r2+7]
-    palignr     m0, m1, 1
-    pshufb      m1, m0, [off(intra8x9_hu1)]
-    pshufb      m2, m0, [off(intra8x9_hu3)]
-    mova pred(8,0), m1
-    mova pred(8,1), m2
-    psadbw      m1, m5
-    psadbw      m2, m6
-    paddw       m1, m2
-    vextracti128 xm2, m1, 1
-    paddw      xm1, xm2
-    MOVHL      xm2, xm1
-    paddw      xm1, xm2
-    movd       r2d, xm1
-
-    paddw      xm3, [r3]
-    mova      [r4], xm3
-    add        r2w, word [r3+16]
-    mov    [r4+16], r2w
-
-    phminposuw xm3, xm3
-    movd       r3d, xm3
-    add        r2d, 8<<16
-    cmp        r3w, r2w
-    cmovg      r3d, r2d
-
-    mov        r2d, r3d
-    shr         r3, 16
-    shl         r3, 6
-    add         r1, 4*FDEC_STRIDE
-    mova       xm0, [rsp+r3+0x00]
-    mova       xm1, [rsp+r3+0x10]
-    mova       xm2, [rsp+r3+0x20]
-    mova       xm3, [rsp+r3+0x30]
-    movq   [r1+FDEC_STRIDE*-4], xm0
-    movhps [r1+FDEC_STRIDE*-2], xm0
-    movq   [r1+FDEC_STRIDE*-3], xm1
-    movhps [r1+FDEC_STRIDE*-1], xm1
-    movq   [r1+FDEC_STRIDE* 0], xm2
-    movhps [r1+FDEC_STRIDE* 2], xm2
-    movq   [r1+FDEC_STRIDE* 1], xm3
-    movhps [r1+FDEC_STRIDE* 3], xm3
-    mov        rsp, r6
-    mov        eax, r2d
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-;=============================================================================
-; SSIM
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
-;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
-;-----------------------------------------------------------------------------
-%macro SSIM_ITER 1
-%if HIGH_BIT_DEPTH
-    movu      m4, [r0+(%1&1)*r1]
-    movu      m5, [r2+(%1&1)*r3]
-%elif cpuflag(avx)
-    pmovzxbw  m4, [r0+(%1&1)*r1]
-    pmovzxbw  m5, [r2+(%1&1)*r3]
-%else
-    movq      m4, [r0+(%1&1)*r1]
-    movq      m5, [r2+(%1&1)*r3]
-    punpcklbw m4, m7
-    punpcklbw m5, m7
-%endif
-%if %1==1
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-%endif
-%if %1 == 0 && cpuflag(avx)
-    SWAP       0, 4
-    SWAP       1, 5
-    pmaddwd   m4, m0, m0
-    pmaddwd   m5, m1, m1
-    pmaddwd   m6, m0, m1
-%else
-%if %1 == 0
-    mova      m0, m4
-    mova      m1, m5
-%else
-    paddw     m0, m4
-    paddw     m1, m5
-%endif
-    pmaddwd   m6, m4, m5
-    pmaddwd   m4, m4
-    pmaddwd   m5, m5
-%endif
-    ACCUM  paddd, 2, 4, %1
-    ACCUM  paddd, 3, 6, %1
-    paddd     m2, m5
-%endmacro
-
-%macro SSIM 0
-%if HIGH_BIT_DEPTH
-cglobal pixel_ssim_4x4x2_core, 4,4,7
-    FIX_STRIDES r1, r3
-%else
-cglobal pixel_ssim_4x4x2_core, 4,4,7+notcpuflag(avx)
-%if notcpuflag(avx)
-    pxor      m7, m7
-%endif
-%endif
-    SSIM_ITER 0
-    SSIM_ITER 1
-    SSIM_ITER 2
-    SSIM_ITER 3
-%if UNIX64
-    DECLARE_REG_TMP 4
-%else
-    DECLARE_REG_TMP 0
-    mov       t0, r4mp
-%endif
-%if cpuflag(ssse3)
-    phaddw    m0, m1
-    pmaddwd   m0, [pw_1]
-    phaddd    m2, m3
-%else
-    mova      m4, [pw_1]
-    pmaddwd   m0, m4
-    pmaddwd   m1, m4
-    packssdw  m0, m1
-    shufps    m1, m2, m3, q2020
-    shufps    m2, m3, q3131
-    pmaddwd   m0, m4
-    paddd     m2, m1
-%endif
-    shufps    m1, m0, m2, q2020
-    shufps    m0, m2, q3131
-    mova    [t0], m1
-    mova [t0+16], m0
-    RET
-
-;-----------------------------------------------------------------------------
-; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
-;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 2,3
-    mov      r2d, r2m
-    mova      m0, [r0+ 0]
-    mova      m1, [r0+16]
-    mova      m2, [r0+32]
-    mova      m3, [r0+48]
-    mova      m4, [r0+64]
-    paddd     m0, [r1+ 0]
-    paddd     m1, [r1+16]
-    paddd     m2, [r1+32]
-    paddd     m3, [r1+48]
-    paddd     m4, [r1+64]
-    paddd     m0, m1
-    paddd     m1, m2
-    paddd     m2, m3
-    paddd     m3, m4
-    TRANSPOSE4x4D  0, 1, 2, 3, 4
-
-;   s1=m0, s2=m1, ss=m2, s12=m3
-%if BIT_DEPTH == 10
-    cvtdq2ps  m0, m0
-    cvtdq2ps  m1, m1
-    cvtdq2ps  m2, m2
-    cvtdq2ps  m3, m3
-    mulps     m4, m0, m1  ; s1*s2
-    mulps     m0, m0      ; s1*s1
-    mulps     m1, m1      ; s2*s2
-    mulps     m2, [pf_64] ; ss*64
-    mulps     m3, [pf_128] ; s12*128
-    addps     m4, m4      ; s1*s2*2
-    addps     m0, m1      ; s1*s1 + s2*s2
-    subps     m2, m0      ; vars
-    subps     m3, m4      ; covar*2
-    movaps    m1, [ssim_c1]
-    addps     m4, m1      ; s1*s2*2 + ssim_c1
-    addps     m0, m1      ; s1*s1 + s2*s2 + ssim_c1
-    movaps    m1, [ssim_c2]
-    addps     m2, m1      ; vars + ssim_c2
-    addps     m3, m1      ; covar*2 + ssim_c2
-%else
-    pmaddwd   m4, m1, m0  ; s1*s2
-    pslld     m1, 16
-    por       m0, m1
-    pmaddwd   m0, m0  ; s1*s1 + s2*s2
-    pslld     m4, 1
-    pslld     m3, 7
-    pslld     m2, 6
-    psubd     m3, m4  ; covar*2
-    psubd     m2, m0  ; vars
-    mova      m1, [ssim_c1]
-    paddd     m0, m1
-    paddd     m4, m1
-    mova      m1, [ssim_c2]
-    paddd     m3, m1
-    paddd     m2, m1
-    cvtdq2ps  m0, m0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
-    cvtdq2ps  m4, m4  ; (float)(s1*s2*2 + ssim_c1)
-    cvtdq2ps  m3, m3  ; (float)(covar*2 + ssim_c2)
-    cvtdq2ps  m2, m2  ; (float)(vars + ssim_c2)
-%endif
-    mulps     m4, m3
-    mulps     m0, m2
-    divps     m4, m0  ; ssim
-
-    cmp       r2d, 4
-    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
-    neg       r2
-
-%ifdef PIC
-    lea       r3, [mask_ff + 16]
-    %xdefine %%mask r3
-%else
-    %xdefine %%mask mask_ff + 16
-%endif
-%if cpuflag(avx)
-    andps     m4, [%%mask + r2*4]
-%else
-    movups    m0, [%%mask + r2*4]
-    andps     m4, m0
-%endif
-
-.skip:
-    movhlps   m0, m4
-    addps     m0, m4
-%if cpuflag(ssse3)
-    movshdup  m4, m0
-%else
-    pshuflw   m4, m0, q0032
-%endif
-    addss     m0, m4
-%if ARCH_X86_64 == 0
-    movss    r0m, m0
-    fld     dword r0m
-%endif
-    RET
-%endmacro ; SSIM
-
-INIT_XMM sse2
-SSIM
-INIT_XMM avx
-SSIM
-
-;-----------------------------------------------------------------------------
-; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-;-----------------------------------------------------------------------------
-%macro ASD8 0
-cglobal pixel_asd8, 5,5
-    pxor     m0, m0
-    pxor     m1, m1
-.loop:
-%if HIGH_BIT_DEPTH
-    paddw    m0, [r0]
-    paddw    m1, [r2]
-    paddw    m0, [r0+2*r1]
-    paddw    m1, [r2+2*r3]
-    lea      r0, [r0+4*r1]
-    paddw    m0, [r0]
-    paddw    m1, [r2+4*r3]
-    lea      r2, [r2+4*r3]
-    paddw    m0, [r0+2*r1]
-    paddw    m1, [r2+2*r3]
-    lea      r0, [r0+4*r1]
-    lea      r2, [r2+4*r3]
-%else
-    movq     m2, [r0]
-    movq     m3, [r2]
-    movhps   m2, [r0+r1]
-    movhps   m3, [r2+r3]
-    lea      r0, [r0+2*r1]
-    psadbw   m2, m1
-    psadbw   m3, m1
-    movq     m4, [r0]
-    movq     m5, [r2+2*r3]
-    lea      r2, [r2+2*r3]
-    movhps   m4, [r0+r1]
-    movhps   m5, [r2+r3]
-    lea      r0, [r0+2*r1]
-    paddw    m0, m2
-    psubw    m0, m3
-    psadbw   m4, m1
-    psadbw   m5, m1
-    lea      r2, [r2+2*r3]
-    paddw    m0, m4
-    psubw    m0, m5
-%endif
-    sub     r4d, 4
-    jg .loop
-%if HIGH_BIT_DEPTH
-    psubw    m0, m1
-    HADDW    m0, m1
-    ABSD     m1, m0
-%else
-    MOVHL    m1, m0
-    paddw    m0, m1
-    ABSW     m1, m0
-%endif
-    movd    eax, m1
-    RET
-%endmacro
-
-INIT_XMM sse2
-ASD8
-INIT_XMM ssse3
-ASD8
-%if HIGH_BIT_DEPTH
-INIT_XMM xop
-ASD8
-%endif
-
-;=============================================================================
-; Successive Elimination ADS
-;=============================================================================
-
-%macro ADS_START 0
-%if UNIX64
-    movsxd  r5, r5d
-%else
-    mov    r5d, r5m
-%endif
-    mov    r0d, r5d
-    lea     r6, [r4+r5+(mmsize-1)]
-    and     r6, ~(mmsize-1)
-    shl     r2d,  1
-%endmacro
-
-%macro ADS_END 1 ; unroll_size
-    add     r1, 8*%1
-    add     r3, 8*%1
-    add     r6, 4*%1
-    sub    r0d, 4*%1
-    jg .loop
-    WIN64_RESTORE_XMM rsp
-%if mmsize==32
-    vzeroupper
-%endif
-    lea     r6, [r4+r5+(mmsize-1)]
-    and     r6, ~(mmsize-1)
-%if cpuflag(ssse3)
-    jmp ads_mvs_ssse3
-%else
-    jmp ads_mvs_mmx
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
-;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_ads4, 5,7
-    mova    m6, [r0]
-    mova    m4, [r0+8]
-    pshufw  m7, m6, 0
-    pshufw  m6, m6, q2222
-    pshufw  m5, m4, 0
-    pshufw  m4, m4, q2222
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+16]
-    psubw     m0, m7
-    psubw     m1, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    movu      m2, [r1+r2]
-    movu      m3, [r1+r2+16]
-    psubw     m2, m5
-    psubw     m3, m4
-    paddw     m0, m1
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    pshufw    m1, r6m, 0
-    paddusw   m0, [r3]
-    psubusw   m1, m0
-    packsswb  m1, m1
-    movd    [r6], m1
-    ADS_END 1
-
-cglobal pixel_ads2, 5,7
-    mova      m6, [r0]
-    pshufw    m5, r6m, 0
-    pshufw    m7, m6, 0
-    pshufw    m6, m6, q2222
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+r2]
-    psubw     m0, m7
-    psubw     m1, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    paddw     m0, m1
-    paddusw   m0, [r3]
-    mova      m4, m5
-    psubusw   m4, m0
-    packsswb  m4, m4
-    movd    [r6], m4
-    ADS_END 1
-
-cglobal pixel_ads1, 5,7
-    pshufw    m7, [r0], 0
-    pshufw    m6, r6m, 0
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+8]
-    psubw     m0, m7
-    psubw     m1, m7
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    paddusw   m0, [r3]
-    paddusw   m1, [r3+8]
-    mova      m4, m6
-    mova      m5, m6
-    psubusw   m4, m0
-    psubusw   m5, m1
-    packsswb  m4, m5
-    mova    [r6], m4
-    ADS_END 2
-
-%macro ADS_XMM 0
-%if mmsize==32
-cglobal pixel_ads4, 5,7,8
-    vpbroadcastw m7, [r0+ 0]
-    vpbroadcastw m6, [r0+ 4]
-    vpbroadcastw m5, [r0+ 8]
-    vpbroadcastw m4, [r0+12]
-%else
-cglobal pixel_ads4, 5,7,12
-    mova      m4, [r0]
-    pshuflw   m7, m4, q0000
-    pshuflw   m6, m4, q2222
-    pshufhw   m5, m4, q0000
-    pshufhw   m4, m4, q2222
-    punpcklqdq m7, m7
-    punpcklqdq m6, m6
-    punpckhqdq m5, m5
-    punpckhqdq m4, m4
-%endif
-%if ARCH_X86_64 && mmsize == 16
-    movd      m8, r6m
-    SPLATW    m8, m8
-    ADS_START
-    movu     m10, [r1]
-    movu     m11, [r1+r2]
-.loop:
-    psubw     m0, m10, m7
-    movu     m10, [r1+16]
-    psubw     m1, m10, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    psubw     m2, m11, m5
-    movu     m11, [r1+r2+16]
-    paddw     m0, m1
-    psubw     m3, m11, m4
-    movu      m9, [r3]
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    paddusw   m0, m9
-    psubusw   m1, m8, m0
-%else
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+16]
-    psubw     m0, m7
-    psubw     m1, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    movu      m2, [r1+r2]
-    movu      m3, [r1+r2+16]
-    psubw     m2, m5
-    psubw     m3, m4
-    paddw     m0, m1
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    movu      m2, [r3]
-%if mmsize==32
-    vpbroadcastw m1, r6m
-%else
-    movd      m1, r6m
-    pshuflw   m1, m1, 0
-    punpcklqdq m1, m1
-%endif
-    paddusw   m0, m2
-    psubusw   m1, m0
-%endif ; ARCH
-    packsswb  m1, m1
-%if mmsize==32
-    vpermq    m1, m1, q3120
-    mova    [r6], xm1
-%else
-    movh    [r6], m1
-%endif
-    ADS_END mmsize/8
-
-cglobal pixel_ads2, 5,7,8
-%if mmsize==32
-    vpbroadcastw m7, [r0+0]
-    vpbroadcastw m6, [r0+4]
-    vpbroadcastw m5, r6m
-%else
-    movq      m6, [r0]
-    movd      m5, r6m
-    pshuflw   m7, m6, 0
-    pshuflw   m6, m6, q2222
-    pshuflw   m5, m5, 0
-    punpcklqdq m7, m7
-    punpcklqdq m6, m6
-    punpcklqdq m5, m5
-%endif
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+r2]
-    psubw     m0, m7
-    psubw     m1, m6
-    movu      m4, [r3]
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    paddw     m0, m1
-    paddusw   m0, m4
-    psubusw   m1, m5, m0
-    packsswb  m1, m1
-%if mmsize==32
-    vpermq    m1, m1, q3120
-    mova    [r6], xm1
-%else
-    movh    [r6], m1
-%endif
-    ADS_END mmsize/8
-
-cglobal pixel_ads1, 5,7,8
-%if mmsize==32
-    vpbroadcastw m7, [r0]
-    vpbroadcastw m6, r6m
-%else
-    movd      m7, [r0]
-    movd      m6, r6m
-    pshuflw   m7, m7, 0
-    pshuflw   m6, m6, 0
-    punpcklqdq m7, m7
-    punpcklqdq m6, m6
-%endif
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+mmsize]
-    psubw     m0, m7
-    psubw     m1, m7
-    movu      m2, [r3]
-    movu      m3, [r3+mmsize]
-    ABSW      m0, m0, m4
-    ABSW      m1, m1, m5
-    paddusw   m0, m2
-    paddusw   m1, m3
-    psubusw   m4, m6, m0
-    psubusw   m5, m6, m1
-    packsswb  m4, m5
-%if mmsize==32
-    vpermq    m4, m4, q3120
-%endif
-    mova    [r6], m4
-    ADS_END mmsize/4
-%endmacro
-
-INIT_XMM sse2
-ADS_XMM
-INIT_XMM ssse3
-ADS_XMM
-INIT_XMM avx
-ADS_XMM
-INIT_YMM avx2
-ADS_XMM
-
-; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
-; {
-;     int nmv=0, i, j;
-;     *(uint32_t*)(masks+width) = 0;
-;     for( i=0; i<width; i+=8 )
-;     {
-;         uint64_t mask = *(uint64_t*)(masks+i);
-;         if( !mask ) continue;
-;         for( j=0; j<8; j++ )
-;             if( mask & (255<<j*8) )
-;                 mvs[nmv++] = i+j;
-;     }
-;     return nmv;
-; }
-
-%macro TEST 1
-    mov     [r4+r0*2], r1w
-    test    r2d, 0xff<<(%1*8)
-    setne   r3b
-    add     r0d, r3d
-    inc     r1d
-%endmacro
-
-INIT_MMX mmx
-cglobal pixel_ads_mvs, 0,7,0
-ads_mvs_mmx:
-    ; mvs = r4
-    ; masks = r6
-    ; width = r5
-    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
-    xor     r0d, r0d
-    xor     r1d, r1d
-    mov     [r6+r5], r0d
-    jmp .loopi
-ALIGN 16
-.loopi0:
-    add     r1d, 8
-    cmp     r1d, r5d
-    jge .end
-.loopi:
-    mov     r2,  [r6+r1]
-%if ARCH_X86_64
-    test    r2,  r2
-%else
-    mov     r3,  r2
-    add    r3d, [r6+r1+4]
-%endif
-    jz .loopi0
-    xor     r3d, r3d
-    TEST 0
-    TEST 1
-    TEST 2
-    TEST 3
-%if ARCH_X86_64
-    shr     r2,  32
-%else
-    mov     r2d, [r6+r1]
-%endif
-    TEST 0
-    TEST 1
-    TEST 2
-    TEST 3
-    cmp     r1d, r5d
-    jl .loopi
-.end:
-    movifnidn eax, r0d
-    RET
-
-INIT_XMM ssse3
-cglobal pixel_ads_mvs, 0,7,0
-ads_mvs_ssse3:
-    mova      m3, [pw_8]
-    mova      m4, [pw_76543210]
-    pxor      m5, m5
-    add       r5, r6
-    xor      r0d, r0d ; nmv
-    mov     [r5], r0d
-%ifdef PIC
-    lea       r1, [$$]
-    %define GLOBAL +r1-$$
-%else
-    %define GLOBAL
-%endif
-.loop:
-    movh      m0, [r6]
-    pcmpeqb   m0, m5
-    pmovmskb r2d, m0
-    xor      r2d, 0xffff                         ; skipping if r2d is zero is slower (branch mispredictions)
-    movzx    r3d, byte [r2+popcnt_table GLOBAL]  ; popcnt
-    add      r2d, r2d
-    ; shuffle counters based on mv mask
-    pshufb    m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
-    movu [r4+r0*2], m2
-    add      r0d, r3d
-    paddw     m4, m3                             ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
-    add       r6, 8
-    cmp       r6, r5
-    jl .loop
-    movifnidn eax, r0d
-    RET
diff --git a/android/src/main/libenc/jni/libx264/common/x86/pixel.h b/android/src/main/libenc/jni/libx264/common/x86/pixel.h
deleted file mode 100755
index a8ed389..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/pixel.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/*****************************************************************************
- * pixel.h: x86 pixel metrics
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_I386_PIXEL_H
-#define X264_I386_PIXEL_H
-
-#define DECL_PIXELS( ret, name, suffix, args ) \
-    ret x264_pixel_##name##_16x16_##suffix args;\
-    ret x264_pixel_##name##_16x8_##suffix args;\
-    ret x264_pixel_##name##_8x16_##suffix args;\
-    ret x264_pixel_##name##_8x8_##suffix args;\
-    ret x264_pixel_##name##_8x4_##suffix args;\
-    ret x264_pixel_##name##_4x16_##suffix args;\
-    ret x264_pixel_##name##_4x8_##suffix args;\
-    ret x264_pixel_##name##_4x4_##suffix args;\
-
-#define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
-
-#define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
-
-DECL_X1( sad, mmx2 )
-DECL_X1( sad, sse2 )
-DECL_X1( sad, sse3 )
-DECL_X1( sad, sse2_aligned )
-DECL_X1( sad, ssse3 )
-DECL_X1( sad, ssse3_aligned )
-DECL_X1( sad, avx2 )
-DECL_X4( sad, mmx2 )
-DECL_X4( sad, sse2 )
-DECL_X4( sad, sse3 )
-DECL_X4( sad, ssse3 )
-DECL_X4( sad, xop )
-DECL_X4( sad, avx )
-DECL_X4( sad, avx2 )
-DECL_X1( ssd, mmx )
-DECL_X1( ssd, mmx2 )
-DECL_X1( ssd, sse2slow )
-DECL_X1( ssd, sse2 )
-DECL_X1( ssd, ssse3 )
-DECL_X1( ssd, avx )
-DECL_X1( ssd, xop )
-DECL_X1( ssd, avx2 )
-DECL_X1( satd, mmx2 )
-DECL_X1( satd, sse2 )
-DECL_X1( satd, ssse3 )
-DECL_X1( satd, ssse3_atom )
-DECL_X1( satd, sse4 )
-DECL_X1( satd, avx )
-DECL_X1( satd, xop )
-DECL_X1( satd, avx2 )
-DECL_X1( sa8d, mmx2 )
-DECL_X1( sa8d, sse2 )
-DECL_X1( sa8d, ssse3 )
-DECL_X1( sa8d, ssse3_atom )
-DECL_X1( sa8d, sse4 )
-DECL_X1( sa8d, avx )
-DECL_X1( sa8d, xop )
-DECL_X1( sa8d, avx2 )
-DECL_X1( sad, cache32_mmx2 );
-DECL_X1( sad, cache64_mmx2 );
-DECL_X1( sad, cache64_sse2 );
-DECL_X1( sad, cache64_ssse3 );
-DECL_X4( sad, cache32_mmx2 );
-DECL_X4( sad, cache64_mmx2 );
-DECL_X4( sad, cache64_sse2 );
-DECL_X4( sad, cache64_ssse3 );
-
-DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, avx,  ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, xop,  ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, mmx2,  ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse2,  ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse4,  ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, avx,   ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, xop,   ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, avx2,  ( pixel *pix, intptr_t i_stride ))
-
-
-void x264_intra_satd_x3_4x4_mmx2   ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_4x4_mmx2    ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_4x4_sse2    ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_4x4_ssse3   ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_4x4_avx     ( pixel   *, pixel   *, int * );
-void x264_intra_satd_x3_8x8c_mmx2  ( pixel   *, pixel   *, int * );
-void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_8x8c_mmx2   ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8c_sse2   ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8c_ssse3  ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8c_avx2   ( pixel   *, pixel   *, int * );
-void x264_intra_satd_x3_16x16_mmx2 ( pixel   *, pixel   *, int * );
-void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_16x16_mmx2  ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_16x16_sse2  ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_16x16_ssse3 ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_16x16_avx2  ( pixel   *, pixel   *, int * );
-void x264_intra_sa8d_x3_8x8_mmx2   ( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2   ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8_mmx2    ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8_sse2    ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8_ssse3   ( pixel   *, pixel   *, int * );
-void x264_intra_sad_x3_8x8_avx2    ( uint16_t*, uint16_t*, int * );
-int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_satd_x9_4x4_avx  ( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_satd_x9_4x4_xop  ( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_sad_x9_4x4_sse4  ( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_sad_x9_4x4_avx   ( uint8_t *, uint8_t *, uint16_t * );
-int x264_intra_sa8d_x9_8x8_ssse3( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-int x264_intra_sa8d_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-int x264_intra_sa8d_x9_8x8_avx  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-int x264_intra_sad_x9_8x8_sse4  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-int x264_intra_sad_x9_8x8_avx   ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-int x264_intra_sad_x9_8x8_avx2  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-
-void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
-                                    pixel *pixuv2, intptr_t stride2, int width,
-                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
-                                    pixel *pixuv2, intptr_t stride2, int width,
-                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
-                                    pixel *pixuv2, intptr_t stride2, int width,
-                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
-                                    pixel *pixuv2, intptr_t stride2, int width,
-                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1,
-                                      const uint8_t *pix2, intptr_t stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1,
-                                      const pixel *pix2, intptr_t stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
-                                      const pixel *pix2, intptr_t stride2, int sums[2][4] );
-float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
-int  x264_pixel_var2_8x8_mmx2  ( pixel *,   intptr_t, pixel *,   intptr_t, int * );
-int  x264_pixel_var2_8x8_sse2  ( pixel *,   intptr_t, pixel *,   intptr_t, int * );
-int  x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int  x264_pixel_var2_8x8_xop   ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int  x264_pixel_var2_8x8_avx2  ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int  x264_pixel_var2_8x16_mmx2 ( pixel *,   intptr_t, pixel *,   intptr_t, int * );
-int  x264_pixel_var2_8x16_sse2 ( pixel *,   intptr_t, pixel *,   intptr_t, int * );
-int  x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int  x264_pixel_var2_8x16_xop  ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int  x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int  x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
-int  x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
-int  x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
-int  x264_pixel_vsad_xop  ( pixel *src, intptr_t stride, int height );
-int  x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height );
-int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-int x264_pixel_asd8_xop  ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-uint64_t x264_pixel_sa8d_satd_16x16_sse2      ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_ssse3     ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_sse4      ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_avx       ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_xop       ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_avx2      ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-
-
-#define DECL_ADS( size, suffix ) \
-int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
-                                     uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
-DECL_ADS( 4, mmx2 )
-DECL_ADS( 2, mmx2 )
-DECL_ADS( 1, mmx2 )
-DECL_ADS( 4, sse2 )
-DECL_ADS( 2, sse2 )
-DECL_ADS( 1, sse2 )
-DECL_ADS( 4, ssse3 )
-DECL_ADS( 2, ssse3 )
-DECL_ADS( 1, ssse3 )
-DECL_ADS( 4, avx )
-DECL_ADS( 2, avx )
-DECL_ADS( 1, avx )
-DECL_ADS( 4, avx2 )
-DECL_ADS( 2, avx2 )
-DECL_ADS( 1, avx2 )
-
-#undef DECL_PIXELS
-#undef DECL_X1
-#undef DECL_X4
-#undef DECL_ADS
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/predict-a.asm b/android/src/main/libenc/jni/libx264/common/x86/predict-a.asm
deleted file mode 100755
index e8954e3..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/predict-a.asm
+++ /dev/null
@@ -1,2181 +0,0 @@
-;*****************************************************************************
-;* predict-a.asm: x86 intra prediction
-;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Holger Lubitz <holger@lubitz.org>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <henrik@gramner.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
-pw_m3:       times 16 dw -3
-pw_m7:       times 16 dw -7
-pb_00s_ff:   times 8 db 0
-pb_0s_ff:    times 7 db 0
-             db 0xff
-shuf_fixtr:  db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-shuf_nop:    db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-shuf_hu:     db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
-shuf_vr:     db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
-pw_reverse:  db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
-
-SECTION .text
-
-cextern pb_0
-cextern pb_1
-cextern pb_3
-cextern pw_1
-cextern pw_2
-cextern pw_4
-cextern pw_8
-cextern pw_16
-cextern pw_00ff
-cextern pw_pixel_max
-cextern pw_0to15
-
-%macro STORE8 1
-    mova [r0+0*FDEC_STRIDEB], %1
-    mova [r0+1*FDEC_STRIDEB], %1
-    add  r0, 4*FDEC_STRIDEB
-    mova [r0-2*FDEC_STRIDEB], %1
-    mova [r0-1*FDEC_STRIDEB], %1
-    mova [r0+0*FDEC_STRIDEB], %1
-    mova [r0+1*FDEC_STRIDEB], %1
-    mova [r0+2*FDEC_STRIDEB], %1
-    mova [r0+3*FDEC_STRIDEB], %1
-%endmacro
-
-%macro STORE16 1-4
-%if %0 > 1
-    mov  r1d, 2*%0
-.loop:
-    mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
-    mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
-    mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
-    mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
-%ifidn %0, 4
-    mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
-    mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
-    mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
-    mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
-    add  r0, 2*FDEC_STRIDEB
-%else ; %0 == 2
-    add  r0, 4*FDEC_STRIDEB
-    mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
-    mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
-    mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
-    mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
-%endif
-    dec  r1d
-    jg .loop
-%else ; %0 == 1
-    STORE8 %1
-%if HIGH_BIT_DEPTH ; Different code paths to reduce code size
-    add  r0, 6*FDEC_STRIDEB
-    mova [r0-2*FDEC_STRIDEB], %1
-    mova [r0-1*FDEC_STRIDEB], %1
-    mova [r0+0*FDEC_STRIDEB], %1
-    mova [r0+1*FDEC_STRIDEB], %1
-    add  r0, 4*FDEC_STRIDEB
-    mova [r0-2*FDEC_STRIDEB], %1
-    mova [r0-1*FDEC_STRIDEB], %1
-    mova [r0+0*FDEC_STRIDEB], %1
-    mova [r0+1*FDEC_STRIDEB], %1
-%else
-    add  r0, 8*FDEC_STRIDE
-    mova [r0-4*FDEC_STRIDE], %1
-    mova [r0-3*FDEC_STRIDE], %1
-    mova [r0-2*FDEC_STRIDE], %1
-    mova [r0-1*FDEC_STRIDE], %1
-    mova [r0+0*FDEC_STRIDE], %1
-    mova [r0+1*FDEC_STRIDE], %1
-    mova [r0+2*FDEC_STRIDE], %1
-    mova [r0+3*FDEC_STRIDE], %1
-%endif ; HIGH_BIT_DEPTH
-%endif
-%endmacro
-
-%macro PRED_H_LOAD 2 ; reg, offset
-%if cpuflag(avx2)
-    vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
-%elif HIGH_BIT_DEPTH
-    movd           %1, [r0+(%2)*FDEC_STRIDEB-4]
-    SPLATW         %1, %1, 1
-%else
-    SPLATB_LOAD    %1, r0+(%2)*FDEC_STRIDE-1, m2
-%endif
-%endmacro
-
-%macro PRED_H_STORE 3 ; reg, offset, width
-%assign %%w %3*SIZEOF_PIXEL
-%if %%w == 8
-    movq [r0+(%2)*FDEC_STRIDEB], %1
-%else
-    %assign %%i 0
-    %rep %%w/mmsize
-        mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
-    %assign %%i %%i+mmsize
-    %endrep
-%endif
-%endmacro
-
-%macro PRED_H_4ROWS 2 ; width, inc_ptr
-    PRED_H_LOAD  m0, 0
-    PRED_H_LOAD  m1, 1
-    PRED_H_STORE m0, 0, %1
-    PRED_H_STORE m1, 1, %1
-    PRED_H_LOAD  m0, 2
-%if %2
-    add          r0, 4*FDEC_STRIDEB
-%endif
-    PRED_H_LOAD  m1, 3-4*%2
-    PRED_H_STORE m0, 2-4*%2, %1
-    PRED_H_STORE m1, 3-4*%2, %1
-%endmacro
-
-; dest, left, right, src, tmp
-; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED8x8_LOWPASS 4-5
-%if HIGH_BIT_DEPTH
-    paddw       %2, %3
-    psrlw       %2, 1
-    pavgw       %1, %4, %2
-%else
-    mova        %5, %2
-    pavgb       %2, %3
-    pxor        %3, %5
-    pand        %3, [pb_1]
-    psubusb     %2, %3
-    pavgb       %1, %4, %2
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_h( pixel *src )
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM avx2
-cglobal predict_4x4_h, 1,1
-    PRED_H_4ROWS 4, 0
-    RET
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_ddl( pixel *src )
-;-----------------------------------------------------------------------------
-%macro PREDICT_4x4_DDL 0
-cglobal predict_4x4_ddl, 1,1
-    movu    m1, [r0-FDEC_STRIDEB]
-    PSLLPIX m2, m1, 1
-    mova    m0, m1
-%if HIGH_BIT_DEPTH
-    PSRLPIX m1, m1, 1
-    pshufhw m1, m1, q2210
-%else
-    pxor    m1, m2
-    PSRLPIX m1, m1, 1
-    pxor    m1, m0
-%endif
-    PRED8x8_LOWPASS m0, m2, m1, m0, m3
-
-%assign Y 0
-%rep 4
-    PSRLPIX m0, m0, 1
-    movh   [r0+Y*FDEC_STRIDEB], m0
-%assign Y (Y+1)
-%endrep
-
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_4x4_DDL
-INIT_XMM avx
-PREDICT_4x4_DDL
-INIT_MMX mmx2
-cglobal predict_4x4_ddl, 1,2
-    movu    m1, [r0-FDEC_STRIDEB+4]
-    PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
-    mova    m3, [r0-FDEC_STRIDEB+8]
-    mova    [r0+0*FDEC_STRIDEB], m0
-    pshufw  m4, m3, q3321
-    PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
-    mova    [r0+3*FDEC_STRIDEB], m2
-    pshufw  m1, m0, q0021
-    punpckldq m1, m2
-    mova    [r0+1*FDEC_STRIDEB], m1
-    psllq   m0, 16
-    PALIGNR m2, m0, 6, m0
-    mova    [r0+2*FDEC_STRIDEB], m2
-    RET
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX mmx2
-PREDICT_4x4_DDL
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_vr( pixel *src )
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH == 0
-INIT_MMX ssse3
-cglobal predict_4x4_vr, 1,1
-    movd    m1, [r0-1*FDEC_STRIDEB]        ; ........t3t2t1t0
-    mova    m4, m1
-    palignr m1, [r0-1*FDEC_STRIDEB-8], 7   ; ......t3t2t1t0lt
-    pavgb   m4, m1
-    palignr m1, [r0+0*FDEC_STRIDEB-8], 7   ; ....t3t2t1t0ltl0
-    mova    m0, m1
-    palignr m1, [r0+1*FDEC_STRIDEB-8], 7   ; ..t3t2t1t0ltl0l1
-    mova    m2, m1
-    palignr m1, [r0+2*FDEC_STRIDEB-8], 7   ; t3t2t1t0ltl0l1l2
-    PRED8x8_LOWPASS m2, m0, m1, m2, m3
-    pshufw  m0, m2, 0
-    psrlq   m2, 16
-    movd    [r0+0*FDEC_STRIDEB], m4
-    palignr m4, m0, 7
-    movd    [r0+1*FDEC_STRIDEB], m2
-    psllq   m0, 8
-    movd    [r0+2*FDEC_STRIDEB], m4
-    palignr m2, m0, 7
-    movd    [r0+3*FDEC_STRIDEB], m2
-    RET
-%endif ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_ddr( pixel *src )
-;-----------------------------------------------------------------------------
-%macro PREDICT_4x4 4
-cglobal predict_4x4_ddr, 1,1
-%if HIGH_BIT_DEPTH
-    movu      m2, [r0-1*FDEC_STRIDEB-8]
-    pinsrw    m2, [r0+0*FDEC_STRIDEB-2], 2
-    pinsrw    m2, [r0+1*FDEC_STRIDEB-2], 1
-    pinsrw    m2, [r0+2*FDEC_STRIDEB-2], 0
-    movhps    m3, [r0+3*FDEC_STRIDEB-8]
-%else ; !HIGH_BIT_DEPTH
-    movd      m0, [r0+2*FDEC_STRIDEB-4]
-    movd      m1, [r0+0*FDEC_STRIDEB-4]
-    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
-    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
-    punpckhwd m0, m1
-    movd      m2, [r0-1*FDEC_STRIDEB]
-%if cpuflag(ssse3)
-    palignr   m2, m0, 4
-%else
-    psllq     m2, 32
-    punpckhdq m0, m2
-    SWAP       2, 0
-%endif
-    movd      m3, [r0+3*FDEC_STRIDEB-4]
-    psllq     m3, 32
-%endif ; !HIGH_BIT_DEPTH
-
-    PSRLPIX   m1, m2, 1
-    mova      m0, m2
-    PALIGNR   m2, m3, 7*SIZEOF_PIXEL, m3
-    PRED8x8_LOWPASS m0, m2, m1, m0, m3
-%assign Y 3
-    movh      [r0+Y*FDEC_STRIDEB], m0
-%rep 3
-%assign Y (Y-1)
-    PSRLPIX   m0, m0, 1
-    movh      [r0+Y*FDEC_STRIDEB], m0
-%endrep
-    RET
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_vr( pixel *src )
-;-----------------------------------------------------------------------------
-cglobal predict_4x4_vr, 1,1
-%if HIGH_BIT_DEPTH
-    movu      m1, [r0-1*FDEC_STRIDEB-8]
-    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 2
-    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 1
-    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 0
-%else ; !HIGH_BIT_DEPTH
-    movd      m0, [r0+2*FDEC_STRIDEB-4]
-    movd      m1, [r0+0*FDEC_STRIDEB-4]
-    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
-    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
-    punpckhwd m0, m1
-    movd      m1, [r0-1*FDEC_STRIDEB]
-%if cpuflag(ssse3)
-    palignr   m1, m0, 4
-%else
-    psllq     m1, 32
-    punpckhdq m0, m1
-    SWAP       1, 0
-%endif
-%endif ; !HIGH_BIT_DEPTH
-    PSRLPIX   m2, m1, 1
-    PSRLPIX   m0, m1, 2
-    pavg%1    m4, m1, m2
-    PSRLPIX   m4, m4, 3
-    PRED8x8_LOWPASS m2, m0, m1, m2, m3
-    PSLLPIX   m0, m2, 6
-    PSRLPIX   m2, m2, 2
-    movh      [r0+0*FDEC_STRIDEB], m4
-    PALIGNR   m4, m0, 7*SIZEOF_PIXEL, m3
-    movh      [r0+1*FDEC_STRIDEB], m2
-    PSLLPIX   m0, m0, 1
-    movh      [r0+2*FDEC_STRIDEB], m4
-    PALIGNR   m2, m0, 7*SIZEOF_PIXEL, m0
-    movh      [r0+3*FDEC_STRIDEB], m2
-    RET
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_hd( pixel *src )
-;-----------------------------------------------------------------------------
-cglobal predict_4x4_hd, 1,1
-%if HIGH_BIT_DEPTH
-    movu      m1, [r0-1*FDEC_STRIDEB-8]
-    PSLLPIX   m1, m1, 1
-    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 3
-    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 2
-    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 1
-    pinsrw    m1, [r0+3*FDEC_STRIDEB-2], 0
-%else
-    movd      m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
-    punpckldq m0, [r0-1*FDEC_STRIDEB]   ; t3 t2 t1 t0 lt .. .. ..
-    PSLLPIX   m0, m0, 1                 ; t2 t1 t0 lt .. .. .. ..
-    movd      m1, [r0+3*FDEC_STRIDEB-4] ; l3
-    punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
-    movd      m2, [r0+1*FDEC_STRIDEB-4] ; l1
-    punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
-    punpckh%3 m1, m2                    ; l0 l1 l2 l3
-    punpckh%4 m1, m0                    ; t2 t1 t0 lt l0 l1 l2 l3
-%endif
-    PSRLPIX   m2, m1, 1                 ; .. t2 t1 t0 lt l0 l1 l2
-    PSRLPIX   m0, m1, 2                 ; .. .. t2 t1 t0 lt l0 l1
-    pavg%1    m5, m1, m2
-    PRED8x8_LOWPASS m3, m1, m0, m2, m4
-    punpckl%2 m5, m3
-    PSRLPIX   m3, m3, 4
-    PALIGNR   m3, m5, 6*SIZEOF_PIXEL, m4
-%assign Y 3
-    movh      [r0+Y*FDEC_STRIDEB], m5
-%rep 2
-%assign Y (Y-1)
-    PSRLPIX   m5, m5, 2
-    movh      [r0+Y*FDEC_STRIDEB], m5
-%endrep
-    movh      [r0+0*FDEC_STRIDEB], m3
-    RET
-%endmacro ; PREDICT_4x4
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_ddr( pixel *src )
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_MMX mmx2
-cglobal predict_4x4_ddr, 1,1
-    mova      m0, [r0+1*FDEC_STRIDEB-8]
-    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
-    mova      m3, [r0+3*FDEC_STRIDEB-8]
-    punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
-    punpckhdq m3, m0
-
-    pshufw  m0, m3, q3321
-    pinsrw  m0, [r0-1*FDEC_STRIDEB-2], 3
-    pshufw  m1, m0, q3321
-    PRED8x8_LOWPASS m0, m1, m3, m0
-    movq    [r0+3*FDEC_STRIDEB], m0
-
-    movq    m2, [r0-1*FDEC_STRIDEB-0]
-    pshufw  m4, m2, q2100
-    pinsrw  m4, [r0-1*FDEC_STRIDEB-2], 0
-    movq    m1, m4
-    PALIGNR m4, m3, 6, m3
-    PRED8x8_LOWPASS m1, m4, m2, m1
-    movq    [r0+0*FDEC_STRIDEB], m1
-
-    pshufw  m2, m0, q3321
-    punpckldq m2, m1
-    psllq   m0, 16
-    PALIGNR m1, m0, 6, m0
-    movq    [r0+1*FDEC_STRIDEB], m1
-    movq    [r0+2*FDEC_STRIDEB], m2
-    movd    [r0+3*FDEC_STRIDEB+4], m1
-    RET
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_hd( pixel *src )
-;-----------------------------------------------------------------------------
-cglobal predict_4x4_hd, 1,1
-    mova      m0, [r0+1*FDEC_STRIDEB-8]
-    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
-    mova      m1, [r0+3*FDEC_STRIDEB-8]
-    punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
-    punpckhdq m1, m0
-    mova      m0, m1
-
-    movu      m3, [r0-1*FDEC_STRIDEB-2]
-    pshufw    m4, m1, q0032
-    mova      m7, m3
-    punpckldq m4, m3
-    PALIGNR   m3, m1, 2, m2
-    PRED8x8_LOWPASS m2, m4, m1, m3
-
-    pavgw     m0, m3
-    punpcklwd m5, m0, m2
-    punpckhwd m4, m0, m2
-    mova      [r0+3*FDEC_STRIDEB], m5
-    mova      [r0+1*FDEC_STRIDEB], m4
-    psrlq     m5, 32
-    punpckldq m5, m4
-    mova      [r0+2*FDEC_STRIDEB], m5
-
-    pshufw    m4, m7, q2100
-    mova      m6, [r0-1*FDEC_STRIDEB+0]
-    pinsrw    m4, [r0+0*FDEC_STRIDEB-2], 0
-    PRED8x8_LOWPASS m3, m4, m6, m7
-    PALIGNR   m3, m0, 6, m0
-    mova      [r0+0*FDEC_STRIDEB], m3
-    RET
-
-INIT_XMM sse2
-PREDICT_4x4 w, wd, dq, qdq
-INIT_XMM ssse3
-PREDICT_4x4 w, wd, dq, qdq
-INIT_XMM avx
-PREDICT_4x4 w, wd, dq, qdq
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX mmx2
-PREDICT_4x4 b, bw, wd, dq
-INIT_MMX ssse3
-%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
-PREDICT_4x4 b, bw, wd, dq
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_hu( pixel *src )
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_4x4_hu_mmx2, 1,1
-    movq      m0, [r0+0*FDEC_STRIDEB-8]
-    punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
-    movq      m1, [r0+2*FDEC_STRIDEB-8]
-    punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
-    punpckhdq m0, m1
-    pshufw    m1, m1, q3333
-    movq      [r0+3*FDEC_STRIDEB], m1
-    pshufw    m3, m0, q3321
-    pshufw    m4, m0, q3332
-    pavgw     m2, m0, m3
-    PRED8x8_LOWPASS m3, m0, m4, m3
-    punpcklwd m4, m2, m3
-    mova      [r0+0*FDEC_STRIDEB], m4
-    psrlq     m2, 16
-    psrlq     m3, 16
-    punpcklwd m2, m3
-    mova      [r0+1*FDEC_STRIDEB], m2
-    punpckhdq m2, m1
-    mova      [r0+2*FDEC_STRIDEB], m2
-    RET
-
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_4x4_hu_mmx2, 1,1
-    movd      m1, [r0+0*FDEC_STRIDEB-4]
-    punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
-    movd      m0, [r0+2*FDEC_STRIDEB-4]
-    punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
-    punpckhwd m1, m0
-    movq      m0, m1
-    punpckhbw m1, m1
-    pshufw    m1, m1, q3333
-    punpckhdq m0, m1
-    movq      m2, m0
-    movq      m3, m0
-    movq      m5, m0
-    psrlq     m3, 8
-    psrlq     m2, 16
-    pavgb     m5, m3
-    PRED8x8_LOWPASS m3, m0, m2, m3, m4
-    movd      [r0+3*FDEC_STRIDEB], m1
-    punpcklbw m5, m3
-    movd      [r0+0*FDEC_STRIDEB], m5
-    psrlq     m5, 16
-    movd      [r0+1*FDEC_STRIDEB], m5
-    psrlq     m5, 16
-    movd      [r0+2*FDEC_STRIDEB], m5
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_vl( pixel *src )
-;-----------------------------------------------------------------------------
-%macro PREDICT_4x4_V1 1
-cglobal predict_4x4_vl, 1,1
-    movu        m1, [r0-FDEC_STRIDEB]
-    PSRLPIX     m3, m1, 1
-    PSRLPIX     m2, m1, 2
-    pavg%1      m4, m3, m1
-    PRED8x8_LOWPASS m0, m1, m2, m3, m5
-
-    movh        [r0+0*FDEC_STRIDEB], m4
-    movh        [r0+1*FDEC_STRIDEB], m0
-    PSRLPIX     m4, m4, 1
-    PSRLPIX     m0, m0, 1
-    movh        [r0+2*FDEC_STRIDEB], m4
-    movh        [r0+3*FDEC_STRIDEB], m0
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_4x4_V1 w
-INIT_XMM avx
-PREDICT_4x4_V1 w
-
-INIT_MMX mmx2
-cglobal predict_4x4_vl, 1,4
-    mova    m1, [r0-FDEC_STRIDEB+0]
-    mova    m2, [r0-FDEC_STRIDEB+8]
-    mova    m0, m2
-    PALIGNR m2, m1, 4, m4
-    PALIGNR m0, m1, 2, m4
-    mova    m3, m0
-    pavgw   m3, m1
-    mova    [r0+0*FDEC_STRIDEB], m3
-    psrlq   m3, 16
-    mova    [r0+2*FDEC_STRIDEB], m3
-    PRED8x8_LOWPASS m0, m1, m2, m0
-    mova    [r0+1*FDEC_STRIDEB], m0
-    psrlq   m0, 16
-    mova    [r0+3*FDEC_STRIDEB], m0
-
-    movzx   r1d, word [r0-FDEC_STRIDEB+ 8]
-    movzx   r2d, word [r0-FDEC_STRIDEB+10]
-    movzx   r3d, word [r0-FDEC_STRIDEB+12]
-    lea     r1d, [r1+r2+1]
-    add     r3d, r2d
-    lea     r3d, [r3+r1+1]
-    shr     r1d, 1
-    shr     r3d, 2
-    mov     [r0+2*FDEC_STRIDEB+6], r1w
-    mov     [r0+3*FDEC_STRIDEB+6], r3w
-    RET
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX mmx2
-PREDICT_4x4_V1 b
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_dc( pixel *src )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-%if HIGH_BIT_DEPTH
-cglobal predict_4x4_dc, 1,1
-    mova   m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
-    paddw  m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
-    paddw  m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
-    paddw  m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
-    psrlq  m2, 48
-    mova   m0, [r0-FDEC_STRIDEB]
-    HADDW  m0, m1
-    paddw  m0, [pw_4]
-    paddw  m0, m2
-    psrlw  m0, 3
-    SPLATW m0, m0
-    mova   [r0+0*FDEC_STRIDEB], m0
-    mova   [r0+1*FDEC_STRIDEB], m0
-    mova   [r0+2*FDEC_STRIDEB], m0
-    mova   [r0+3*FDEC_STRIDEB], m0
-    RET
-
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_4x4_dc, 1,4
-    pxor   mm7, mm7
-    movd   mm0, [r0-FDEC_STRIDEB]
-    psadbw mm0, mm7
-    movd   r3d, mm0
-    movzx  r1d, byte [r0-1]
-%assign Y 1
-%rep 3
-    movzx  r2d, byte [r0+FDEC_STRIDEB*Y-1]
-    add    r1d, r2d
-%assign Y Y+1
-%endrep
-    lea    r1d, [r1+r3+4]
-    shr    r1d, 3
-    imul   r1d, 0x01010101
-    mov   [r0+FDEC_STRIDEB*0], r1d
-    mov   [r0+FDEC_STRIDEB*1], r1d
-    mov   [r0+FDEC_STRIDEB*2], r1d
-    mov   [r0+FDEC_STRIDEB*3], r1d
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-%macro PREDICT_FILTER 4
-;-----------------------------------------------------------------------------
-;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_filter, 4,6,6
-    add          r0, 0x58*SIZEOF_PIXEL
-%define src r0-0x58*SIZEOF_PIXEL
-%if ARCH_X86_64 == 0
-    mov          r4, r1
-%define t1 r4
-%define t4 r1
-%else
-%define t1 r1
-%define t4 r4
-%endif
-    test       r3b, 1
-    je .check_top
-    mov        t4d, r2d
-    and        t4d, 8
-    neg         t4
-    mova        m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
-    mova        m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    punpckh%2%3 m1, m0
-    mova        m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    mova        m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    punpckh%2%3 m3, m2
-    punpckh%3%4 m3, m1
-    mova        m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    mova        m1, [src-1*FDEC_STRIDEB]
-    PALIGNR     m4, m3, m0, 7*SIZEOF_PIXEL, m0
-    PALIGNR     m1, m1, m3, 1*SIZEOF_PIXEL, m2
-    PRED8x8_LOWPASS m3, m1, m4, m3, m5
-    mova        [t1+8*SIZEOF_PIXEL], m3
-    movzx      t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
-    movzx      r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
-    lea        t4d, [t4*3+2]
-    add        t4d, r5d
-    shr        t4d, 2
-    mov         [t1+7*SIZEOF_PIXEL], t4%1
-    mov         [t1+6*SIZEOF_PIXEL], t4%1
-    test       r3b, 2
-    je .done
-.check_top:
-%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
-INIT_XMM cpuname
-    movu        m3, [src-1*FDEC_STRIDEB]
-    movhps      m0, [src-1*FDEC_STRIDEB-8]
-    test       r2b, 8
-    je .fix_lt_2
-.do_top:
-    and        r2d, 4
-%ifdef PIC
-    lea         r3, [shuf_fixtr]
-    pshufb      m3, [r3+r2*4]
-%else
-    pshufb      m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
-%endif
-    psrldq      m1, m3, 15
-    PALIGNR     m2, m3, m0, 15, m0
-    PALIGNR     m1, m3, 1, m5
-    PRED8x8_LOWPASS m0, m2, m1, m3, m5
-    mova        [t1+16*SIZEOF_PIXEL], m0
-    psrldq      m0, 15
-    movd        [t1+32*SIZEOF_PIXEL], m0
-.done:
-    REP_RET
-.fix_lt_2:
-    pslldq      m0, m3, 15
-    jmp .do_top
-
-%else
-    mova        m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
-    mova        m3, [src-1*FDEC_STRIDEB]
-    mova        m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
-    test       r2b, 8
-    je .fix_lt_2
-    test       r2b, 4
-    je .fix_tr_1
-.do_top:
-    PALIGNR     m2, m3, m0, 7*SIZEOF_PIXEL, m0
-    PALIGNR     m0, m1, m3, 1*SIZEOF_PIXEL, m5
-    PRED8x8_LOWPASS m4, m2, m0, m3, m5
-    mova        [t1+16*SIZEOF_PIXEL], m4
-    test       r3b, 4
-    je .done
-    PSRLPIX     m5, m1, 7
-    PALIGNR     m2, m1, m3, 7*SIZEOF_PIXEL, m3
-    PALIGNR     m5, m1, 1*SIZEOF_PIXEL, m4
-    PRED8x8_LOWPASS m0, m2, m5, m1, m4
-    mova        [t1+24*SIZEOF_PIXEL], m0
-    PSRLPIX     m0, m0, 7
-    movd        [t1+32*SIZEOF_PIXEL], m0
-.done:
-    REP_RET
-.fix_lt_2:
-    PSLLPIX     m0, m3, 7
-    test       r2b, 4
-    jne .do_top
-.fix_tr_1:
-    punpckh%1%2 m1, m3, m3
-    pshuf%2     m1, m1, q3333
-    jmp .do_top
-%endif
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_FILTER w, d, q, dq
-INIT_XMM ssse3
-PREDICT_FILTER w, d, q, dq
-INIT_XMM avx
-PREDICT_FILTER w, d, q, dq
-%else
-INIT_MMX mmx2
-PREDICT_FILTER b, w, d, q
-INIT_MMX ssse3
-PREDICT_FILTER b, w, d, q
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_v( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_V 0
-cglobal predict_8x8_v, 2,2
-    mova        m0, [r1+16*SIZEOF_PIXEL]
-    STORE8      m0
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse
-PREDICT_8x8_V
-%else
-INIT_MMX mmx2
-PREDICT_8x8_V
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_h( pixel *src, pixel edge[36] )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_H 2
-cglobal predict_8x8_h, 2,2
-    movu      m1, [r1+7*SIZEOF_PIXEL]
-    add       r0, 4*FDEC_STRIDEB
-    punpckl%1 m2, m1, m1
-    punpckh%1 m1, m1
-%assign Y 0
-%rep 8
-%assign i 1+Y/4
-    SPLAT%2 m0, m %+ i, (3-Y)&3
-    mova [r0+(Y-4)*FDEC_STRIDEB], m0
-%assign Y Y+1
-%endrep
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_8x8_H wd, D
-%else
-INIT_MMX mmx2
-PREDICT_8x8_H bw, W
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_dc( pixel *src, pixel *edge );
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal predict_8x8_dc, 2,2
-    movu        m0, [r1+14]
-    paddw       m0, [r1+32]
-    HADDW       m0, m1
-    paddw       m0, [pw_8]
-    psrlw       m0, 4
-    SPLATW      m0, m0
-    STORE8      m0
-    RET
-
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX mmx2
-cglobal predict_8x8_dc, 2,2
-    pxor        mm0, mm0
-    pxor        mm1, mm1
-    psadbw      mm0, [r1+7]
-    psadbw      mm1, [r1+16]
-    paddw       mm0, [pw_8]
-    paddw       mm0, mm1
-    psrlw       mm0, 4
-    pshufw      mm0, mm0, 0
-    packuswb    mm0, mm0
-    STORE8      mm0
-    RET
-%endif ; HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_dc_top ( pixel *src, pixel *edge );
-; void predict_8x8_dc_left( pixel *src, pixel *edge );
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-%macro PREDICT_8x8_DC 3
-cglobal %1, 2,2
-    %3          m0, [r1+%2]
-    HADDW       m0, m1
-    paddw       m0, [pw_4]
-    psrlw       m0, 3
-    SPLATW      m0, m0
-    STORE8      m0
-    RET
-%endmacro
-INIT_XMM sse2
-PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
-PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
-
-%else ; !HIGH_BIT_DEPTH
-%macro PREDICT_8x8_DC 2
-cglobal %1, 2,2
-    pxor        mm0, mm0
-    psadbw      mm0, [r1+%2]
-    paddw       mm0, [pw_4]
-    psrlw       mm0, 3
-    pshufw      mm0, mm0, 0
-    packuswb    mm0, mm0
-    STORE8      mm0
-    RET
-%endmacro
-INIT_MMX
-PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
-PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
-%endif ; HIGH_BIT_DEPTH
-
-; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
-; size on the 8-bit mmx functions below if we know sse2 is available.
-%macro PREDICT_8x8_DDLR 0
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddl( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl, 2,2,7
-    mova        m0, [r1+16*SIZEOF_PIXEL]
-    mova        m1, [r1+24*SIZEOF_PIXEL]
-%if cpuflag(cache64)
-    movd        m5, [r1+32*SIZEOF_PIXEL]
-    palignr     m3, m1, m0, 1*SIZEOF_PIXEL
-    palignr     m5, m5, m1, 1*SIZEOF_PIXEL
-    palignr     m4, m1, m0, 7*SIZEOF_PIXEL
-%else
-    movu        m3, [r1+17*SIZEOF_PIXEL]
-    movu        m4, [r1+23*SIZEOF_PIXEL]
-    movu        m5, [r1+25*SIZEOF_PIXEL]
-%endif
-    PSLLPIX     m2, m0, 1
-    add         r0, FDEC_STRIDEB*4
-    PRED8x8_LOWPASS m0, m2, m3, m0, m6
-    PRED8x8_LOWPASS m1, m4, m5, m1, m6
-    mova        [r0+3*FDEC_STRIDEB], m1
-%assign Y 2
-%rep 6
-    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m2
-    PSLLPIX     m0, m0, 1
-    mova        [r0+Y*FDEC_STRIDEB], m1
-%assign Y (Y-1)
-%endrep
-    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m0
-    mova        [r0+Y*FDEC_STRIDEB], m1
-    RET
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddr( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr, 2,2,7
-    add         r0, FDEC_STRIDEB*4
-    mova        m0, [r1+ 8*SIZEOF_PIXEL]
-    mova        m1, [r1+16*SIZEOF_PIXEL]
-    ; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit
-    movu        m2, [r1+ 7*SIZEOF_PIXEL]
-    movu        m5, [r1+17*SIZEOF_PIXEL]
-%if cpuflag(cache64)
-    palignr     m3, m1, m0, 1*SIZEOF_PIXEL
-    palignr     m4, m1, m0, 7*SIZEOF_PIXEL
-%else
-    movu        m3, [r1+ 9*SIZEOF_PIXEL]
-    movu        m4, [r1+15*SIZEOF_PIXEL]
-%endif
-    PRED8x8_LOWPASS m0, m2, m3, m0, m6
-    PRED8x8_LOWPASS m1, m4, m5, m1, m6
-    mova        [r0+3*FDEC_STRIDEB], m0
-%assign Y -4
-%rep 6
-    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m2
-    PSLLPIX     m0, m0, 1
-    mova        [r0+Y*FDEC_STRIDEB], m1
-%assign Y (Y+1)
-%endrep
-    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m0
-    mova        [r0+Y*FDEC_STRIDEB], m1
-    RET
-%endmacro ; PREDICT_8x8_DDLR
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_8x8_DDLR
-INIT_XMM ssse3
-PREDICT_8x8_DDLR
-INIT_XMM ssse3, cache64
-PREDICT_8x8_DDLR
-%elif ARCH_X86_64 == 0
-INIT_MMX mmx2
-PREDICT_8x8_DDLR
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_hu( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HU 2
-cglobal predict_8x8_hu, 2,2,8
-    add       r0, 4*FDEC_STRIDEB
-%if HIGH_BIT_DEPTH
-%if cpuflag(ssse3)
-    movu      m5, [r1+7*SIZEOF_PIXEL]
-    pshufb    m5, [pw_reverse]
-%else
-    movq      m6, [r1+7*SIZEOF_PIXEL]
-    movq      m5, [r1+11*SIZEOF_PIXEL]
-    pshuflw   m6, m6, q0123
-    pshuflw   m5, m5, q0123
-    movlhps   m5, m6
-%endif ; cpuflag
-    psrldq    m2, m5, 2
-    pshufd    m3, m5, q0321
-    pshufhw   m2, m2, q2210
-    pshufhw   m3, m3, q1110
-    pavgw     m4, m5, m2
-%else ; !HIGH_BIT_DEPTH
-    movu      m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
-    pshufw    m0, m1, q0123           ; l6 l7 l4 l5 l2 l3 l0 l1
-    psllq     m1, 56                  ; l7 .. .. .. .. .. .. ..
-    mova      m2, m0
-    psllw     m0, 8
-    psrlw     m2, 8
-    por       m2, m0
-    mova      m3, m2
-    mova      m4, m2
-    mova      m5, m2                  ; l7 l6 l5 l4 l3 l2 l1 l0
-    psrlq     m3, 16
-    psrlq     m2, 8
-    por       m2, m1                  ; l7 l7 l6 l5 l4 l3 l2 l1
-    punpckhbw m1, m1
-    por       m3, m1                  ; l7 l7 l7 l6 l5 l4 l3 l2
-    pavgb     m4, m2
-%endif ; !HIGH_BIT_DEPTH
-    PRED8x8_LOWPASS m2, m3, m5, m2, m6
-    punpckh%2 m0, m4, m2              ; p8 p7 p6 p5
-    punpckl%2 m4, m2                  ; p4 p3 p2 p1
-    PALIGNR   m5, m0, m4, 2*SIZEOF_PIXEL, m3
-    pshuf%1   m1, m0, q3321
-    PALIGNR   m6, m0, m4, 4*SIZEOF_PIXEL, m3
-    pshuf%1   m2, m0, q3332
-    PALIGNR   m7, m0, m4, 6*SIZEOF_PIXEL, m3
-    pshuf%1   m3, m0, q3333
-    mova      [r0-4*FDEC_STRIDEB], m4
-    mova      [r0-3*FDEC_STRIDEB], m5
-    mova      [r0-2*FDEC_STRIDEB], m6
-    mova      [r0-1*FDEC_STRIDEB], m7
-    mova      [r0+0*FDEC_STRIDEB], m0
-    mova      [r0+1*FDEC_STRIDEB], m1
-    mova      [r0+2*FDEC_STRIDEB], m2
-    mova      [r0+3*FDEC_STRIDEB], m3
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_8x8_HU d, wd
-INIT_XMM ssse3
-PREDICT_8x8_HU d, wd
-INIT_XMM avx
-PREDICT_8x8_HU d, wd
-%elif ARCH_X86_64 == 0
-INIT_MMX mmx2
-PREDICT_8x8_HU w, bw
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_vr( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_VR 1
-cglobal predict_8x8_vr, 2,3
-    mova        m2, [r1+16*SIZEOF_PIXEL]
-%ifidn cpuname, ssse3
-    mova        m0, [r1+8*SIZEOF_PIXEL]
-    palignr     m3, m2, m0, 7*SIZEOF_PIXEL
-    palignr     m1, m2, m0, 6*SIZEOF_PIXEL
-%else
-    movu        m3, [r1+15*SIZEOF_PIXEL]
-    movu        m1, [r1+14*SIZEOF_PIXEL]
-%endif
-    pavg%1      m4, m3, m2
-    add         r0, FDEC_STRIDEB*4
-    PRED8x8_LOWPASS m3, m1, m2, m3, m5
-    mova        [r0-4*FDEC_STRIDEB], m4
-    mova        [r0-3*FDEC_STRIDEB], m3
-    mova        m1, [r1+8*SIZEOF_PIXEL]
-    PSLLPIX     m0, m1, 1
-    PSLLPIX     m2, m1, 2
-    PRED8x8_LOWPASS m0, m1, m2, m0, m6
-
-%assign Y -2
-%rep 5
-    PALIGNR     m4, m0, 7*SIZEOF_PIXEL, m5
-    mova        [r0+Y*FDEC_STRIDEB], m4
-    PSLLPIX     m0, m0, 1
-    SWAP 3, 4
-%assign Y (Y+1)
-%endrep
-    PALIGNR     m4, m0, 7*SIZEOF_PIXEL, m0
-    mova        [r0+Y*FDEC_STRIDEB], m4
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_8x8_VR w
-INIT_XMM ssse3
-PREDICT_8x8_VR w
-INIT_XMM avx
-PREDICT_8x8_VR w
-%elif ARCH_X86_64 == 0
-INIT_MMX mmx2
-PREDICT_8x8_VR b
-%endif
-
-%macro LOAD_PLANE_ARGS 0
-%if cpuflag(avx2) && ARCH_X86_64 == 0
-    vpbroadcastw m0, r1m
-    vpbroadcastw m2, r2m
-    vpbroadcastw m4, r3m
-%elif mmsize == 8 ; MMX is only used on x86_32
-    SPLATW       m0, r1m
-    SPLATW       m2, r2m
-    SPLATW       m4, r3m
-%else
-    movd        xm0, r1m
-    movd        xm2, r2m
-    movd        xm4, r3m
-    SPLATW       m0, xm0
-    SPLATW       m2, xm2
-    SPLATW       m4, xm4
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
-%macro PREDICT_CHROMA_P_MMX 1
-cglobal predict_8x%1c_p_core, 1,2
-    LOAD_PLANE_ARGS
-    movq        m1, m2
-    pmullw      m2, [pw_0to15]
-    psllw       m1, 2
-    paddsw      m0, m2        ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
-    paddsw      m1, m0        ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
-    mov         r1d, %1
-ALIGN 4
-.loop:
-    movq        m5, m0
-    movq        m6, m1
-    psraw       m5, 5
-    psraw       m6, 5
-    packuswb    m5, m6
-    movq        [r0], m5
-
-    paddsw      m0, m4
-    paddsw      m1, m4
-    add         r0, FDEC_STRIDE
-    dec         r1d
-    jg .loop
-    RET
-%endmacro ; PREDICT_CHROMA_P_MMX
-
-INIT_MMX mmx2
-PREDICT_CHROMA_P_MMX 8
-PREDICT_CHROMA_P_MMX 16
-%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
-
-%macro PREDICT_CHROMA_P 1
-%if HIGH_BIT_DEPTH
-cglobal predict_8x%1c_p_core, 1,2,7
-    LOAD_PLANE_ARGS
-    mova        m3, [pw_pixel_max]
-    pxor        m1, m1
-    pmullw      m2, [pw_43210123] ; b
-%if %1 == 16
-    pmullw      m5, m4, [pw_m7]   ; c
-%else
-    pmullw      m5, m4, [pw_m3]
-%endif
-    paddw       m5, [pw_16]
-%if mmsize == 32
-    mova       xm6, xm4
-    paddw       m4, m4
-    paddw       m5, m6
-%endif
-    mov        r1d, %1/(mmsize/16)
-.loop:
-    paddsw      m6, m2, m5
-    paddsw      m6, m0
-    psraw       m6, 5
-    CLIPW       m6, m1, m3
-    paddw       m5, m4
-%if mmsize == 32
-    vextracti128 [r0], m6, 1
-    mova [r0+FDEC_STRIDEB], xm6
-    add         r0, 2*FDEC_STRIDEB
-%else
-    mova      [r0], m6
-    add         r0, FDEC_STRIDEB
-%endif
-    dec        r1d
-    jg .loop
-    RET
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_8x%1c_p_core, 1,2
-    LOAD_PLANE_ARGS
-%if mmsize == 32
-    vbroadcasti128 m1, [pw_0to15]   ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
-    pmullw      m2, m1
-    mova       xm1, xm4             ; zero upper half
-    paddsw      m4, m4
-    paddsw      m0, m1
-%else
-    pmullw      m2, [pw_0to15]
-%endif
-    paddsw      m0, m2              ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
-    paddsw      m1, m0, m4
-    paddsw      m4, m4
-    mov        r1d, %1/(mmsize/8)
-.loop:
-    psraw       m2, m0, 5
-    psraw       m3, m1, 5
-    paddsw      m0, m4
-    paddsw      m1, m4
-    packuswb    m2, m3
-%if mmsize == 32
-    movq        [r0+FDEC_STRIDE*1], xm2
-    movhps      [r0+FDEC_STRIDE*3], xm2
-    vextracti128 xm2, m2, 1
-    movq        [r0+FDEC_STRIDE*0], xm2
-    movhps      [r0+FDEC_STRIDE*2], xm2
-%else
-    movq        [r0+FDEC_STRIDE*0], xm2
-    movhps      [r0+FDEC_STRIDE*1], xm2
-%endif
-    add         r0, FDEC_STRIDE*mmsize/8
-    dec        r1d
-    jg .loop
-    RET
-%endif ; HIGH_BIT_DEPTH
-%endmacro ; PREDICT_CHROMA_P
-
-INIT_XMM sse2
-PREDICT_CHROMA_P 8
-PREDICT_CHROMA_P 16
-INIT_XMM avx
-PREDICT_CHROMA_P 8
-PREDICT_CHROMA_P 16
-INIT_YMM avx2
-PREDICT_CHROMA_P 8
-PREDICT_CHROMA_P 16
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
-INIT_MMX mmx2
-cglobal predict_16x16_p_core, 1,2
-    LOAD_PLANE_ARGS
-    movq        mm5, mm2
-    movq        mm1, mm2
-    pmullw      mm5, [pw_0to15]
-    psllw       mm2, 3
-    psllw       mm1, 2
-    movq        mm3, mm2
-    paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
-    paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
-    paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
-    paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
-
-    mov         r1d, 16
-ALIGN 4
-.loop:
-    movq        mm5, mm0
-    movq        mm6, mm1
-    psraw       mm5, 5
-    psraw       mm6, 5
-    packuswb    mm5, mm6
-    movq        [r0], mm5
-
-    movq        mm5, mm2
-    movq        mm6, mm3
-    psraw       mm5, 5
-    psraw       mm6, 5
-    packuswb    mm5, mm6
-    movq        [r0+8], mm5
-
-    paddsw      mm0, mm4
-    paddsw      mm1, mm4
-    paddsw      mm2, mm4
-    paddsw      mm3, mm4
-    add         r0, FDEC_STRIDE
-    dec         r1d
-    jg          .loop
-    RET
-%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
-
-%macro PREDICT_16x16_P 0
-cglobal predict_16x16_p_core, 1,2,8
-    movd     m0, r1m
-    movd     m1, r2m
-    movd     m2, r3m
-    SPLATW   m0, m0, 0
-    SPLATW   m1, m1, 0
-    SPLATW   m2, m2, 0
-    pmullw   m3, m1, [pw_0to15]
-    psllw    m1, 3
-%if HIGH_BIT_DEPTH
-    pxor     m6, m6
-    mov     r1d, 16
-.loop:
-    mova     m4, m0
-    mova     m5, m0
-    mova     m7, m3
-    paddsw   m7, m6
-    paddsw   m4, m7
-    paddsw   m7, m1
-    paddsw   m5, m7
-    psraw    m4, 5
-    psraw    m5, 5
-    CLIPW    m4, [pb_0], [pw_pixel_max]
-    CLIPW    m5, [pb_0], [pw_pixel_max]
-    mova   [r0], m4
-    mova [r0+16], m5
-    add      r0, FDEC_STRIDEB
-    paddw    m6, m2
-%else ; !HIGH_BIT_DEPTH
-    paddsw   m0, m3  ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
-    paddsw   m1, m0  ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
-    paddsw   m7, m2, m2
-    mov     r1d, 8
-ALIGN 4
-.loop:
-    psraw    m3, m0, 5
-    psraw    m4, m1, 5
-    paddsw   m5, m0, m2
-    paddsw   m6, m1, m2
-    psraw    m5, 5
-    psraw    m6, 5
-    packuswb m3, m4
-    packuswb m5, m6
-    mova [r0+FDEC_STRIDE*0], m3
-    mova [r0+FDEC_STRIDE*1], m5
-    paddsw   m0, m7
-    paddsw   m1, m7
-    add      r0, FDEC_STRIDE*2
-%endif ; !HIGH_BIT_DEPTH
-    dec     r1d
-    jg .loop
-    RET
-%endmacro ; PREDICT_16x16_P
-
-INIT_XMM sse2
-PREDICT_16x16_P
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM avx
-PREDICT_16x16_P
-%endif
-
-INIT_YMM avx2
-cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
-    LOAD_PLANE_ARGS
-%if HIGH_BIT_DEPTH
-    pmullw       m2, [pw_0to15]
-    pxor         m5, m5
-    pxor         m6, m6
-    mova         m7, [pw_pixel_max]
-    mov         r1d, 8
-.loop:
-    paddsw       m1, m2, m5
-    paddw        m5, m4
-    paddsw       m1, m0
-    paddsw       m3, m2, m5
-    psraw        m1, 5
-    paddsw       m3, m0
-    psraw        m3, 5
-    CLIPW        m1, m6, m7
-    mova [r0+0*FDEC_STRIDEB], m1
-    CLIPW        m3, m6, m7
-    mova [r0+1*FDEC_STRIDEB], m3
-    paddw        m5, m4
-    add          r0, 2*FDEC_STRIDEB
-%else ; !HIGH_BIT_DEPTH
-    vbroadcasti128 m1, [pw_0to15]
-    mova        xm3, xm4    ; zero high bits
-    pmullw       m1, m2
-    psllw        m2, 3
-    paddsw       m0, m3
-    paddsw       m0, m1     ; X+1*C X+0*C
-    paddsw       m1, m0, m2 ; Y+1*C Y+0*C
-    paddsw       m4, m4
-    mov         r1d, 4
-.loop:
-    psraw        m2, m0, 5
-    psraw        m3, m1, 5
-    paddsw       m0, m4
-    paddsw       m1, m4
-    packuswb     m2, m3     ; X+1*C Y+1*C X+0*C Y+0*C
-    vextracti128 [r0+0*FDEC_STRIDE], m2, 1
-    mova         [r0+1*FDEC_STRIDE], xm2
-    psraw        m2, m0, 5
-    psraw        m3, m1, 5
-    paddsw       m0, m4
-    paddsw       m1, m4
-    packuswb     m2, m3     ; X+3*C Y+3*C X+2*C Y+2*C
-    vextracti128 [r0+2*FDEC_STRIDE], m2, 1
-    mova         [r0+3*FDEC_STRIDE], xm2
-    add          r0, FDEC_STRIDE*4
-%endif ; !HIGH_BIT_DEPTH
-    dec         r1d
-    jg .loop
-    RET
-
-%if HIGH_BIT_DEPTH == 0
-%macro PREDICT_8x8 0
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl, 2,2
-    mova        m0, [r1+16]
-%ifidn cpuname, ssse3
-    movd        m2, [r1+32]
-    palignr     m2, m0, 1
-%else
-    movu        m2, [r1+17]
-%endif
-    pslldq      m1, m0, 1
-    add        r0, FDEC_STRIDE*4
-    PRED8x8_LOWPASS m0, m1, m2, m0, m3
-
-%assign Y -4
-%rep 8
-    psrldq      m0, 1
-    movq        [r0+Y*FDEC_STRIDE], m0
-%assign Y (Y+1)
-%endrep
-    RET
-
-%ifnidn cpuname, ssse3
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr, 2,2
-    movu        m0, [r1+8]
-    movu        m1, [r1+7]
-    psrldq      m2, m0, 1
-    add         r0, FDEC_STRIDE*4
-    PRED8x8_LOWPASS m0, m1, m2, m0, m3
-
-    psrldq      m1, m0, 1
-%assign Y 3
-%rep 3
-    movq        [r0+Y*FDEC_STRIDE], m0
-    movq        [r0+(Y-1)*FDEC_STRIDE], m1
-    psrldq      m0, 2
-    psrldq      m1, 2
-%assign Y (Y-2)
-%endrep
-    movq        [r0-3*FDEC_STRIDE], m0
-    movq        [r0-4*FDEC_STRIDE], m1
-    RET
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl, 2,2
-    mova        m0, [r1+16]
-    pslldq      m1, m0, 1
-    psrldq      m2, m0, 1
-    pavgb       m3, m0, m2
-    add         r0, FDEC_STRIDE*4
-    PRED8x8_LOWPASS m0, m1, m2, m0, m5
-; m0: (t0 + 2*t1 + t2 + 2) >> 2
-; m3: (t0 + t1 + 1) >> 1
-
-%assign Y -4
-%rep 3
-    psrldq      m0, 1
-    movq        [r0+ Y   *FDEC_STRIDE], m3
-    movq        [r0+(Y+1)*FDEC_STRIDE], m0
-    psrldq      m3, 1
-%assign Y (Y+2)
-%endrep
-    psrldq      m0, 1
-    movq        [r0+ Y   *FDEC_STRIDE], m3
-    movq        [r0+(Y+1)*FDEC_STRIDE], m0
-    RET
-%endif ; !ssse3
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_vr, 2,2
-    movu        m2, [r1+8]
-    add         r0, 4*FDEC_STRIDE
-    pslldq      m1, m2, 2
-    pslldq      m0, m2, 1
-    pavgb       m3, m2, m0
-    PRED8x8_LOWPASS m0, m2, m1, m0, m4
-    movhps      [r0-4*FDEC_STRIDE], m3
-    movhps      [r0-3*FDEC_STRIDE], m0
-%if cpuflag(ssse3)
-    punpckhqdq  m3, m3
-    pshufb      m0, [shuf_vr]
-    palignr     m3, m0, 13
-%else
-    mova        m2, m0
-    mova        m1, [pw_00ff]
-    pand        m1, m0
-    psrlw       m0, 8
-    packuswb    m1, m0
-    pslldq      m1, 4
-    movhlps     m3, m1
-    shufps      m1, m2, q3210
-    psrldq      m3, 5
-    psrldq      m1, 5
-    SWAP         0, 1
-%endif
-    movq        [r0+3*FDEC_STRIDE], m0
-    movq        [r0+2*FDEC_STRIDE], m3
-    psrldq      m0, 1
-    psrldq      m3, 1
-    movq        [r0+1*FDEC_STRIDE], m0
-    movq        [r0+0*FDEC_STRIDE], m3
-    psrldq      m0, 1
-    psrldq      m3, 1
-    movq        [r0-1*FDEC_STRIDE], m0
-    movq        [r0-2*FDEC_STRIDE], m3
-    RET
-%endmacro ; PREDICT_8x8
-
-INIT_XMM sse2
-PREDICT_8x8
-INIT_XMM ssse3
-PREDICT_8x8
-INIT_XMM avx
-PREDICT_8x8
-
-%endif ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_vl( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_VL_10 1
-cglobal predict_8x8_vl, 2,2,8
-    mova         m0, [r1+16*SIZEOF_PIXEL]
-    mova         m1, [r1+24*SIZEOF_PIXEL]
-    PALIGNR      m2, m1, m0, SIZEOF_PIXEL*1, m4
-    PSRLPIX      m4, m1, 1
-    pavg%1       m6, m0, m2
-    pavg%1       m7, m1, m4
-    add          r0, FDEC_STRIDEB*4
-    mova         [r0-4*FDEC_STRIDEB], m6
-    PALIGNR      m3, m7, m6, SIZEOF_PIXEL*1, m5
-    mova         [r0-2*FDEC_STRIDEB], m3
-    PALIGNR      m3, m7, m6, SIZEOF_PIXEL*2, m5
-    mova         [r0+0*FDEC_STRIDEB], m3
-    PALIGNR      m7, m7, m6, SIZEOF_PIXEL*3, m5
-    mova         [r0+2*FDEC_STRIDEB], m7
-    PALIGNR      m3, m1, m0, SIZEOF_PIXEL*7, m6
-    PSLLPIX      m5, m0, 1
-    PRED8x8_LOWPASS m0, m5, m2, m0, m7
-    PRED8x8_LOWPASS m1, m3, m4, m1, m7
-    PALIGNR      m4, m1, m0, SIZEOF_PIXEL*1, m2
-    mova         [r0-3*FDEC_STRIDEB], m4
-    PALIGNR      m4, m1, m0, SIZEOF_PIXEL*2, m2
-    mova         [r0-1*FDEC_STRIDEB], m4
-    PALIGNR      m4, m1, m0, SIZEOF_PIXEL*3, m2
-    mova         [r0+1*FDEC_STRIDEB], m4
-    PALIGNR      m1, m1, m0, SIZEOF_PIXEL*4, m2
-    mova         [r0+3*FDEC_STRIDEB], m1
-    RET
-%endmacro
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_8x8_VL_10 w
-INIT_XMM ssse3
-PREDICT_8x8_VL_10 w
-INIT_XMM avx
-PREDICT_8x8_VL_10 w
-%else
-INIT_MMX mmx2
-PREDICT_8x8_VL_10 b
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_hd( pixel *src, pixel *edge )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HD 2
-cglobal predict_8x8_hd, 2,2
-    add       r0, 4*FDEC_STRIDEB
-    mova      m0, [r1+ 8*SIZEOF_PIXEL]     ; lt l0 l1 l2 l3 l4 l5 l6
-    movu      m1, [r1+ 7*SIZEOF_PIXEL]     ; l0 l1 l2 l3 l4 l5 l6 l7
-%ifidn cpuname, ssse3
-    mova      m2, [r1+16*SIZEOF_PIXEL]     ; t7 t6 t5 t4 t3 t2 t1 t0
-    mova      m4, m2                       ; t7 t6 t5 t4 t3 t2 t1 t0
-    palignr   m2, m0, 7*SIZEOF_PIXEL       ; t6 t5 t4 t3 t2 t1 t0 lt
-    palignr   m4, m0, 1*SIZEOF_PIXEL       ; t0 lt l0 l1 l2 l3 l4 l5
-%else
-    movu      m2, [r1+15*SIZEOF_PIXEL]
-    movu      m4, [r1+ 9*SIZEOF_PIXEL]
-%endif ; cpuflag
-    pavg%1    m3, m0, m1
-    PRED8x8_LOWPASS m0, m4, m1, m0, m5
-    PSRLPIX   m4, m2, 2                    ; .. .. t6 t5 t4 t3 t2 t1
-    PSRLPIX   m1, m2, 1                    ; .. t6 t5 t4 t3 t2 t1 t0
-    PRED8x8_LOWPASS m1, m4, m2, m1, m5
-                                           ; .. p11 p10 p9
-    punpckh%2 m2, m3, m0                   ; p8 p7 p6 p5
-    punpckl%2 m3, m0                       ; p4 p3 p2 p1
-    mova      [r0+3*FDEC_STRIDEB], m3
-    PALIGNR   m0, m2, m3, 2*SIZEOF_PIXEL, m5
-    mova      [r0+2*FDEC_STRIDEB], m0
-    PALIGNR   m0, m2, m3, 4*SIZEOF_PIXEL, m5
-    mova      [r0+1*FDEC_STRIDEB], m0
-    PALIGNR   m0, m2, m3, 6*SIZEOF_PIXEL, m3
-    mova      [r0+0*FDEC_STRIDEB], m0
-    mova      [r0-1*FDEC_STRIDEB], m2
-    PALIGNR   m0, m1, m2, 2*SIZEOF_PIXEL, m5
-    mova      [r0-2*FDEC_STRIDEB], m0
-    PALIGNR   m0, m1, m2, 4*SIZEOF_PIXEL, m5
-    mova      [r0-3*FDEC_STRIDEB], m0
-    PALIGNR   m1, m1, m2, 6*SIZEOF_PIXEL, m2
-    mova      [r0-4*FDEC_STRIDEB], m1
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_8x8_HD w, wd
-INIT_XMM ssse3
-PREDICT_8x8_HD w, wd
-INIT_XMM avx
-PREDICT_8x8_HD w, wd
-%else
-INIT_MMX mmx2
-PREDICT_8x8_HD b, bw
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-%macro PREDICT_8x8_HD 0
-cglobal predict_8x8_hd, 2,2
-    add     r0, 4*FDEC_STRIDE
-    movu    m1, [r1+7]
-    movu    m3, [r1+8]
-    movu    m2, [r1+9]
-    pavgb   m4, m1, m3
-    PRED8x8_LOWPASS m0, m1, m2, m3, m5
-    punpcklbw m4, m0
-    movhlps m0, m4
-
-%assign Y 3
-%rep 3
-    movq   [r0+(Y)*FDEC_STRIDE], m4
-    movq   [r0+(Y-4)*FDEC_STRIDE], m0
-    psrldq m4, 2
-    psrldq m0, 2
-%assign Y (Y-1)
-%endrep
-    movq   [r0+(Y)*FDEC_STRIDE], m4
-    movq   [r0+(Y-4)*FDEC_STRIDE], m0
-    RET
-%endmacro
-
-INIT_XMM sse2
-PREDICT_8x8_HD
-INIT_XMM avx
-PREDICT_8x8_HD
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-;-----------------------------------------------------------------------------
-; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal predict_8x8_hu_sse2, 2,2
-    add        r0, 4*FDEC_STRIDE
-    movq      mm1, [r1+7]           ; l0 l1 l2 l3 l4 l5 l6 l7
-    pshufw    mm0, mm1, q0123       ; l6 l7 l4 l5 l2 l3 l0 l1
-    movq      mm2, mm0
-    psllw     mm0, 8
-    psrlw     mm2, 8
-    por       mm2, mm0              ; l7 l6 l5 l4 l3 l2 l1 l0
-    psllq     mm1, 56               ; l7 .. .. .. .. .. .. ..
-    movq      mm3, mm2
-    movq      mm4, mm2
-    movq      mm5, mm2
-    psrlq     mm2, 8
-    psrlq     mm3, 16
-    por       mm2, mm1              ; l7 l7 l6 l5 l4 l3 l2 l1
-    punpckhbw mm1, mm1
-    por       mm3, mm1              ; l7 l7 l7 l6 l5 l4 l3 l2
-    pavgb     mm4, mm2
-    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
-
-    movq2dq   xmm0, mm4
-    movq2dq   xmm1, mm1
-    punpcklbw xmm0, xmm1
-    punpckhbw  mm4, mm1
-%assign Y -4
-%rep 3
-    movq     [r0+Y*FDEC_STRIDE], xmm0
-    psrldq    xmm0, 2
-%assign Y (Y+1)
-%endrep
-    pshufw     mm5, mm4, q3321
-    pshufw     mm6, mm4, q3332
-    pshufw     mm7, mm4, q3333
-    movq     [r0+Y*FDEC_STRIDE], xmm0
-    movq     [r0+0*FDEC_STRIDE], mm4
-    movq     [r0+1*FDEC_STRIDE], mm5
-    movq     [r0+2*FDEC_STRIDE], mm6
-    movq     [r0+3*FDEC_STRIDE], mm7
-    RET
-
-INIT_XMM
-cglobal predict_8x8_hu_ssse3, 2,2
-    add       r0, 4*FDEC_STRIDE
-    movq      m3, [r1+7]
-    pshufb    m3, [shuf_hu]
-    psrldq    m1, m3, 1
-    psrldq    m2, m3, 2
-    pavgb     m0, m1, m3
-    PRED8x8_LOWPASS m1, m3, m2, m1, m4
-    punpcklbw m0, m1
-%assign Y -4
-%rep 3
-    movq   [r0+ Y   *FDEC_STRIDE], m0
-    movhps [r0+(Y+4)*FDEC_STRIDE], m0
-    psrldq    m0, 2
-    pshufhw   m0, m0, q2210
-%assign Y (Y+1)
-%endrep
-    movq   [r0+ Y   *FDEC_STRIDE], m0
-    movhps [r0+(Y+4)*FDEC_STRIDE], m0
-    RET
-%endif ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_v( uint8_t *src )
-;-----------------------------------------------------------------------------
-
-%macro PREDICT_8x8C_V 0
-cglobal predict_8x8c_v, 1,1
-    mova        m0, [r0 - FDEC_STRIDEB]
-    STORE8      m0
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse
-PREDICT_8x8C_V
-%else
-INIT_MMX mmx
-PREDICT_8x8C_V
-%endif
-
-%if HIGH_BIT_DEPTH
-
-INIT_MMX
-cglobal predict_8x8c_v_mmx, 1,1
-    mova        m0, [r0 - FDEC_STRIDEB]
-    mova        m1, [r0 - FDEC_STRIDEB + 8]
-%assign Y 0
-%rep 8
-    mova        [r0 + (Y&1)*FDEC_STRIDEB], m0
-    mova        [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
-%if (Y&1) && (Y!=7)
-    add         r0, FDEC_STRIDEB*2
-%endif
-%assign Y Y+1
-%endrep
-    RET
-
-%endif
-
-%macro PREDICT_8x16C_V 0
-cglobal predict_8x16c_v, 1,1
-    mova        m0, [r0 - FDEC_STRIDEB]
-    STORE16     m0
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse
-PREDICT_8x16C_V
-%else
-INIT_MMX mmx
-PREDICT_8x16C_V
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_h( uint8_t *src )
-;-----------------------------------------------------------------------------
-%macro PREDICT_C_H 0
-cglobal predict_8x8c_h, 1,1
-%if cpuflag(ssse3) && notcpuflag(avx2)
-    mova  m2, [pb_3]
-%endif
-    PRED_H_4ROWS 8, 1
-    PRED_H_4ROWS 8, 0
-    RET
-
-cglobal predict_8x16c_h, 1,2
-%if cpuflag(ssse3) && notcpuflag(avx2)
-    mova  m2, [pb_3]
-%endif
-    mov  r1d, 4
-.loop:
-    PRED_H_4ROWS 8, 1
-    dec  r1d
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_C_H
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_C_H
-INIT_XMM avx2
-PREDICT_C_H
-%else
-INIT_MMX ssse3
-PREDICT_C_H
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_dc( pixel *src )
-;-----------------------------------------------------------------------------
-%macro LOAD_LEFT 1
-    movzx    r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
-    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
-    add      r1d, r2d
-    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
-    add      r1d, r2d
-    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
-    add      r1d, r2d
-%endmacro
-
-%macro PREDICT_8x8C_DC 0
-cglobal predict_8x8c_dc, 1,3
-    pxor      m7, m7
-%if HIGH_BIT_DEPTH
-    movq      m0, [r0-FDEC_STRIDEB+0]
-    movq      m1, [r0-FDEC_STRIDEB+8]
-    HADDW     m0, m2
-    HADDW     m1, m2
-%else ; !HIGH_BIT_DEPTH
-    movd      m0, [r0-FDEC_STRIDEB+0]
-    movd      m1, [r0-FDEC_STRIDEB+4]
-    psadbw    m0, m7            ; s0
-    psadbw    m1, m7            ; s1
-%endif
-    add       r0, FDEC_STRIDEB*4
-
-    LOAD_LEFT 0                 ; s2
-    movd      m2, r1d
-    LOAD_LEFT 4                 ; s3
-    movd      m3, r1d
-
-    punpcklwd m0, m1
-    punpcklwd m2, m3
-    punpckldq m0, m2            ; s0, s1, s2, s3
-    pshufw    m3, m0, q3312     ; s2, s1, s3, s3
-    pshufw    m0, m0, q1310     ; s0, s1, s3, s1
-    paddw     m0, m3
-    psrlw     m0, 2
-    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
-%if HIGH_BIT_DEPTH
-%if cpuflag(sse2)
-    movq2dq   xmm0, m0
-    punpcklwd xmm0, xmm0
-    pshufd    xmm1, xmm0, q3322
-    punpckldq xmm0, xmm0
-%assign Y 0
-%rep 8
-%assign i (0 + (Y/4))
-    movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
-%assign Y Y+1
-%endrep
-%else ; !sse2
-    pshufw    m1, m0, q0000
-    pshufw    m2, m0, q1111
-    pshufw    m3, m0, q2222
-    pshufw    m4, m0, q3333
-%assign Y 0
-%rep 8
-%assign i (1 + (Y/4)*2)
-%assign j (2 + (Y/4)*2)
-    movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
-    movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
-%assign Y Y+1
-%endrep
-%endif
-%else ; !HIGH_BIT_DEPTH
-    packuswb  m0, m0
-    punpcklbw m0, m0
-    movq      m1, m0
-    punpcklbw m0, m0
-    punpckhbw m1, m1
-%assign Y 0
-%rep 8
-%assign i (0 + (Y/4))
-    movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
-%assign Y Y+1
-%endrep
-%endif
-    RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_8x8C_DC
-%if HIGH_BIT_DEPTH
-INIT_MMX sse2
-PREDICT_8x8C_DC
-%endif
-
-%if HIGH_BIT_DEPTH
-%macro STORE_4LINES 3
-%if cpuflag(sse2)
-    movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
-    movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
-    movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
-    movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
-%else
-    movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
-    movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
-    movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
-    movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
-    movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
-    movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
-    movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
-    movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
-%endif
-%endmacro
-%else
-%macro STORE_4LINES 2
-    movq [r0+FDEC_STRIDEB*(%2-4)], %1
-    movq [r0+FDEC_STRIDEB*(%2-3)], %1
-    movq [r0+FDEC_STRIDEB*(%2-2)], %1
-    movq [r0+FDEC_STRIDEB*(%2-1)], %1
-%endmacro
-%endif
-
-%macro PREDICT_8x16C_DC 0
-cglobal predict_8x16c_dc, 1,3
-    pxor      m7, m7
-%if HIGH_BIT_DEPTH
-    movq      m0, [r0-FDEC_STRIDEB+0]
-    movq      m1, [r0-FDEC_STRIDEB+8]
-    HADDW     m0, m2
-    HADDW     m1, m2
-%else
-    movd      m0, [r0-FDEC_STRIDEB+0]
-    movd      m1, [r0-FDEC_STRIDEB+4]
-    psadbw    m0, m7            ; s0
-    psadbw    m1, m7            ; s1
-%endif
-    punpcklwd m0, m1            ; s0, s1
-
-    add       r0, FDEC_STRIDEB*4
-    LOAD_LEFT 0                 ; s2
-    pinsrw    m0, r1d, 2
-    LOAD_LEFT 4                 ; s3
-    pinsrw    m0, r1d, 3        ; s0, s1, s2, s3
-    add       r0, FDEC_STRIDEB*8
-    LOAD_LEFT 0                 ; s4
-    pinsrw    m1, r1d, 2
-    LOAD_LEFT 4                 ; s5
-    pinsrw    m1, r1d, 3        ; s1, __, s4, s5
-    sub       r0, FDEC_STRIDEB*8
-
-    pshufw    m2, m0, q1310     ; s0, s1, s3, s1
-    pshufw    m0, m0, q3312     ; s2, s1, s3, s3
-    pshufw    m3, m1, q0302     ; s4, s1, s5, s1
-    pshufw    m1, m1, q3322     ; s4, s4, s5, s5
-    paddw     m0, m2
-    paddw     m1, m3
-    psrlw     m0, 2
-    psrlw     m1, 2
-    pavgw     m0, m7
-    pavgw     m1, m7
-%if HIGH_BIT_DEPTH
-%if cpuflag(sse2)
-    movq2dq xmm0, m0
-    movq2dq xmm1, m1
-    punpcklwd xmm0, xmm0
-    punpcklwd xmm1, xmm1
-    pshufd    xmm2, xmm0, q3322
-    pshufd    xmm3, xmm1, q3322
-    punpckldq xmm0, xmm0
-    punpckldq xmm1, xmm1
-    STORE_4LINES xmm0, xmm0, 0
-    STORE_4LINES xmm2, xmm2, 4
-    STORE_4LINES xmm1, xmm1, 8
-    STORE_4LINES xmm3, xmm3, 12
-%else
-    pshufw    m2, m0, q0000
-    pshufw    m3, m0, q1111
-    pshufw    m4, m0, q2222
-    pshufw    m5, m0, q3333
-    STORE_4LINES m2, m3, 0
-    STORE_4LINES m4, m5, 4
-    pshufw    m2, m1, q0000
-    pshufw    m3, m1, q1111
-    pshufw    m4, m1, q2222
-    pshufw    m5, m1, q3333
-    STORE_4LINES m2, m3, 8
-    STORE_4LINES m4, m5, 12
-%endif
-%else
-    packuswb  m0, m0            ; dc0, dc1, dc2, dc3
-    packuswb  m1, m1            ; dc4, dc5, dc6, dc7
-    punpcklbw m0, m0
-    punpcklbw m1, m1
-    pshufw    m2, m0, q1100
-    pshufw    m3, m0, q3322
-    pshufw    m4, m1, q1100
-    pshufw    m5, m1, q3322
-    STORE_4LINES m2, 0
-    STORE_4LINES m3, 4
-    add       r0, FDEC_STRIDEB*8
-    STORE_4LINES m4, 0
-    STORE_4LINES m5, 4
-%endif
-    RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_8x16C_DC
-%if HIGH_BIT_DEPTH
-INIT_MMX sse2
-PREDICT_8x16C_DC
-%endif
-
-%macro PREDICT_C_DC_TOP 1
-%if HIGH_BIT_DEPTH
-INIT_XMM
-cglobal predict_8x%1c_dc_top_sse2, 1,1
-    pxor        m2, m2
-    mova        m0, [r0 - FDEC_STRIDEB]
-    pshufd      m1, m0, q2301
-    paddw       m0, m1
-    pshuflw     m1, m0, q2301
-    pshufhw     m1, m1, q2301
-    paddw       m0, m1
-    psrlw       m0, 1
-    pavgw       m0, m2
-    STORE%1     m0
-    RET
-%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_8x%1c_dc_top_mmx2, 1,1
-    movq        mm0, [r0 - FDEC_STRIDE]
-    pxor        mm1, mm1
-    pxor        mm2, mm2
-    punpckhbw   mm1, mm0
-    punpcklbw   mm0, mm2
-    psadbw      mm1, mm2        ; s1
-    psadbw      mm0, mm2        ; s0
-    psrlw       mm1, 1
-    psrlw       mm0, 1
-    pavgw       mm1, mm2
-    pavgw       mm0, mm2
-    pshufw      mm1, mm1, 0
-    pshufw      mm0, mm0, 0     ; dc0 (w)
-    packuswb    mm0, mm1        ; dc0,dc1 (b)
-    STORE%1     mm0
-    RET
-%endif
-%endmacro
-
-PREDICT_C_DC_TOP 8
-PREDICT_C_DC_TOP 16
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_v( pixel *src )
-;-----------------------------------------------------------------------------
-
-%macro PREDICT_16x16_V 0
-cglobal predict_16x16_v, 1,2
-%assign %%i 0
-%rep 16*SIZEOF_PIXEL/mmsize
-    mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
-%assign %%i %%i+1
-%endrep
-%if 16*SIZEOF_PIXEL/mmsize == 4
-    STORE16 m0, m1, m2, m3
-%elif 16*SIZEOF_PIXEL/mmsize == 2
-    STORE16 m0, m1
-%else
-    STORE16 m0
-%endif
-    RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_16x16_V
-INIT_XMM sse
-PREDICT_16x16_V
-%if HIGH_BIT_DEPTH
-INIT_YMM avx
-PREDICT_16x16_V
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_h( pixel *src )
-;-----------------------------------------------------------------------------
-%macro PREDICT_16x16_H 0
-cglobal predict_16x16_h, 1,2
-%if cpuflag(ssse3) && notcpuflag(avx2)
-    mova  m2, [pb_3]
-%endif
-    mov  r1d, 4
-.loop:
-    PRED_H_4ROWS 16, 1
-    dec  r1d
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_16x16_H
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PREDICT_16x16_H
-INIT_YMM avx2
-PREDICT_16x16_H
-%else
-;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
-INIT_XMM ssse3
-PREDICT_16x16_H
-%endif
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_dc( pixel *src )
-;-----------------------------------------------------------------------------
-%if WIN64
-DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
-%else
-DECLARE_REG_TMP 3
-%endif
-
-INIT_XMM
-; Returns the sum of the left pixels in r1d+r2d
-cglobal predict_16x16_dc_left_internal, 0,4
-    movzx r1d, pixel [r0-SIZEOF_PIXEL]
-    movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
-%assign i 2*FDEC_STRIDEB
-%rep 7
-    movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
-    add   r1d, t0d
-    movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
-    add   r2d, t0d
-%assign i i+2*FDEC_STRIDEB
-%endrep
-    RET
-
-%macro PRED16x16_DC 2
-%if HIGH_BIT_DEPTH
-    mova      xm0, [r0 - FDEC_STRIDEB+ 0]
-    paddw     xm0, [r0 - FDEC_STRIDEB+16]
-    HADDW     xm0, xm2
-    paddw     xm0, %1
-    psrlw     xm0, %2
-    SPLATW     m0, xm0
-%if mmsize == 32
-    STORE16    m0
-%else
-    STORE16    m0, m0
-%endif
-%else ; !HIGH_BIT_DEPTH
-    pxor        m0, m0
-    psadbw      m0, [r0 - FDEC_STRIDE]
-    MOVHL       m1, m0
-    paddw       m0, m1
-    paddusw     m0, %1
-    psrlw       m0, %2              ; dc
-    SPLATW      m0, m0
-    packuswb    m0, m0              ; dc in bytes
-    STORE16     m0
-%endif
-%endmacro
-
-%macro PREDICT_16x16_DC 0
-cglobal predict_16x16_dc, 1,3
-    call predict_16x16_dc_left_internal
-    lea          r1d, [r1+r2+16]
-    movd         xm3, r1d
-    PRED16x16_DC xm3, 5
-    RET
-
-cglobal predict_16x16_dc_top, 1,2
-    PRED16x16_DC [pw_8], 4
-    RET
-
-cglobal predict_16x16_dc_left, 1,3
-    call predict_16x16_dc_left_internal
-    lea       r1d, [r1+r2+8]
-    shr       r1d, 4
-    movd      xm0, r1d
-    SPLATW     m0, xm0
-%if HIGH_BIT_DEPTH && mmsize == 16
-    STORE16    m0, m0
-%else
-%if HIGH_BIT_DEPTH == 0
-    packuswb   m0, m0
-%endif
-    STORE16    m0
-%endif
-    RET
-%endmacro
-
-INIT_XMM sse2
-PREDICT_16x16_DC
-%if HIGH_BIT_DEPTH
-INIT_YMM avx2
-PREDICT_16x16_DC
-%else
-INIT_XMM avx2
-PREDICT_16x16_DC
-%endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/predict-c.c b/android/src/main/libenc/jni/libx264/common/x86/predict-c.c
deleted file mode 100755
index 38ff39e..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/predict-c.c
+++ /dev/null
@@ -1,609 +0,0 @@
-/*****************************************************************************
- * predict-c.c: intra prediction
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "predict.h"
-#include "pixel.h"
-
-#define PREDICT_P_SUM(j,i)\
-    H += i * ( src[j+i - FDEC_STRIDE ]  - src[j-i - FDEC_STRIDE ] );\
-    V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
-
-#if HAVE_X86_INLINE_ASM
-#if HIGH_BIT_DEPTH
-ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
-ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
-#else // !HIGH_BIT_DEPTH
-ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
-ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
-#endif // HIGH_BIT_DEPTH
-#endif // HAVE_X86_INLINE_ASM
-
-#define PREDICT_16x16_P_CORE\
-    int H = 0;\
-    int V = 0;\
-    PREDICT_P_SUM(7,1)\
-    PREDICT_P_SUM(7,2)\
-    PREDICT_P_SUM(7,3)\
-    PREDICT_P_SUM(7,4)\
-    PREDICT_P_SUM(7,5)\
-    PREDICT_P_SUM(7,6)\
-    PREDICT_P_SUM(7,7)\
-    PREDICT_P_SUM(7,8)
-
-#define PREDICT_16x16_P_END(name)\
-    int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
-    int b = ( 5 * H + 32 ) >> 6;\
-    int c = ( 5 * V + 32 ) >> 6;\
-    int i00 = a - b * 7 - c * 7 + 16;\
-    /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case
-     * than to try to consider it in the asm. */\
-    if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\
-        x264_predict_16x16_p_c( src );\
-    else\
-        x264_predict_16x16_p_core_##name( src, i00, b, c );
-
-#define PREDICT_16x16_P(name, name2)\
-static void x264_predict_16x16_p_##name( pixel *src )\
-{\
-    PREDICT_16x16_P_CORE\
-    PREDICT_16x16_P_END(name2)\
-}
-
-#if HAVE_X86_INLINE_ASM
-#if HIGH_BIT_DEPTH
-#define PREDICT_16x16_P_ASM\
-    asm (\
-        "movdqu           %1, %%xmm1 \n"\
-        "movdqa           %2, %%xmm0 \n"\
-        "pmaddwd          %3, %%xmm0 \n"\
-        "pmaddwd          %4, %%xmm1 \n"\
-        "paddd        %%xmm1, %%xmm0 \n"\
-        "movhlps      %%xmm0, %%xmm1 \n"\
-        "paddd        %%xmm1, %%xmm0 \n"\
-        "pshuflw $14, %%xmm0, %%xmm1 \n"\
-        "paddd        %%xmm1, %%xmm0 \n"\
-        "movd         %%xmm0, %0     \n"\
-        :"=r"(H)\
-        :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\
-         "m"(*pw_12345678), "m"(*pw_m87654321)\
-    );
-#else // !HIGH_BIT_DEPTH
-#define PREDICT_16x16_P_ASM\
-    asm (\
-        "movq           %1, %%mm1 \n"\
-        "movq           %2, %%mm0 \n"\
-        "palignr $7,    %3, %%mm1 \n"\
-        "pmaddubsw      %4, %%mm0 \n"\
-        "pmaddubsw      %5, %%mm1 \n"\
-        "paddw       %%mm1, %%mm0 \n"\
-        "pshufw $14, %%mm0, %%mm1 \n"\
-        "paddw       %%mm1, %%mm0 \n"\
-        "pshufw  $1, %%mm0, %%mm1 \n"\
-        "paddw       %%mm1, %%mm0 \n"\
-        "movd        %%mm0, %0    \n"\
-        "movswl        %w0, %0    \n"\
-        :"=r"(H)\
-        :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\
-         "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\
-    );
-#endif // HIGH_BIT_DEPTH
-
-#define PREDICT_16x16_P_CORE_INLINE\
-    int H, V;\
-    PREDICT_16x16_P_ASM\
-    V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\
-      + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\
-      + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\
-      + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\
-      + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\
-      + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\
-      + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\
-      + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
-
-#define PREDICT_16x16_P_INLINE(name, name2)\
-static void x264_predict_16x16_p_##name( pixel *src )\
-{\
-    PREDICT_16x16_P_CORE_INLINE\
-    PREDICT_16x16_P_END(name2)\
-}
-#else // !HAVE_X86_INLINE_ASM
-#define PREDICT_16x16_P_INLINE(name, name2) PREDICT_16x16_P(name, name2)
-#endif // HAVE_X86_INLINE_ASM
-
-#if HIGH_BIT_DEPTH
-PREDICT_16x16_P_INLINE( sse2, sse2 )
-#else // !HIGH_BIT_DEPTH
-#if !ARCH_X86_64
-PREDICT_16x16_P( mmx2, mmx2 )
-#endif // !ARCH_X86_64
-PREDICT_16x16_P( sse2, sse2 )
-#if HAVE_X86_INLINE_ASM
-PREDICT_16x16_P_INLINE( ssse3, sse2 )
-#endif // HAVE_X86_INLINE_ASM
-PREDICT_16x16_P_INLINE( avx, avx )
-#endif // HIGH_BIT_DEPTH
-PREDICT_16x16_P_INLINE( avx2, avx2 )
-
-#define PREDICT_8x16C_P_CORE\
-    int H = 0, V = 0;\
-    for( int i = 0; i < 4; i++ )\
-        H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\
-    for( int i = 0; i < 8; i++ )\
-        V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
-
-#if HIGH_BIT_DEPTH
-#define PREDICT_8x16C_P_END(name)\
-    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
-    int b = ( 17 * H + 16 ) >> 5;\
-    int c = ( 5 * V + 32 ) >> 6;\
-    x264_predict_8x16c_p_core_##name( src, a, b, c );
-#else // !HIGH_BIT_DEPTH
-#define PREDICT_8x16C_P_END(name)\
-    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
-    int b = ( 17 * H + 16 ) >> 5;\
-    int c = ( 5 * V + 32 ) >> 6;\
-    int i00 = a -3*b -7*c + 16;\
-    x264_predict_8x16c_p_core_##name( src, i00, b, c );
-#endif // HIGH_BIT_DEPTH
-
-#define PREDICT_8x16C_P(name)\
-static void x264_predict_8x16c_p_##name( pixel *src )\
-{\
-    PREDICT_8x16C_P_CORE\
-    PREDICT_8x16C_P_END(name)\
-}
-
-#if !ARCH_X86_64 && !HIGH_BIT_DEPTH
-PREDICT_8x16C_P( mmx2 )
-#endif // !ARCH_X86_64 && !HIGH_BIT_DEPTH
-PREDICT_8x16C_P( sse2 )
-PREDICT_8x16C_P( avx )
-PREDICT_8x16C_P( avx2 )
-
-#define PREDICT_8x8C_P_CORE\
-    int H = 0;\
-    int V = 0;\
-    PREDICT_P_SUM(3,1)\
-    PREDICT_P_SUM(3,2)\
-    PREDICT_P_SUM(3,3)\
-    PREDICT_P_SUM(3,4)
-
-#if HIGH_BIT_DEPTH
-#define PREDICT_8x8C_P_END(name)\
-    int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
-    int b = ( 17 * H + 16 ) >> 5;\
-    int c = ( 17 * V + 16 ) >> 5;\
-    x264_predict_8x8c_p_core_##name( src, a, b, c );
-#else // !HIGH_BIT_DEPTH
-#define PREDICT_8x8C_P_END(name)\
-    int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
-    int b = ( 17 * H + 16 ) >> 5;\
-    int c = ( 17 * V + 16 ) >> 5;\
-    int i00 = a -3*b -3*c + 16;\
-    x264_predict_8x8c_p_core_##name( src, i00, b, c );
-#endif // HIGH_BIT_DEPTH
-
-#define PREDICT_8x8C_P(name, name2)\
-static void x264_predict_8x8c_p_##name( pixel *src )\
-{\
-    PREDICT_8x8C_P_CORE\
-    PREDICT_8x8C_P_END(name2)\
-}
-
-#if HAVE_X86_INLINE_ASM
-#if HIGH_BIT_DEPTH
-#define PREDICT_8x8C_P_ASM\
-    asm (\
-        "movdqa           %1, %%xmm0 \n"\
-        "pmaddwd          %2, %%xmm0 \n"\
-        "movhlps      %%xmm0, %%xmm1 \n"\
-        "paddd        %%xmm1, %%xmm0 \n"\
-        "pshuflw $14, %%xmm0, %%xmm1 \n"\
-        "paddd        %%xmm1, %%xmm0 \n"\
-        "movd         %%xmm0, %0     \n"\
-        :"=r"(H)\
-        :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)\
-    );
-#else // !HIGH_BIT_DEPTH
-#define PREDICT_8x8C_P_ASM\
-    asm (\
-        "movq           %1, %%mm0 \n"\
-        "pmaddubsw      %2, %%mm0 \n"\
-        "pshufw $14, %%mm0, %%mm1 \n"\
-        "paddw       %%mm1, %%mm0 \n"\
-        "pshufw  $1, %%mm0, %%mm1 \n"\
-        "paddw       %%mm1, %%mm0 \n"\
-        "movd        %%mm0, %0    \n"\
-        "movswl        %w0, %0    \n"\
-        :"=r"(H)\
-        :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)\
-    );
-#endif // HIGH_BIT_DEPTH
-
-#define PREDICT_8x8C_P_CORE_INLINE\
-    int H, V;\
-    PREDICT_8x8C_P_ASM\
-    V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
-      + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
-      + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
-      + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
-    H += -4 * src[-1*FDEC_STRIDE -1];
-
-#define PREDICT_8x8C_P_INLINE(name, name2)\
-static void x264_predict_8x8c_p_##name( pixel *src )\
-{\
-    PREDICT_8x8C_P_CORE_INLINE\
-    PREDICT_8x8C_P_END(name2)\
-}
-#else // !HAVE_X86_INLINE_ASM
-#define PREDICT_8x8C_P_INLINE(name, name2) PREDICT_8x8C_P(name, name2)
-#endif // HAVE_X86_INLINE_ASM
-
-#if HIGH_BIT_DEPTH
-PREDICT_8x8C_P_INLINE( sse2, sse2 )
-#else  //!HIGH_BIT_DEPTH
-#if !ARCH_X86_64
-PREDICT_8x8C_P( mmx2, mmx2 )
-#endif // !ARCH_X86_64
-PREDICT_8x8C_P( sse2, sse2 )
-#if HAVE_X86_INLINE_ASM
-PREDICT_8x8C_P_INLINE( ssse3, sse2 )
-#endif // HAVE_X86_INLINE_ASM
-#endif // HIGH_BIT_DEPTH
-PREDICT_8x8C_P_INLINE( avx, avx )
-PREDICT_8x8C_P_INLINE( avx2, avx2 )
-
-#if ARCH_X86_64 && !HIGH_BIT_DEPTH
-static void x264_predict_8x8c_dc_left( uint8_t *src )
-{
-    int y;
-    uint32_t s0 = 0, s1 = 0;
-    uint64_t dc0, dc1;
-
-    for( y = 0; y < 4; y++ )
-    {
-        s0 += src[y * FDEC_STRIDE     - 1];
-        s1 += src[(y+4) * FDEC_STRIDE - 1];
-    }
-    dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL;
-    dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL;
-
-    for( y = 0; y < 4; y++ )
-    {
-        M64( src ) = dc0;
-        src += FDEC_STRIDE;
-    }
-    for( y = 0; y < 4; y++ )
-    {
-        M64( src ) = dc1;
-        src += FDEC_STRIDE;
-    }
-}
-#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH
-
-/****************************************************************************
- * Exported functions:
- ****************************************************************************/
-void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
-{
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_mmx2;
-    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_mmx2;
-#if HIGH_BIT_DEPTH
-    if( !(cpu&X264_CPU_SSE) )
-        return;
-    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse;
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;
-    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;
-    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
-    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_sse2;
-    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_avx;
-    if( !(cpu&X264_CPU_AVX2) )
-        return;
-    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_avx2;
-#else
-#if !ARCH_X86_64
-    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_mmx2;
-#endif
-    if( !(cpu&X264_CPU_SSE) )
-        return;
-    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse;
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;
-    if( cpu&X264_CPU_SSE2_IS_SLOW )
-        return;
-    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;
-    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
-    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
-        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_ssse3;
-#if HAVE_X86_INLINE_ASM
-    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_ssse3;
-#endif
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_avx;
-#endif // HIGH_BIT_DEPTH
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_avx2;
-        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_avx2;
-        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_avx2;
-        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2;
-    }
-}
-
-void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
-{
-    if( !(cpu&X264_CPU_MMX) )
-        return;
-#if HIGH_BIT_DEPTH
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmx2;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_mmx2;
-    if( !(cpu&X264_CPU_SSE) )
-        return;
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_sse;
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_sse2;
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_sse2;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_sse2;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_sse2;
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_avx;
-    if( !(cpu&X264_CPU_AVX2) )
-        return;
-    pf[I_PRED_CHROMA_H]   = x264_predict_8x8c_h_avx2;
-#else
-#if ARCH_X86_64
-    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
-#endif
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_mmx2;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_mmx2;
-#if !ARCH_X86_64
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_mmx2;
-#endif
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmx2;
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_sse2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_ssse3;
-#if HAVE_X86_INLINE_ASM
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_ssse3;
-#endif
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_avx;
-#endif // HIGH_BIT_DEPTH
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf[I_PRED_CHROMA_P]   = x264_predict_8x8c_p_avx2;
-    }
-}
-
-void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
-{
-    if( !(cpu&X264_CPU_MMX) )
-        return;
-#if HIGH_BIT_DEPTH
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_mmx2;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_mmx2;
-    if( !(cpu&X264_CPU_SSE) )
-        return;
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_sse;
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_sse2;
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_sse2;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_sse2;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_sse2;
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_avx;
-    if( !(cpu&X264_CPU_AVX2) )
-        return;
-    pf[I_PRED_CHROMA_H]   = x264_predict_8x16c_h_avx2;
-#else
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_mmx;
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_mmx2;
-    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_mmx2;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_mmx2;
-#if !ARCH_X86_64
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_mmx2;
-#endif
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_sse2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_ssse3;
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_avx;
-#endif // HIGH_BIT_DEPTH
-
-    if( cpu&X264_CPU_AVX2 )
-    {
-        pf[I_PRED_CHROMA_P]   = x264_predict_8x16c_p_avx2;
-    }
-}
-
-void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
-{
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-#if HIGH_BIT_DEPTH
-    if( !(cpu&X264_CPU_SSE) )
-        return;
-    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_sse;
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_sse2;
-    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_sse2;
-    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2;
-    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
-    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_sse2;
-    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_sse2;
-    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_sse2;
-    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_sse2;
-    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_sse2;
-    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_sse2;
-    *predict_8x8_filter   = x264_predict_8x8_filter_sse2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_ssse3;
-    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_ssse3;
-    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_ssse3;
-    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_ssse3;
-    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_ssse3;
-    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_ssse3;
-    *predict_8x8_filter   = x264_predict_8x8_filter_ssse3;
-    if( cpu&X264_CPU_CACHELINE_64 )
-    {
-        pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_ssse3_cache64;
-        pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_ssse3_cache64;
-    }
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_avx;
-    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_avx;
-    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_avx;
-    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_avx;
-    *predict_8x8_filter   = x264_predict_8x8_filter_avx;
-#else
-    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmx2;
-    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmx2;
-    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmx2;
-    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
-    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
-    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_mmx2;
-    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_mmx2;
-    *predict_8x8_filter   = x264_predict_8x8_filter_mmx2;
-#if ARCH_X86
-    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_mmx2;
-    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_mmx2;
-    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_mmx2;
-    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_mmx2;
-#endif
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_sse2;
-    pf[I_PRED_8x8_VL]   = x264_predict_8x8_vl_sse2;
-    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_sse2;
-    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_sse2;
-    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_sse2;
-    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_sse2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
-    {
-        pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_ssse3;
-        pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_ssse3;
-    }
-    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
-    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_avx;
-    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_avx;
-    pf[I_PRED_8x8_VL]   = x264_predict_8x8_vl_avx;
-    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_avx;
-    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_avx;
-#endif // HIGH_BIT_DEPTH
-}
-
-void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
-{
-    if( !(cpu&X264_CPU_MMX2) )
-        return;
-    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_mmx2;
-    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmx2;
-    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmx2;
-    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmx2;
-    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_mmx2;
-    pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_mmx2;
-#if HIGH_BIT_DEPTH
-    if( !(cpu&X264_CPU_SSE2) )
-        return;
-    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
-    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_sse2;
-    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_sse2;
-    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_sse2;
-    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_sse2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
-    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
-    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
-    if( !(cpu&X264_CPU_AVX) )
-        return;
-    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
-    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;
-    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_avx;
-    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_avx;
-    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_avx;
-    if( !(cpu&X264_CPU_AVX2) )
-        return;
-    pf[I_PRED_4x4_H]  = x264_predict_4x4_h_avx2;
-#else
-    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmx2;
-    if( !(cpu&X264_CPU_SSSE3) )
-        return;
-    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
-    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
-    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
-    if( cpu&X264_CPU_CACHELINE_64 )
-        pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64;
-#endif // HIGH_BIT_DEPTH
-}
diff --git a/android/src/main/libenc/jni/libx264/common/x86/predict.h b/android/src/main/libenc/jni/libx264/common/x86/predict.h
deleted file mode 100755
index ba1dd6b..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/predict.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/*****************************************************************************
- * predict.h: x86 intra prediction
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_I386_PREDICT_H
-#define X264_I386_PREDICT_H
-
-void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x16c_init_mmx  ( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x8c_init_mmx  ( int cpu, x264_predict_t pf[7] );
-void x264_predict_4x4_init_mmx   ( int cpu, x264_predict_t pf[12] );
-void x264_predict_8x8_init_mmx   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
-
-void x264_predict_16x16_v_mmx2( pixel *src );
-void x264_predict_16x16_v_sse ( pixel *src );
-void x264_predict_16x16_v_avx ( uint16_t *src );
-void x264_predict_16x16_h_mmx2( pixel *src );
-void x264_predict_16x16_h_sse2( uint16_t *src );
-void x264_predict_16x16_h_ssse3( uint8_t *src );
-void x264_predict_16x16_h_avx2( uint16_t *src );
-void x264_predict_16x16_dc_sse2( pixel *src );
-void x264_predict_16x16_dc_avx2( pixel *src );
-void x264_predict_16x16_dc_left_sse2( pixel *src );
-void x264_predict_16x16_dc_left_avx2( pixel *src );
-void x264_predict_16x16_dc_top_sse2( pixel *src );
-void x264_predict_16x16_dc_top_avx2( pixel *src );
-void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
-void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
-void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
-void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c );
-void x264_predict_8x16c_dc_mmx2( pixel *src );
-void x264_predict_8x16c_dc_sse2( uint16_t *src );
-void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
-void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
-void x264_predict_8x16c_v_mmx( uint8_t *src );
-void x264_predict_8x16c_v_sse( uint16_t *src );
-void x264_predict_8x16c_h_mmx2( pixel *src );
-void x264_predict_8x16c_h_sse2( uint16_t *src );
-void x264_predict_8x16c_h_ssse3( uint8_t *src );
-void x264_predict_8x16c_h_avx2( uint16_t *src );
-void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
-void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
-void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c );
-void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c );
-void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
-void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
-void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c );
-void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c );
-void x264_predict_8x8c_dc_mmx2( pixel *src );
-void x264_predict_8x8c_dc_sse2( uint16_t *src );
-void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
-void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
-void x264_predict_8x8c_v_mmx( pixel *src );
-void x264_predict_8x8c_v_sse( uint16_t *src );
-void x264_predict_8x8c_h_mmx2( pixel *src );
-void x264_predict_8x8c_h_sse2( uint16_t *src );
-void x264_predict_8x8c_h_ssse3( uint8_t *src );
-void x264_predict_8x8c_h_avx2( uint16_t *src );
-void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] );
-void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
-void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
-void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
-void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
-void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
-void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
-void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
-void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
-void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
-void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
-void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
-void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
-void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
-void x264_predict_4x4_h_avx2( uint16_t *src );
-void x264_predict_4x4_ddl_mmx2( pixel *src );
-void x264_predict_4x4_ddl_sse2( uint16_t *src );
-void x264_predict_4x4_ddl_avx( uint16_t *src );
-void x264_predict_4x4_ddr_mmx2( pixel *src );
-void x264_predict_4x4_vl_mmx2( pixel *src );
-void x264_predict_4x4_vl_sse2( uint16_t *src );
-void x264_predict_4x4_vl_avx( uint16_t *src );
-void x264_predict_4x4_vr_mmx2( uint8_t *src );
-void x264_predict_4x4_vr_sse2( uint16_t *src );
-void x264_predict_4x4_vr_ssse3( pixel *src );
-void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
-void x264_predict_4x4_vr_avx( uint16_t *src );
-void x264_predict_4x4_hd_mmx2( pixel *src );
-void x264_predict_4x4_hd_sse2( uint16_t *src );
-void x264_predict_4x4_hd_ssse3( pixel *src );
-void x264_predict_4x4_hd_avx( uint16_t *src );
-void x264_predict_4x4_dc_mmx2( pixel *src );
-void x264_predict_4x4_ddr_sse2( uint16_t *src );
-void x264_predict_4x4_ddr_ssse3( pixel *src );
-void x264_predict_4x4_ddr_avx( uint16_t *src );
-void x264_predict_4x4_hu_mmx2( pixel *src );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/quant-a.asm b/android/src/main/libenc/jni/libx264/common/x86/quant-a.asm
deleted file mode 100755
index 2dc0249..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/quant-a.asm
+++ /dev/null
@@ -1,1974 +0,0 @@
-;*****************************************************************************
-;* quant-a.asm: x86 quantization and level-run
-;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Christian Heine <sennindemokrit@gmx.net>
-;*          Oskar Arvidsson <oskar@irock.se>
-;*          Henrik Gramner <henrik@gramner.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-%macro DQM4 3
-    dw %1, %2, %1, %2, %2, %3, %2, %3
-%endmacro
-%macro DQM8 6
-    dw %1, %4, %5, %4, %1, %4, %5, %4
-    dw %4, %2, %6, %2, %4, %2, %6, %2
-    dw %5, %6, %3, %6, %5, %6, %3, %6
-    dw %4, %2, %6, %2, %4, %2, %6, %2
-%endmacro
-
-dequant4_scale:
-    DQM4 10, 13, 16
-    DQM4 11, 14, 18
-    DQM4 13, 16, 20
-    DQM4 14, 18, 23
-    DQM4 16, 20, 25
-    DQM4 18, 23, 29
-
-dequant8_scale:
-    DQM8 20, 18, 32, 19, 25, 24
-    DQM8 22, 19, 35, 21, 28, 26
-    DQM8 26, 23, 42, 24, 33, 31
-    DQM8 28, 25, 45, 26, 35, 33
-    DQM8 32, 28, 51, 30, 40, 38
-    DQM8 36, 32, 58, 34, 46, 43
-
-decimate_mask_table4:
-    db  0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
-    db  3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
-    db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
-    db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
-    db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
-    db  6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
-    db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
-    db  9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
-    db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
-
-chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
-chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
-chroma_dc_dct_mask:     dw 1, 1,-1,-1, 1, 1,-1,-1
-chroma_dc_dmf_mask:     dw 1, 1,-1,-1, 1,-1,-1, 1
-
-%if HIGH_BIT_DEPTH==0
-dct_coef_shuffle:
-%macro DCT_COEF_SHUFFLE 8
-    %assign y x
-    %rep 8
-        %rep 7
-            %rotate (~(y>>7))&1
-            %assign y y<<((~(y>>7))&1)
-        %endrep
-        db %1*2
-        %rotate 1
-        %assign y y<<1
-    %endrep
-%endmacro
-%assign x 0
-%rep 256
-    DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
-%assign x x+1
-%endrep
-%endif
-
-SECTION .text
-
-cextern pb_1
-cextern pw_1
-cextern pw_2
-cextern pw_256
-cextern pd_1
-cextern pb_01
-cextern pd_1024
-cextern deinterleave_shufd
-cextern popcnt_table
-
-%macro QUANT_DC_START 2
-    movd      xm%1, r1m     ; mf
-    movd      xm%2, r2m     ; bias
-%if cpuflag(avx2)
-    vpbroadcastdct m%1, xm%1
-    vpbroadcastdct m%2, xm%2
-%elif HIGH_BIT_DEPTH
-    SPLATD     m%1, m%1
-    SPLATD     m%2, m%2
-%elif cpuflag(sse4) ; ssse3, but not faster on conroe
-    mova       m5, [pb_01]
-    pshufb     m%1, m5
-    pshufb     m%2, m5
-%else
-    SPLATW     m%1, m%1
-    SPLATW     m%2, m%2
-%endif
-%endmacro
-
-%macro QUANT_END 0
-    xor      eax, eax
-%if cpuflag(sse4)
-    ptest     m5, m5
-%else ; !sse4
-%if ARCH_X86_64
-%if mmsize == 16
-    packsswb  m5, m5
-%endif
-    movq     rcx, m5
-    test     rcx, rcx
-%else
-%if mmsize == 16
-    pxor      m4, m4
-    pcmpeqb   m5, m4
-    pmovmskb ecx, m5
-    cmp      ecx, (1<<mmsize)-1
-%else
-    packsswb  m5, m5
-    movd     ecx, m5
-    test     ecx, ecx
-%endif
-%endif
-%endif ; cpuflag
-    setne     al
-%endmacro
-
-%if HIGH_BIT_DEPTH
-%macro QUANT_ONE_DC 4
-%if cpuflag(sse4)
-    mova        m0, [%1]
-    ABSD        m1, m0
-    paddd       m1, %3
-    pmulld      m1, %2
-    psrad       m1, 16
-%else ; !sse4
-    mova        m0, [%1]
-    ABSD        m1, m0
-    paddd       m1, %3
-    mova        m2, m1
-    psrlq       m2, 32
-    pmuludq     m1, %2
-    pmuludq     m2, %2
-    psllq       m2, 32
-    paddd       m1, m2
-    psrld       m1, 16
-%endif ; cpuflag
-    PSIGND      m1, m0
-    mova      [%1], m1
-    ACCUM     por, 5, 1, %4
-%endmacro
-
-%macro QUANT_TWO_DC 4
-%if cpuflag(sse4)
-    mova        m0, [%1       ]
-    mova        m1, [%1+mmsize]
-    ABSD        m2, m0
-    ABSD        m3, m1
-    paddd       m2, %3
-    paddd       m3, %3
-    pmulld      m2, %2
-    pmulld      m3, %2
-    psrad       m2, 16
-    psrad       m3, 16
-    PSIGND      m2, m0
-    PSIGND      m3, m1
-    mova [%1       ], m2
-    mova [%1+mmsize], m3
-    ACCUM      por, 5, 2, %4
-    por         m5, m3
-%else ; !sse4
-    QUANT_ONE_DC %1, %2, %3, %4
-    QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
-%endif ; cpuflag
-%endmacro
-
-%macro QUANT_ONE_AC_MMX 5
-    mova        m0, [%1]
-    mova        m2, [%2]
-    ABSD        m1, m0
-    mova        m4, m2
-    paddd       m1, [%3]
-    mova        m3, m1
-    psrlq       m4, 32
-    psrlq       m3, 32
-    pmuludq     m1, m2
-    pmuludq     m3, m4
-    psllq       m3, 32
-    paddd       m1, m3
-    psrad       m1, 16
-    PSIGND      m1, m0
-    mova      [%1], m1
-    ACCUM      por, %5, 1, %4
-%endmacro
-
-%macro QUANT_TWO_AC 5
-%if cpuflag(sse4)
-    mova        m0, [%1       ]
-    mova        m1, [%1+mmsize]
-    ABSD        m2, m0
-    ABSD        m3, m1
-    paddd       m2, [%3       ]
-    paddd       m3, [%3+mmsize]
-    pmulld      m2, [%2       ]
-    pmulld      m3, [%2+mmsize]
-    psrad       m2, 16
-    psrad       m3, 16
-    PSIGND      m2, m0
-    PSIGND      m3, m1
-    mova [%1       ], m2
-    mova [%1+mmsize], m3
-    ACCUM      por, %5, 2, %4
-    por        m%5, m3
-%else ; !sse4
-    QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
-    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
-%endif ; cpuflag
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int quant_2x2( int32_t dct[M*N], int mf, int bias )
-;-----------------------------------------------------------------------------
-%macro QUANT_DC 2
-cglobal quant_%1x%2_dc, 3,3,8
-    QUANT_DC_START 6,7
-%if %1*%2 <= mmsize/4
-    QUANT_ONE_DC r0, m6, m7, 0
-%else
-%assign x 0
-%rep %1*%2/(mmsize/2)
-    QUANT_TWO_DC r0+x, m6, m7, x
-%assign x x+mmsize*2
-%endrep
-%endif
-    QUANT_END
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
-;-----------------------------------------------------------------------------
-%macro QUANT_AC 2
-cglobal quant_%1x%2, 3,3,8
-%assign x 0
-%rep %1*%2/(mmsize/2)
-    QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
-%assign x x+mmsize*2
-%endrep
-    QUANT_END
-    RET
-%endmacro
-
-%macro QUANT_4x4 2
-    QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
-    QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
-%endmacro
-
-%macro QUANT_4x4x4 0
-cglobal quant_4x4x4, 3,3,8
-    QUANT_4x4  0, 5
-    QUANT_4x4 64, 6
-    add       r0, 128
-    packssdw  m5, m6
-    QUANT_4x4  0, 6
-    QUANT_4x4 64, 7
-    packssdw  m6, m7
-    packssdw  m5, m6  ; AAAA BBBB CCCC DDDD
-    pxor      m4, m4
-    pcmpeqd   m5, m4
-    movmskps eax, m5
-    xor      eax, 0xf
-    RET
-%endmacro
-
-INIT_XMM sse2
-QUANT_DC 2, 2
-QUANT_DC 4, 4
-QUANT_AC 4, 4
-QUANT_AC 8, 8
-QUANT_4x4x4
-
-INIT_XMM ssse3
-QUANT_DC 2, 2
-QUANT_DC 4, 4
-QUANT_AC 4, 4
-QUANT_AC 8, 8
-QUANT_4x4x4
-
-INIT_XMM sse4
-QUANT_DC 2, 2
-QUANT_DC 4, 4
-QUANT_AC 4, 4
-QUANT_AC 8, 8
-QUANT_4x4x4
-
-INIT_YMM avx2
-QUANT_DC 4, 4
-QUANT_AC 4, 4
-QUANT_AC 8, 8
-
-INIT_YMM avx2
-cglobal quant_4x4x4, 3,3,6
-    QUANT_TWO_AC r0,    r1, r2, 0, 4
-    QUANT_TWO_AC r0+64, r1, r2, 0, 5
-    add       r0, 128
-    packssdw  m4, m5
-    QUANT_TWO_AC r0,    r1, r2, 0, 5
-    QUANT_TWO_AC r0+64, r1, r2, 0, 1
-    packssdw  m5, m1
-    packssdw  m4, m5
-    pxor      m3, m3
-    pcmpeqd   m4, m3
-    movmskps eax, m4
-    mov      edx, eax
-    shr      eax, 4
-    and      eax, edx
-    xor      eax, 0xf
-    RET
-
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-%macro QUANT_ONE 5
-;;; %1      (m64)       dct[y][x]
-;;; %2      (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
-;;; %3      (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
-    mova       m1, %1   ; load dct coeffs
-    ABSW       m0, m1, sign
-    paddusw    m0, %3   ; round
-    pmulhuw    m0, %2   ; divide
-    PSIGNW     m0, m1   ; restore sign
-    mova       %1, m0   ; store
-    ACCUM     por, %5, 0, %4
-%endmacro
-
-%macro QUANT_TWO 8
-    mova       m1, %1
-    mova       m3, %2
-    ABSW       m0, m1, sign
-    ABSW       m2, m3, sign
-    paddusw    m0, %5
-    paddusw    m2, %6
-    pmulhuw    m0, %3
-    pmulhuw    m2, %4
-    PSIGNW     m0, m1
-    PSIGNW     m2, m3
-    mova       %1, m0
-    mova       %2, m2
-    ACCUM     por, %8, 0, %7
-    ACCUM     por, %8, 2, %7+mmsize
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
-;-----------------------------------------------------------------------------
-%macro QUANT_DC 2-3 0
-cglobal %1, 1,1,%3
-%if %2==1
-    QUANT_DC_START 2,3
-    QUANT_ONE [r0], m2, m3, 0, 5
-%else
-    QUANT_DC_START 4,6
-%assign x 0
-%rep %2/2
-    QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
-%assign x x+mmsize*2
-%endrep
-%endif
-    QUANT_END
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-;-----------------------------------------------------------------------------
-%macro QUANT_AC 2
-cglobal %1, 3,3
-%if %2==1
-    QUANT_ONE [r0], [r1], [r2], 0, 5
-%else
-%assign x 0
-%rep %2/2
-    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
-%assign x x+mmsize*2
-%endrep
-%endif
-    QUANT_END
-    RET
-%endmacro
-
-%macro QUANT_4x4 2
-%if UNIX64
-    QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
-%else
-    QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
-%if mmsize==8
-    QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
-%endif
-%endif
-%endmacro
-
-%macro QUANT_4x4x4 0
-cglobal quant_4x4x4, 3,3,7
-%if UNIX64
-    mova      m8, [r1+mmsize*0]
-    mova      m9, [r1+mmsize*1]
-    mova     m10, [r2+mmsize*0]
-    mova     m11, [r2+mmsize*1]
-%endif
-    QUANT_4x4  0, 4
-    QUANT_4x4 32, 5
-    packssdw  m4, m5
-    QUANT_4x4 64, 5
-    QUANT_4x4 96, 6
-    packssdw  m5, m6
-    packssdw  m4, m5  ; AAAA BBBB CCCC DDDD
-    pxor      m3, m3
-    pcmpeqd   m4, m3
-    movmskps eax, m4
-    xor      eax, 0xf
-    RET
-%endmacro
-
-INIT_MMX mmx2
-QUANT_DC quant_2x2_dc, 1
-%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
-QUANT_DC quant_4x4_dc, 4
-INIT_MMX mmx2
-QUANT_AC quant_4x4, 4
-QUANT_AC quant_8x8, 16
-%endif
-
-INIT_XMM sse2
-QUANT_DC quant_4x4_dc, 2, 7
-QUANT_AC quant_4x4, 2
-QUANT_AC quant_8x8, 8
-QUANT_4x4x4
-
-INIT_XMM ssse3
-QUANT_DC quant_4x4_dc, 2, 7
-QUANT_AC quant_4x4, 2
-QUANT_AC quant_8x8, 8
-QUANT_4x4x4
-
-INIT_MMX ssse3
-QUANT_DC quant_2x2_dc, 1
-
-INIT_XMM sse4
-;Not faster on Conroe, so only used in SSE4 versions
-QUANT_DC quant_4x4_dc, 2, 7
-QUANT_AC quant_4x4, 2
-QUANT_AC quant_8x8, 8
-
-INIT_YMM avx2
-QUANT_AC quant_4x4, 1
-QUANT_AC quant_8x8, 4
-QUANT_DC quant_4x4_dc, 1, 6
-
-INIT_YMM avx2
-cglobal quant_4x4x4, 3,3,6
-    mova      m2, [r1]
-    mova      m3, [r2]
-    QUANT_ONE [r0+ 0], m2, m3, 0, 4
-    QUANT_ONE [r0+32], m2, m3, 0, 5
-    packssdw  m4, m5
-    QUANT_ONE [r0+64], m2, m3, 0, 5
-    QUANT_ONE [r0+96], m2, m3, 0, 1
-    packssdw  m5, m1
-    packssdw  m4, m5
-    pxor      m3, m3
-    pcmpeqd   m4, m3
-    movmskps eax, m4
-    mov      edx, eax
-    shr      eax, 4
-    and      eax, edx
-    xor      eax, 0xf
-    RET
-%endif ; !HIGH_BIT_DEPTH
-
-
-
-;=============================================================================
-; dequant
-;=============================================================================
-
-%macro DEQUANT16_L 4
-;;; %1      dct[y][x]
-;;; %2,%3   dequant_mf[i_mf][y][x]
-;;; m2      i_qbits
-%if HIGH_BIT_DEPTH
-    mova     m0, %1
-    mova     m1, %4
-    pmaddwd  m0, %2
-    pmaddwd  m1, %3
-    pslld    m0, xm2
-    pslld    m1, xm2
-    mova     %1, m0
-    mova     %4, m1
-%else
-    mova     m0, %2
-    packssdw m0, %3
-%if mmsize==32
-    vpermq   m0, m0, q3120
-%endif
-    pmullw   m0, %1
-    psllw    m0, xm2
-    mova     %1, m0
-%endif
-%endmacro
-
-%macro DEQUANT32_R 4
-;;; %1      dct[y][x]
-;;; %2,%3   dequant_mf[i_mf][y][x]
-;;; m2      -i_qbits
-;;; m3      f
-;;; m4      0
-%if HIGH_BIT_DEPTH
-    mova      m0, %1
-    mova      m1, %4
-    pmadcswd  m0, m0, %2, m3
-    pmadcswd  m1, m1, %3, m3
-    psrad     m0, xm2
-    psrad     m1, xm2
-    mova      %1, m0
-    mova      %4, m1
-%else
-%if mmsize == 32
-    pmovzxwd  m0, %1
-    pmovzxwd  m1, %4
-%else
-    mova      m0, %1
-    punpckhwd m1, m0, m4
-    punpcklwd m0, m4
-%endif
-    pmadcswd  m0, m0, %2, m3
-    pmadcswd  m1, m1, %3, m3
-    psrad     m0, xm2
-    psrad     m1, xm2
-    packssdw  m0, m1
-%if mmsize == 32
-    vpermq    m0, m0, q3120
-%endif
-    mova      %1, m0
-%endif
-%endmacro
-
-%macro DEQUANT_LOOP 3
-%if 8*(%2-2*%3) > 0
-    mov t0d, 8*(%2-2*%3)
-%%loop:
-    %1 [r0+(t0     )*SIZEOF_PIXEL], [r1+t0*2      ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
-    %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
-    sub t0d, 16*%3
-    jge %%loop
-    RET
-%else
-%if mmsize < 32
-    %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
-%endif
-    %1 [r0+(0   )*SIZEOF_PIXEL], [r1+0    ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
-    RET
-%endif
-%endmacro
-
-%macro DEQUANT16_FLAT 2-5
-    mova   m0, %1
-    psllw  m0, m4
-%assign i %0-2
-%rep %0-1
-%if i
-    mova   m %+ i, [r0+%2]
-    pmullw m %+ i, m0
-%else
-    pmullw m0, [r0+%2]
-%endif
-    mova   [r0+%2], m %+ i
-    %assign i i-1
-    %rotate 1
-%endrep
-%endmacro
-
-%if ARCH_X86_64
-    DECLARE_REG_TMP 6,3,2
-%else
-    DECLARE_REG_TMP 2,0,1
-%endif
-
-%macro DEQUANT_START 2
-    movifnidn t2d, r2m
-    imul t0d, t2d, 0x2b
-    shr  t0d, 8     ; i_qbits = i_qp / 6
-    lea  t1d, [t0*5]
-    sub  t2d, t0d
-    sub  t2d, t1d   ; i_mf = i_qp % 6
-    shl  t2d, %1
-%if ARCH_X86_64
-    add  r1, t2     ; dequant_mf[i_mf]
-%else
-    add  r1, r1mp   ; dequant_mf[i_mf]
-    mov  r0, r0mp   ; dct
-%endif
-    sub  t0d, %2
-    jl   .rshift32  ; negative qbits => rightshift
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
-;-----------------------------------------------------------------------------
-%macro DEQUANT 3
-cglobal dequant_%1x%1, 0,3,6
-.skip_prologue:
-    DEQUANT_START %2+2, %2
-
-.lshift:
-    movd xm2, t0d
-    DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
-
-.rshift32:
-    neg   t0d
-    mova  m3, [pd_1]
-    movd xm2, t0d
-    pslld m3, xm2
-    pxor  m4, m4
-    psrld m3, 1
-    DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
-
-%if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
-cglobal dequant_%1x%1_flat16, 0,3
-    movifnidn t2d, r2m
-%if %1 == 8
-    cmp  t2d, 12
-    jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
-    sub  t2d, 12
-%endif
-    imul t0d, t2d, 0x2b
-    shr  t0d, 8     ; i_qbits = i_qp / 6
-    lea  t1d, [t0*5]
-    sub  t2d, t0d
-    sub  t2d, t1d   ; i_mf = i_qp % 6
-    shl  t2d, %2
-%ifdef PIC
-    lea  r1, [dequant%1_scale]
-    add  r1, t2
-%else
-    lea  r1, [dequant%1_scale + t2]
-%endif
-    movifnidn r0, r0mp
-    movd xm4, t0d
-%if %1 == 4
-%if mmsize == 8
-    DEQUANT16_FLAT [r1], 0, 16
-    DEQUANT16_FLAT [r1+8], 8, 24
-%elif mmsize == 16
-    DEQUANT16_FLAT [r1], 0, 16
-%else
-    vbroadcasti128 m0, [r1]
-    psllw  m0, xm4
-    pmullw m0, [r0]
-    mova [r0], m0
-%endif
-%elif mmsize == 8
-    DEQUANT16_FLAT [r1], 0, 8, 64, 72
-    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
-    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
-    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
-%elif mmsize == 16
-    DEQUANT16_FLAT [r1], 0, 64
-    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
-    DEQUANT16_FLAT [r1+32], 32, 96
-%else
-    mova   m1, [r1+ 0]
-    mova   m2, [r1+32]
-    psllw  m1, xm4
-    psllw  m2, xm4
-    pmullw m0, m1, [r0+ 0]
-    pmullw m3, m2, [r0+32]
-    pmullw m4, m1, [r0+64]
-    pmullw m5, m2, [r0+96]
-    mova [r0+ 0], m0
-    mova [r0+32], m3
-    mova [r0+64], m4
-    mova [r0+96], m5
-%endif
-    RET
-%endif ; !HIGH_BIT_DEPTH && !AVX
-%endmacro ; DEQUANT
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-DEQUANT 4, 4, 2
-DEQUANT 8, 6, 2
-INIT_XMM xop
-DEQUANT 4, 4, 2
-DEQUANT 8, 6, 2
-INIT_YMM avx2
-DEQUANT 4, 4, 4
-DEQUANT 8, 6, 4
-%else
-%if ARCH_X86_64 == 0
-INIT_MMX mmx
-DEQUANT 4, 4, 1
-DEQUANT 8, 6, 1
-%endif
-INIT_XMM sse2
-DEQUANT 4, 4, 2
-DEQUANT 8, 6, 2
-INIT_XMM avx
-DEQUANT 4, 4, 2
-DEQUANT 8, 6, 2
-INIT_XMM xop
-DEQUANT 4, 4, 2
-DEQUANT 8, 6, 2
-INIT_YMM avx2
-DEQUANT 4, 4, 4
-DEQUANT 8, 6, 4
-%endif
-
-%macro DEQUANT_DC 2
-cglobal dequant_4x4dc, 0,3,6
-    DEQUANT_START 6, 6
-
-.lshift:
-%if cpuflag(avx2)
-    vpbroadcastdct m3, [r1]
-%else
-    movd    xm3, [r1]
-    SPLAT%1  m3, xm3
-%endif
-    movd    xm2, t0d
-    pslld    m3, xm2
-%assign %%x 0
-%rep SIZEOF_PIXEL*32/mmsize
-    %2       m0, m3, [r0+%%x]
-    mova     [r0+%%x], m0
-%assign %%x %%x+mmsize
-%endrep
-    RET
-
-.rshift32:
-    neg      t0d
-%if cpuflag(avx2)
-    vpbroadcastdct m2, [r1]
-%else
-    movd     xm2, [r1]
-%endif
-    mova      m5, [p%1_1]
-    movd     xm3, t0d
-    pslld     m4, m5, xm3
-    psrld     m4, 1
-%if HIGH_BIT_DEPTH
-%if notcpuflag(avx2)
-    pshufd    m2, m2, 0
-%endif
-%assign %%x 0
-%rep SIZEOF_PIXEL*32/mmsize
-    pmadcswd  m0, m2, [r0+%%x], m4
-    psrad     m0, xm3
-    mova      [r0+%%x], m0
-%assign %%x %%x+mmsize
-%endrep
-
-%else ; !HIGH_BIT_DEPTH
-%if notcpuflag(avx2)
-    PSHUFLW   m2, m2, 0
-%endif
-    punpcklwd m2, m4
-%assign %%x 0
-%rep SIZEOF_PIXEL*32/mmsize
-    mova      m0, [r0+%%x]
-    punpckhwd m1, m0, m5
-    punpcklwd m0, m5
-    pmaddwd   m0, m2
-    pmaddwd   m1, m2
-    psrad     m0, xm3
-    psrad     m1, xm3
-    packssdw  m0, m1
-    mova      [r0+%%x], m0
-%assign %%x %%x+mmsize
-%endrep
-%endif ; !HIGH_BIT_DEPTH
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-DEQUANT_DC d, pmaddwd
-INIT_XMM xop
-DEQUANT_DC d, pmaddwd
-INIT_YMM avx2
-DEQUANT_DC d, pmaddwd
-%else
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DEQUANT_DC w, pmullw
-%endif
-INIT_XMM sse2
-DEQUANT_DC w, pmullw
-INIT_XMM avx
-DEQUANT_DC w, pmullw
-INIT_YMM avx2
-DEQUANT_DC w, pmullw
-%endif
-
-%macro PEXTRW 4
-    %if cpuflag(sse4)
-        pextrw %1, %2, %3
-    %else
-        ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
-        %if %3
-            pextrw %4d, %2, %3
-        %else
-            movd %4d, %2
-        %endif
-        mov %1, %4w
-    %endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
-; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
-;-----------------------------------------------------------------------------
-
-%macro DEQUANT_2x4_DC 1
-%ifidn %1, dconly
-    DECLARE_REG_TMP 6,3,2
-    %define %%args dct, dmf, qp
-%else
-    DECLARE_REG_TMP 6,4,3
-    %define %%args dct, dct4x4, dmf, qp
-%endif
-
-%if ARCH_X86_64 == 0
-    DECLARE_REG_TMP 2,0,1
-%endif
-
-cglobal idct_dequant_2x4_%1, 0,3,5, %%args
-    movifnidn  t2d, qpm
-    imul       t0d, t2d, 0x2b
-    shr        t0d, 8         ; qp / 6
-    lea        t1d, [t0*5]
-    sub        t2d, t0d
-    sub        t2d, t1d       ; qp % 6
-    shl        t2d, 6         ; 16 * sizeof(int)
-%if ARCH_X86_64
-    imul       t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
-%else
-    mov       dctq, dctmp
-    add         t2, dmfmp
-    imul       t2d, [t2], -0xffff
-%endif
-%if HIGH_BIT_DEPTH
-    mova        m0, [dctq]
-    mova        m1, [dctq+16]
-    SUMSUB_BA    d, 1, 0, 2   ; 16-bit intermediate precision is enough for the first two sumsub steps,
-    packssdw    m1, m0        ; and by packing to words we can use pmaddwd instead of pmulld later.
-%else
-    movq        m0, [dctq]
-    movq        m1, [dctq+8]
-    SUMSUB_BA    w, 1, 0, 2
-    punpcklqdq  m1, m0        ; a0 a1 a2 a3 a4 a5 a6 a7
-%endif
-    pshufd      m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
-    movd        m3, t2d
-    pshuflw     m3, m3, q1000 ; +  +  +  -
-    SUMSUB_BA    w, 0, 1, 2
-    punpcklqdq  m3, m3        ; +  +  +  -  +  +  +  -
-    pshufd      m1, m1, q0022
-    sub        t0d, 6
-    jl .rshift
-    movd        m2, t0d
-    psllw       m3, m2
-    pmaddwd     m0, m3
-    pmaddwd     m1, m3
-    jmp .end
-.rshift:
-    neg        t0d
-    movd        m2, t0d
-    pcmpeqd     m4, m4
-    pmaddwd     m0, m3
-    pmaddwd     m1, m3
-    pslld       m4, m2
-    psrad       m4, 1
-    psubd       m0, m4 ; + 1 << (qp/6-1)
-    psubd       m1, m4
-    psrad       m0, m2
-    psrad       m1, m2
-.end:
-%ifidn %1, dconly
-%if HIGH_BIT_DEPTH
-    mova    [dctq], m0
-    mova [dctq+16], m1
-%else
-    packssdw    m0, m1
-    mova    [dctq], m0
-%endif
-%else
-    movifnidn dct4x4q, dct4x4mp
-%if HIGH_BIT_DEPTH
-    movd   [dct4x4q+0*64], m0
-%if cpuflag(sse4)
-    pextrd [dct4x4q+1*64], m0, 1
-    add    dct4x4q, 4*64
-    pextrd [dct4x4q-2*64], m0, 2
-    pextrd [dct4x4q-1*64], m0, 3
-    movd   [dct4x4q+0*64], m1
-    pextrd [dct4x4q+1*64], m1, 1
-    pextrd [dct4x4q+2*64], m1, 2
-    pextrd [dct4x4q+3*64], m1, 3
-%else
-    MOVHL       m2, m0
-    psrlq       m0, 32
-    movd   [dct4x4q+1*64], m0
-    add    dct4x4q, 4*64
-    movd   [dct4x4q-2*64], m2
-    psrlq       m2, 32
-    movd   [dct4x4q-1*64], m2
-    movd   [dct4x4q+0*64], m1
-    MOVHL       m2, m1
-    psrlq       m1, 32
-    movd   [dct4x4q+1*64], m1
-    movd   [dct4x4q+2*64], m2
-    psrlq       m2, 32
-    movd   [dct4x4q+3*64], m2
-%endif
-%else
-    PEXTRW [dct4x4q+0*32], m0, 0, eax
-    PEXTRW [dct4x4q+1*32], m0, 2, eax
-    PEXTRW [dct4x4q+2*32], m0, 4, eax
-    PEXTRW [dct4x4q+3*32], m0, 6, eax
-    add    dct4x4q, 4*32
-    PEXTRW [dct4x4q+0*32], m1, 0, eax
-    PEXTRW [dct4x4q+1*32], m1, 2, eax
-    PEXTRW [dct4x4q+2*32], m1, 4, eax
-    PEXTRW [dct4x4q+3*32], m1, 6, eax
-%endif
-%endif
-    RET
-%endmacro
-
-; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
-INIT_XMM sse2
-DEQUANT_2x4_DC dc
-DEQUANT_2x4_DC dconly
-INIT_XMM avx
-DEQUANT_2x4_DC dc
-DEQUANT_2x4_DC dconly
-
-; t4 is eax for return value.
-%if ARCH_X86_64
-    DECLARE_REG_TMP 0,1,2,3,6,4  ; Identical for both Windows and *NIX
-%else
-    DECLARE_REG_TMP 4,1,2,3,0,5
-%endif
-
-;-----------------------------------------------------------------------------
-; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
-;-----------------------------------------------------------------------------
-
-%macro OPTIMIZE_CHROMA_2x2_DC 0
-cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7
-    movifnidn t0, r0mp
-    movd      m2, r1m
-    movq      m1, [t0]
-%if cpuflag(sse4)
-    pcmpeqb   m4, m4
-    pslld     m4, 11
-%else
-    pxor      m4, m4
-%endif
-%if cpuflag(ssse3)
-    mova      m3, [chroma_dc_dct_mask]
-    mova      m5, [chroma_dc_dmf_mask]
-%else
-    mova      m3, [chroma_dc_dct_mask_mmx]
-    mova      m5, [chroma_dc_dmf_mask_mmx]
-%endif
-    pshuflw   m2, m2, 0
-    pshufd    m0, m1, q0101      ;  1  0  3  2  1  0  3  2
-    punpcklqdq m2, m2
-    punpcklqdq m1, m1            ;  3  2  1  0  3  2  1  0
-    mova      m6, [pd_1024]      ; 32<<5, elements are shifted 5 bits to the left
-    PSIGNW    m0, m3             ; -1 -0  3  2 -1 -0  3  2
-    PSIGNW    m2, m5             ;  +  -  -  +  -  -  +  +
-    paddw     m0, m1             ; -1+3 -0+2  1+3  0+2 -1+3 -0+2  1+3  0+2
-    pmaddwd   m0, m2             ;  0-1-2+3  0-1+2-3  0+1-2-3  0+1+2+3  * dmf
-    punpcklwd m1, m1
-    psrad     m2, 16             ;  +  -  -  +
-    mov      t1d, 3
-    paddd     m0, m6
-    xor      t4d, t4d
-%if notcpuflag(ssse3)
-    psrad     m1, 31             ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
-%endif
-%if cpuflag(sse4)
-    ptest     m0, m4
-%else
-    mova      m6, m0
-    SWAP       0, 6
-    psrad     m6, 11
-    pcmpeqd   m6, m4
-    pmovmskb t5d, m6
-    cmp      t5d, 0xffff
-%endif
-    jz .ret                      ; if the DC coefficients already round to zero, terminate early
-    mova      m3, m0
-.outer_loop:
-    movsx    t3d, word [t0+2*t1] ; dct[coeff]
-    pshufd    m6, m1, q3333
-    pshufd    m1, m1, q2100      ; move the next element to high dword
-    PSIGND    m5, m2, m6
-    test     t3d, t3d
-    jz .loop_end
-.outer_loop_0:
-    mov      t2d, t3d
-    sar      t3d, 31
-    or       t3d, 1
-.inner_loop:
-    psubd     m3, m5             ; coeff -= sign
-    pxor      m6, m0, m3
-%if cpuflag(sse4)
-    ptest     m6, m4
-%else
-    psrad     m6, 11
-    pcmpeqd   m6, m4
-    pmovmskb t5d, m6
-    cmp      t5d, 0xffff
-%endif
-    jz .round_coeff
-    paddd     m3, m5             ; coeff += sign
-    mov      t4d, 1
-.loop_end:
-    dec      t1d
-    jz .last_coeff
-    pshufd    m2, m2, q1320      ;  -  +  -  +  /  -  -  +  +
-    jg .outer_loop
-.ret:
-    REP_RET
-.round_coeff:
-    sub      t2d, t3d
-    mov [t0+2*t1], t2w
-    jnz .inner_loop
-    jmp .loop_end
-.last_coeff:
-    movsx    t3d, word [t0]
-    punpcklqdq m2, m2            ;  +  +  +  +
-    PSIGND    m5, m2, m1
-    test     t3d, t3d
-    jnz .outer_loop_0
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM sse2
-OPTIMIZE_CHROMA_2x2_DC
-INIT_XMM ssse3
-OPTIMIZE_CHROMA_2x2_DC
-INIT_XMM sse4
-OPTIMIZE_CHROMA_2x2_DC
-INIT_XMM avx
-OPTIMIZE_CHROMA_2x2_DC
-%endif ; !HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
-;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,6
-    pxor      m5, m5
-    movsxdifnidn r3, r3d
-.loop:
-    mova      m2, [r0+r3*4-2*mmsize]
-    mova      m3, [r0+r3*4-1*mmsize]
-    ABSD      m0, m2
-    ABSD      m1, m3
-    paddd     m4, m0, [r1+r3*4-2*mmsize]
-    psubd     m0, [r2+r3*4-2*mmsize]
-    mova      [r1+r3*4-2*mmsize], m4
-    paddd     m4, m1, [r1+r3*4-1*mmsize]
-    psubd     m1, [r2+r3*4-1*mmsize]
-    mova      [r1+r3*4-1*mmsize], m4
-    pcmpgtd   m4, m0, m5
-    pand      m0, m4
-    pcmpgtd   m4, m1, m5
-    pand      m1, m4
-    PSIGND    m0, m2
-    PSIGND    m1, m3
-    mova      [r0+r3*4-2*mmsize], m0
-    mova      [r0+r3*4-1*mmsize], m1
-    sub      r3d, mmsize/2
-    jg .loop
-    RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx
-DENOISE_DCT
-%endif
-INIT_XMM sse2
-DENOISE_DCT
-INIT_XMM ssse3
-DENOISE_DCT
-INIT_XMM avx
-DENOISE_DCT
-INIT_YMM avx2
-DENOISE_DCT
-
-%else ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
-;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,7
-    pxor      m6, m6
-    movsxdifnidn r3, r3d
-.loop:
-    mova      m2, [r0+r3*2-2*mmsize]
-    mova      m3, [r0+r3*2-1*mmsize]
-    ABSW      m0, m2, sign
-    ABSW      m1, m3, sign
-    psubusw   m4, m0, [r2+r3*2-2*mmsize]
-    psubusw   m5, m1, [r2+r3*2-1*mmsize]
-    PSIGNW    m4, m2
-    PSIGNW    m5, m3
-    mova      [r0+r3*2-2*mmsize], m4
-    mova      [r0+r3*2-1*mmsize], m5
-    punpcklwd m2, m0, m6
-    punpcklwd m3, m1, m6
-    punpckhwd m0, m6
-    punpckhwd m1, m6
-    paddd     m2, [r1+r3*4-4*mmsize]
-    paddd     m0, [r1+r3*4-3*mmsize]
-    paddd     m3, [r1+r3*4-2*mmsize]
-    paddd     m1, [r1+r3*4-1*mmsize]
-    mova      [r1+r3*4-4*mmsize], m2
-    mova      [r1+r3*4-3*mmsize], m0
-    mova      [r1+r3*4-2*mmsize], m3
-    mova      [r1+r3*4-1*mmsize], m1
-    sub       r3, mmsize
-    jg .loop
-    RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx
-DENOISE_DCT
-%endif
-INIT_XMM sse2
-DENOISE_DCT
-INIT_XMM ssse3
-DENOISE_DCT
-INIT_XMM avx
-DENOISE_DCT
-
-INIT_YMM avx2
-cglobal denoise_dct, 4,4,4
-    pxor      m3, m3
-    movsxdifnidn r3, r3d
-.loop:
-    mova      m1, [r0+r3*2-mmsize]
-    pabsw     m0, m1
-    psubusw   m2, m0, [r2+r3*2-mmsize]
-    vpermq    m0, m0, q3120
-    psignw    m2, m1
-    mova [r0+r3*2-mmsize], m2
-    punpcklwd m1, m0, m3
-    punpckhwd m0, m3
-    paddd     m1, [r1+r3*4-2*mmsize]
-    paddd     m0, [r1+r3*4-1*mmsize]
-    mova      [r1+r3*4-2*mmsize], m1
-    mova      [r1+r3*4-1*mmsize], m0
-    sub       r3, mmsize/2
-    jg .loop
-    RET
-
-%endif ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; int decimate_score( dctcoef *dct )
-;-----------------------------------------------------------------------------
-
-%macro DECIMATE_MASK 5
-%if mmsize==16
-%if HIGH_BIT_DEPTH
-    movdqa    m0, [%3+ 0]
-    movdqa    m1, [%3+32]
-    packssdw  m0, [%3+16]
-    packssdw  m1, [%3+48]
-    ABSW2     m0, m1, m0, m1, m3, m4
-%else
-    ABSW      m0, [%3+ 0], m3
-    ABSW      m1, [%3+16], m4
-%endif
-    packsswb  m0, m1
-    pxor      m2, m2
-    pcmpeqb   m2, m0
-    pcmpgtb   m0, %4
-    pmovmskb  %1, m2
-    pmovmskb  %2, m0
-%else ; mmsize==8
-%if HIGH_BIT_DEPTH
-    movq      m0, [%3+ 0]
-    movq      m1, [%3+16]
-    movq      m2, [%3+32]
-    movq      m3, [%3+48]
-    packssdw  m0, [%3+ 8]
-    packssdw  m1, [%3+24]
-    packssdw  m2, [%3+40]
-    packssdw  m3, [%3+56]
-%else
-    movq      m0, [%3+ 0]
-    movq      m1, [%3+ 8]
-    movq      m2, [%3+16]
-    movq      m3, [%3+24]
-%endif
-    ABSW2     m0, m1, m0, m1, m6, m7
-    ABSW2     m2, m3, m2, m3, m6, m7
-    packsswb  m0, m1
-    packsswb  m2, m3
-    pxor      m4, m4
-    pxor      m6, m6
-    pcmpeqb   m4, m0
-    pcmpeqb   m6, m2
-    pcmpgtb   m0, %4
-    pcmpgtb   m2, %4
-    pmovmskb  %5, m4
-    pmovmskb  %1, m6
-    shl       %1, 8
-    or        %1, %5
-    pmovmskb  %5, m0
-    pmovmskb  %2, m2
-    shl       %2, 8
-    or        %2, %5
-%endif
-%endmacro
-
-cextern decimate_table4
-cextern decimate_table8
-
-%macro DECIMATE4x4 1
-
-cglobal decimate_score%1, 1,3
-%ifdef PIC
-    lea r4, [decimate_table4]
-    lea r5, [decimate_mask_table4]
-    %define table r4
-    %define mask_table r5
-%else
-    %define table decimate_table4
-    %define mask_table decimate_mask_table4
-%endif
-    DECIMATE_MASK edx, eax, r0, [pb_1], ecx
-    xor   edx, 0xffff
-    je   .ret
-    test  eax, eax
-    jne  .ret9
-%if %1==15
-    shr   edx, 1
-%endif
-    movzx ecx, dl
-    movzx eax, byte [mask_table + rcx]
-    cmp   edx, ecx
-    je   .ret
-    bsr   ecx, ecx
-    shr   edx, 1
-    shr   edx, cl
-    tzcnt ecx, edx
-    shr   edx, 1
-    shr   edx, cl
-    add    al, byte [table + rcx]
-    add    al, byte [mask_table + rdx]
-.ret:
-    REP_RET
-.ret9:
-    mov   eax, 9
-    RET
-
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DECIMATE4x4 15
-DECIMATE4x4 16
-%endif
-INIT_XMM sse2
-DECIMATE4x4 15
-DECIMATE4x4 16
-INIT_XMM ssse3
-DECIMATE4x4 15
-DECIMATE4x4 16
-
-; 2x gt1 output, 2x nz output, 1x mask
-%macro DECIMATE_MASK64_AVX2 5
-    pabsw     m0, [r0+ 0]
-    pabsw     m2, [r0+32]
-    pabsw     m1, [r0+64]
-    pabsw     m3, [r0+96]
-    packsswb  m0, m2
-    packsswb  m1, m3
-    pcmpgtb   m2, m0, %5    ; the > 1 checks don't care about order, so
-    pcmpgtb   m3, m1, %5    ; we can save latency by doing them here
-    pmovmskb  %1, m2
-    pmovmskb  %2, m3
-    or        %1, %2
-    jne .ret9
-    vpermq    m0, m0, q3120
-    vpermq    m1, m1, q3120
-    pxor      m4, m4
-    pcmpeqb   m0, m4
-    pcmpeqb   m1, m4
-    pmovmskb  %3, m0
-    pmovmskb  %4, m1
-%endmacro
-
-%macro DECIMATE8x8 0
-
-%if ARCH_X86_64
-cglobal decimate_score64, 1,5
-%ifdef PIC
-    lea r4, [decimate_table8]
-    %define table r4
-%else
-    %define table decimate_table8
-%endif
-    mova  m5, [pb_1]
-%if mmsize==32
-    DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
-    shl   r3, 32
-    or    r1, r3
-    xor   r1, -1
-    je  .ret
-%else
-    DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
-    test eax, eax
-    jne  .ret9
-    DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
-    shl  r2d, 16
-    or   r1d, r2d
-    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
-    shl   r2, 32
-    or   eax, r3d
-    or    r1, r2
-    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
-    shl   r2, 48
-    or    r1, r2
-    xor   r1, -1
-    je   .ret
-    add  eax, r3d
-    jne  .ret9
-%endif
-    mov   al, -6
-.loop:
-    tzcnt rcx, r1
-    shr   r1, cl
-    add   al, byte [table + rcx]
-    jge  .ret9
-    shr   r1, 1
-    jne  .loop
-    add   al, 6
-.ret:
-    REP_RET
-.ret9:
-    mov  eax, 9
-    RET
-
-%else ; ARCH
-%if mmsize == 8
-cglobal decimate_score64, 1,6
-%else
-cglobal decimate_score64, 1,5
-%endif
-    mova  m5, [pb_1]
-%if mmsize==32
-    DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
-    xor   r3, -1
-    je .tryret
-    xor   r4, -1
-.cont:
-%else
-    DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
-    test  r2, r2
-    jne  .ret9
-    DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
-    shl   r4, 16
-    or    r3, r4
-    DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
-    or    r2, r1
-    DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
-    shl   r1, 16
-    or    r4, r1
-    xor   r3, -1
-    je   .tryret
-    xor   r4, -1
-.cont:
-    add   r0, r2
-    jne  .ret9
-%endif
-    mov   al, -6
-.loop:
-    tzcnt ecx, r3
-    test  r3, r3
-    je   .largerun
-    shrd  r3, r4, cl
-    shr   r4, cl
-    add   al, byte [decimate_table8 + ecx]
-    jge  .ret9
-    shrd  r3, r4, 1
-    shr   r4, 1
-    test  r3, r3
-    jne  .loop
-    test  r4, r4
-    jne  .loop
-    add   al, 6
-.ret:
-    REP_RET
-.tryret:
-    xor   r4, -1
-    jne  .cont
-    RET
-.ret9:
-    mov   eax, 9
-    RET
-.largerun:
-    mov   r3, r4
-    xor   r4, r4
-    tzcnt ecx, r3
-    shr   r3, cl
-    shr   r3, 1
-    jne  .loop
-    add   al, 6
-    RET
-%endif ; ARCH
-
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DECIMATE8x8
-%endif
-INIT_XMM sse2
-DECIMATE8x8
-INIT_XMM ssse3
-DECIMATE8x8
-INIT_YMM avx2
-DECIMATE8x8
-
-;-----------------------------------------------------------------------------
-; int coeff_last( dctcoef *dct )
-;-----------------------------------------------------------------------------
-
-%macro BSR 3
-%if cpuflag(lzcnt)
-    lzcnt %1, %2
-    xor %1, %3
-%else
-    bsr %1, %2
-%endif
-%endmacro
-
-%macro LZCOUNT 3
-%if cpuflag(lzcnt)
-    lzcnt %1, %2
-%else
-    bsr %1, %2
-    xor %1, %3
-%endif
-%endmacro
-
-%if HIGH_BIT_DEPTH
-%macro LAST_MASK 3-4
-%if %1 == 4
-    movq     mm0, [%3]
-    packssdw mm0, [%3+8]
-    packsswb mm0, mm0
-    pcmpeqb  mm0, mm2
-    pmovmskb  %2, mm0
-%elif mmsize == 16
-    movdqa   xmm0, [%3+ 0]
-%if %1 == 8
-    packssdw xmm0, [%3+16]
-    packsswb xmm0, xmm0
-%else
-    movdqa   xmm1, [%3+32]
-    packssdw xmm0, [%3+16]
-    packssdw xmm1, [%3+48]
-    packsswb xmm0, xmm1
-%endif
-    pcmpeqb  xmm0, xmm2
-    pmovmskb   %2, xmm0
-%elif %1 == 8
-    movq     mm0, [%3+ 0]
-    movq     mm1, [%3+16]
-    packssdw mm0, [%3+ 8]
-    packssdw mm1, [%3+24]
-    packsswb mm0, mm1
-    pcmpeqb  mm0, mm2
-    pmovmskb  %2, mm0
-%else
-    movq     mm0, [%3+ 0]
-    movq     mm1, [%3+16]
-    packssdw mm0, [%3+ 8]
-    packssdw mm1, [%3+24]
-    movq     mm3, [%3+32]
-    movq     mm4, [%3+48]
-    packssdw mm3, [%3+40]
-    packssdw mm4, [%3+56]
-    packsswb mm0, mm1
-    packsswb mm3, mm4
-    pcmpeqb  mm0, mm2
-    pcmpeqb  mm3, mm2
-    pmovmskb  %2, mm0
-    pmovmskb  %4, mm3
-    shl       %4, 8
-    or        %2, %4
-%endif
-%endmacro
-
-%macro COEFF_LAST4 0
-cglobal coeff_last4, 1,3
-    pxor mm2, mm2
-    LAST_MASK 4, r1d, r0
-    xor  r1d, 0xff
-    shr  r1d, 4
-    BSR  eax, r1d, 0x1f
-    RET
-%endmacro
-
-INIT_MMX mmx2
-COEFF_LAST4
-INIT_MMX mmx2, lzcnt
-COEFF_LAST4
-
-%macro COEFF_LAST8 0
-cglobal coeff_last8, 1,3
-    pxor m2, m2
-    LAST_MASK 8, r1d, r0
-%if mmsize == 16
-    xor r1d, 0xffff
-    shr r1d, 8
-%else
-    xor r1d, 0xff
-%endif
-    BSR eax, r1d, 0x1f
-    RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-COEFF_LAST8
-%endif
-INIT_XMM sse2
-COEFF_LAST8
-INIT_XMM sse2, lzcnt
-COEFF_LAST8
-
-%else ; !HIGH_BIT_DEPTH
-%macro LAST_MASK 3-4
-%if %1 <= 8
-    movq     mm0, [%3+ 0]
-%if %1 == 4
-    packsswb mm0, mm0
-%else
-    packsswb mm0, [%3+ 8]
-%endif
-    pcmpeqb  mm0, mm2
-    pmovmskb  %2, mm0
-%elif mmsize == 16
-    movdqa   xmm0, [%3+ 0]
-    packsswb xmm0, [%3+16]
-    pcmpeqb  xmm0, xmm2
-    pmovmskb   %2, xmm0
-%else
-    movq     mm0, [%3+ 0]
-    movq     mm1, [%3+16]
-    packsswb mm0, [%3+ 8]
-    packsswb mm1, [%3+24]
-    pcmpeqb  mm0, mm2
-    pcmpeqb  mm1, mm2
-    pmovmskb  %2, mm0
-    pmovmskb  %4, mm1
-    shl       %4, 8
-    or        %2, %4
-%endif
-%endmacro
-
-%macro COEFF_LAST48 0
-%if ARCH_X86_64
-cglobal coeff_last4, 1,1
-    BSR  rax, [r0], 0x3f
-    shr  eax, 4
-    RET
-%else
-cglobal coeff_last4, 0,3
-    mov   edx, r0mp
-    mov   eax, [edx+4]
-    xor   ecx, ecx
-    test  eax, eax
-    cmovz eax, [edx]
-    setnz cl
-    BSR   eax, eax, 0x1f
-    shr   eax, 4
-    lea   eax, [eax+ecx*2]
-    RET
-%endif
-
-cglobal coeff_last8, 1,3
-    pxor m2, m2
-    LAST_MASK 8, r1d, r0, r2d
-    xor r1d, 0xff
-    BSR eax, r1d, 0x1f
-    RET
-%endmacro
-
-INIT_MMX mmx2
-COEFF_LAST48
-INIT_MMX mmx2, lzcnt
-COEFF_LAST48
-%endif ; HIGH_BIT_DEPTH
-
-%macro COEFF_LAST 0
-cglobal coeff_last15, 1,3
-    pxor m2, m2
-    LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
-    xor r1d, 0xffff
-    BSR eax, r1d, 0x1f
-    dec eax
-    RET
-
-cglobal coeff_last16, 1,3
-    pxor m2, m2
-    LAST_MASK 16, r1d, r0, r2d
-    xor r1d, 0xffff
-    BSR eax, r1d, 0x1f
-    RET
-
-%if ARCH_X86_64 == 0
-cglobal coeff_last64, 1, 4-mmsize/16
-    pxor m2, m2
-    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
-    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
-    shl r2d, 16
-    or  r1d, r2d
-    xor r1d, -1
-    jne .secondhalf
-    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
-    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
-    shl r2d, 16
-    or  r1d, r2d
-    not r1d
-    BSR eax, r1d, 0x1f
-    RET
-.secondhalf:
-    BSR eax, r1d, 0x1f
-    add eax, 32
-    RET
-%else
-cglobal coeff_last64, 1,3
-    pxor m2, m2
-    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
-    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
-    shl r2d, 16
-    or  r1d, r2d
-    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
-    LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
-    shl r0d, 16
-    or  r2d, r0d
-    shl  r2, 32
-    or   r1, r2
-    not  r1
-    BSR rax, r1, 0x3f
-    RET
-%endif
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-COEFF_LAST
-%endif
-INIT_XMM sse2
-COEFF_LAST
-INIT_XMM sse2, lzcnt
-COEFF_LAST
-
-%macro LAST_MASK_AVX2 2
-%if HIGH_BIT_DEPTH
-    mova     m0, [%2+ 0]
-    packssdw m0, [%2+32]
-    mova     m1, [%2+64]
-    packssdw m1, [%2+96]
-    packsswb m0, m1
-    mova     m1, [deinterleave_shufd]
-    vpermd   m0, m1, m0
-%else
-    mova     m0, [%2+ 0]
-    packsswb m0, [%2+32]
-    vpermq   m0, m0, q3120
-%endif
-    pcmpeqb  m0, m2
-    pmovmskb %1, m0
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_YMM avx2,lzcnt
-cglobal coeff_last64, 1,2
-    pxor m2, m2
-    LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
-    xor r1d, -1
-    jne .secondhalf
-    LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
-    not r1d
-    BSR eax, r1d, 0x1f
-    RET
-.secondhalf:
-    BSR eax, r1d, 0x1f
-    add eax, 32
-    RET
-%else
-INIT_YMM avx2,lzcnt
-cglobal coeff_last64, 1,3
-    pxor m2, m2
-    LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
-    LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
-    shl  r2, 32
-    or   r1, r2
-    not  r1
-    BSR rax, r1, 0x3f
-    RET
-%endif
-
-;-----------------------------------------------------------------------------
-; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
-;-----------------------------------------------------------------------------
-
-struc levelrun
-    .last: resd 1
-    .mask: resd 1
-    align 16, resb 1
-    .level: resw 16
-endstruc
-
-; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
-%if WIN64
-    DECLARE_REG_TMP 3,1,2,0,4,5,6
-%elif ARCH_X86_64
-    DECLARE_REG_TMP 0,1,2,3,4,5,6
-%else
-    DECLARE_REG_TMP 6,3,2,1,4,5,0
-%endif
-
-%macro COEFF_LEVELRUN 1
-cglobal coeff_level_run%1,0,7
-    movifnidn t0, r0mp
-    movifnidn t1, r1mp
-    pxor    m2, m2
-    xor    t3d, t3d
-    LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
-%if %1==15
-    shr    t5d, 1
-%elif %1==8
-    and    t5d, 0xff
-%elif %1==4
-    and    t5d, 0xf
-%endif
-    xor    t5d, (1<<%1)-1
-    mov [t1+levelrun.mask], t5d
-    shl    t5d, 32-%1
-    mov    t4d, %1-1
-    LZCOUNT t3d, t5d, 0x1f
-    xor    t6d, t6d
-    add    t5d, t5d
-    sub    t4d, t3d
-    shl    t5d, t3b
-    mov [t1+levelrun.last], t4d
-.loop:
-    LZCOUNT t3d, t5d, 0x1f
-%if HIGH_BIT_DEPTH
-    mov    t2d, [t0+t4*4]
-%else
-    mov    t2w, [t0+t4*2]
-%endif
-    inc    t3d
-    shl    t5d, t3b
-%if HIGH_BIT_DEPTH
-    mov   [t1+t6*4+levelrun.level], t2d
-%else
-    mov   [t1+t6*2+levelrun.level], t2w
-%endif
-    inc    t6d
-    sub    t4d, t3d
-    jge .loop
-    RET
-%endmacro
-
-INIT_MMX mmx2
-%if ARCH_X86_64 == 0
-COEFF_LEVELRUN 15
-COEFF_LEVELRUN 16
-%endif
-COEFF_LEVELRUN 4
-COEFF_LEVELRUN 8
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-COEFF_LEVELRUN 8
-%endif
-COEFF_LEVELRUN 15
-COEFF_LEVELRUN 16
-INIT_XMM sse2, lzcnt
-%if HIGH_BIT_DEPTH
-COEFF_LEVELRUN 8
-%endif
-COEFF_LEVELRUN 15
-COEFF_LEVELRUN 16
-INIT_MMX mmx2, lzcnt
-COEFF_LEVELRUN 4
-COEFF_LEVELRUN 8
-
-; Similar to the one above, but saves the DCT
-; coefficients in m0/m1 so we don't have to load
-; them later.
-%macro LAST_MASK_LUT 3
-    pxor     xm5, xm5
-%if %1 <= 8
-    mova      m0, [%3]
-    packsswb  m2, m0, m0
-%else
-    mova     xm0, [%3+ 0]
-    mova     xm1, [%3+16]
-    packsswb xm2, xm0, xm1
-%if mmsize==32
-    vinserti128 m0, m0, xm1, 1
-%endif
-%endif
-    pcmpeqb  xm2, xm5
-    pmovmskb  %2, xm2
-%endmacro
-
-%macro COEFF_LEVELRUN_LUT 1
-cglobal coeff_level_run%1,2,4+(%1/9)
-%ifdef PIC
-    lea       r5, [$$]
-    %define GLOBAL +r5-$$
-%else
-    %define GLOBAL
-%endif
-    LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
-%if %1==15
-    shr     eax, 1
-%elif %1==8
-    and     eax, 0xff
-%elif %1==4
-    and     eax, 0xf
-%endif
-    xor     eax, (1<<%1)-1
-    mov [r1+levelrun.mask], eax
-%if %1==15
-    add     eax, eax
-%endif
-%if %1 > 8
-%if ARCH_X86_64
-    mov     r4d, eax
-    shr     r4d, 8
-%else
-    movzx   r4d, ah ; first 8 bits
-%endif
-%endif
-    movzx   r2d, al ; second 8 bits
-    shl     eax, 32-%1-(%1&1)
-    LZCOUNT eax, eax, 0x1f
-    mov     r3d, %1-1
-    sub     r3d, eax
-    mov [r1+levelrun.last], r3d
-; Here we abuse pshufb, combined with a lookup table, to do a gather
-; operation based on a bitmask. For example:
-;
-; dct 15-8 (input): 0  0  4  0  0 -2  1  0
-; dct  7-0 (input): 0  0 -1  0  0  0  0 15
-; bitmask 1:        0  0  1  0  0  1  1  0
-; bitmask 2:        0  0  1  0  0  0  0  1
-; gather 15-8:      4 -2  1 __ __ __ __ __
-; gather  7-0:     -1 15 __ __ __ __ __ __
-; levels (output):  4 -2  1 -1 15 __ __ __ __ __ __ __ __ __ __ __
-;
-; The overlapping, dependent stores almost surely cause a mess of
-; forwarding issues, but it's still enormously faster.
-%if %1 > 8
-    movzx   eax, byte [popcnt_table+r4 GLOBAL]
-    movzx   r3d, byte [popcnt_table+r2 GLOBAL]
-%if mmsize==16
-    movh      m3, [dct_coef_shuffle+r4*8 GLOBAL]
-    movh      m2, [dct_coef_shuffle+r2*8 GLOBAL]
-    mova      m4, [pw_256]
-; Storing 8 bytes of shuffle constant and converting it (unpack + or)
-; is neutral to slightly faster in local speed measurements, but it
-; cuts the table size in half, which is surely a big cache win.
-    punpcklbw m3, m3
-    punpcklbw m2, m2
-    por       m3, m4
-    por       m2, m4
-    pshufb    m1, m3
-    pshufb    m0, m2
-    mova [r1+levelrun.level], m1
-; This obnoxious unaligned store messes with store forwarding and
-; stalls the CPU to no end, but merging the two registers before
-; storing requires a variable 128-bit shift. Emulating this does
-; work, but requires a lot of ops and the gain is tiny and
-; inconsistent, so we'll err on the side of fewer instructions.
-    movu [r1+rax*2+levelrun.level], m0
-%else ; mmsize==32
-    movq     xm2, [dct_coef_shuffle+r4*8 GLOBAL]
-    vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
-    punpcklbw m2, m2
-    por       m2, [pw_256]
-    pshufb    m0, m2
-    vextracti128 [r1+levelrun.level], m0, 1
-    movu [r1+rax*2+levelrun.level], xm0
-%endif
-    add     eax, r3d
-%else
-    movzx   eax, byte [popcnt_table+r2 GLOBAL]
-    movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
-    punpcklbw m1, m1
-    por       m1, [pw_256]
-    pshufb    m0, m1
-    mova [r1+levelrun.level], m0
-%endif
-    RET
-%endmacro
-
-%if HIGH_BIT_DEPTH==0
-INIT_MMX ssse3
-COEFF_LEVELRUN_LUT 4
-INIT_XMM ssse3
-COEFF_LEVELRUN_LUT 8
-COEFF_LEVELRUN_LUT 15
-COEFF_LEVELRUN_LUT 16
-INIT_MMX ssse3, lzcnt
-COEFF_LEVELRUN_LUT 4
-INIT_XMM ssse3, lzcnt
-COEFF_LEVELRUN_LUT 8
-COEFF_LEVELRUN_LUT 15
-COEFF_LEVELRUN_LUT 16
-INIT_XMM avx2, lzcnt
-COEFF_LEVELRUN_LUT 15
-COEFF_LEVELRUN_LUT 16
-%endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/quant.h b/android/src/main/libenc/jni/libx264/common/x86/quant.h
deleted file mode 100755
index c8c4c86..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/quant.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*****************************************************************************
- * quant.h: x86 quantization and level-run
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Christian Heine <sennindemokrit@gmx.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_I386_QUANT_H
-#define X264_I386_QUANT_H
-
-int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
-int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_mmx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_8x8_mmx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
-int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias );
-int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
-void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
-void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
-void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
-void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
-int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf );
-void x264_denoise_dct_mmx  ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-void x264_denoise_dct_avx  ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-int x264_decimate_score15_mmx2( dctcoef *dct );
-int x264_decimate_score15_sse2( dctcoef *dct );
-int x264_decimate_score15_ssse3( dctcoef *dct );
-int x264_decimate_score16_mmx2( dctcoef *dct );
-int x264_decimate_score16_sse2( dctcoef *dct );
-int x264_decimate_score16_ssse3( dctcoef *dct );
-int x264_decimate_score64_mmx2( dctcoef *dct );
-int x264_decimate_score64_sse2( dctcoef *dct );
-int x264_decimate_score64_ssse3( dctcoef *dct );
-int x264_decimate_score64_avx2( int16_t *dct );
-int x264_coeff_last4_mmx2( dctcoef *dct );
-int x264_coeff_last8_mmx2( dctcoef *dct );
-int x264_coeff_last15_mmx2( dctcoef *dct );
-int x264_coeff_last16_mmx2( dctcoef *dct );
-int x264_coeff_last64_mmx2( dctcoef *dct );
-int x264_coeff_last8_sse2( dctcoef *dct );
-int x264_coeff_last15_sse2( dctcoef *dct );
-int x264_coeff_last16_sse2( dctcoef *dct );
-int x264_coeff_last64_sse2( dctcoef *dct );
-int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
-int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
-int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
-int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
-int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
-int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
-int x264_trellis_cabac_8x8_ssse3( TRELLIS_PARAMS, int b_interlaced );
-int x264_trellis_cabac_4x4_psy_sse2 ( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
-int x264_trellis_cabac_4x4_psy_ssse3( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
-int x264_trellis_cabac_8x8_psy_sse2 ( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
-int x264_trellis_cabac_8x8_psy_ssse3( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
-int x264_trellis_cabac_dc_sse2 ( TRELLIS_PARAMS, int i_coefs );
-int x264_trellis_cabac_dc_ssse3( TRELLIS_PARAMS, int i_coefs );
-int x264_trellis_cabac_chroma_422_dc_sse2 ( TRELLIS_PARAMS );
-int x264_trellis_cabac_chroma_422_dc_ssse3( TRELLIS_PARAMS );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/sad-a.asm b/android/src/main/libenc/jni/libx264/common/x86/sad-a.asm
deleted file mode 100755
index 7732b22..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/sad-a.asm
+++ /dev/null
@@ -1,1927 +0,0 @@
-;*****************************************************************************
-;* sad-a.asm: x86 sad functions
-;*****************************************************************************
-;* Copyright (C) 2003-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Laurent Aimar <fenrir@via.ecp.fr>
-;*          Alex Izvorski <aizvorksi@gmail.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
-
-SECTION .text
-
-cextern pb_3
-cextern pb_shuf8x8c
-cextern pw_8
-cextern sw_64
-
-;=============================================================================
-; SAD MMX
-;=============================================================================
-
-%macro SAD_INC_2x16P 0
-    movq    mm1,    [r0]
-    movq    mm2,    [r0+8]
-    movq    mm3,    [r0+r1]
-    movq    mm4,    [r0+r1+8]
-    psadbw  mm1,    [r2]
-    psadbw  mm2,    [r2+8]
-    psadbw  mm3,    [r2+r3]
-    psadbw  mm4,    [r2+r3+8]
-    lea     r0,     [r0+2*r1]
-    paddw   mm1,    mm2
-    paddw   mm3,    mm4
-    lea     r2,     [r2+2*r3]
-    paddw   mm0,    mm1
-    paddw   mm0,    mm3
-%endmacro
-
-%macro SAD_INC_2x8P 0
-    movq    mm1,    [r0]
-    movq    mm2,    [r0+r1]
-    psadbw  mm1,    [r2]
-    psadbw  mm2,    [r2+r3]
-    lea     r0,     [r0+2*r1]
-    paddw   mm0,    mm1
-    paddw   mm0,    mm2
-    lea     r2,     [r2+2*r3]
-%endmacro
-
-%macro SAD_INC_2x4P 0
-    movd    mm1,    [r0]
-    movd    mm2,    [r2]
-    punpckldq mm1,  [r0+r1]
-    punpckldq mm2,  [r2+r3]
-    psadbw  mm1,    mm2
-    paddw   mm0,    mm1
-    lea     r0,     [r0+2*r1]
-    lea     r2,     [r2+2*r3]
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-%macro SAD 2
-cglobal pixel_sad_%1x%2_mmx2, 4,4
-    pxor    mm0, mm0
-%rep %2/2
-    SAD_INC_2x%1P
-%endrep
-    movd    eax, mm0
-    RET
-%endmacro
-
-SAD 16, 16
-SAD 16,  8
-SAD  8, 16
-SAD  8,  8
-SAD  8,  4
-SAD  4, 16
-SAD  4,  8
-SAD  4,  4
-
-
-
-;=============================================================================
-; SAD XMM
-;=============================================================================
-
-%macro SAD_END_SSE2 0
-    MOVHL   m1, m0
-    paddw   m0, m1
-    movd   eax, m0
-    RET
-%endmacro
-
-%macro SAD_W16 0
-;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x16, 4,4,8
-    movu    m0, [r2]
-    movu    m1, [r2+r3]
-    lea     r2, [r2+2*r3]
-    movu    m2, [r2]
-    movu    m3, [r2+r3]
-    lea     r2, [r2+2*r3]
-    psadbw  m0, [r0]
-    psadbw  m1, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movu    m4, [r2]
-    paddw   m0, m1
-    psadbw  m2, [r0]
-    psadbw  m3, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movu    m5, [r2+r3]
-    lea     r2, [r2+2*r3]
-    paddw   m2, m3
-    movu    m6, [r2]
-    movu    m7, [r2+r3]
-    lea     r2, [r2+2*r3]
-    paddw   m0, m2
-    psadbw  m4, [r0]
-    psadbw  m5, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movu    m1, [r2]
-    paddw   m4, m5
-    psadbw  m6, [r0]
-    psadbw  m7, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movu    m2, [r2+r3]
-    lea     r2, [r2+2*r3]
-    paddw   m6, m7
-    movu    m3, [r2]
-    paddw   m0, m4
-    movu    m4, [r2+r3]
-    lea     r2, [r2+2*r3]
-    paddw   m0, m6
-    psadbw  m1, [r0]
-    psadbw  m2, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movu    m5, [r2]
-    paddw   m1, m2
-    psadbw  m3, [r0]
-    psadbw  m4, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movu    m6, [r2+r3]
-    lea     r2, [r2+2*r3]
-    paddw   m3, m4
-    movu    m7, [r2]
-    paddw   m0, m1
-    movu    m1, [r2+r3]
-    paddw   m0, m3
-    psadbw  m5, [r0]
-    psadbw  m6, [r0+r1]
-    lea     r0, [r0+2*r1]
-    paddw   m5, m6
-    psadbw  m7, [r0]
-    psadbw  m1, [r0+r1]
-    paddw   m7, m1
-    paddw   m0, m5
-    paddw   m0, m7
-    SAD_END_SSE2
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x8, 4,4
-    movu    m0, [r2]
-    movu    m2, [r2+r3]
-    lea     r2, [r2+2*r3]
-    movu    m3, [r2]
-    movu    m4, [r2+r3]
-    psadbw  m0, [r0]
-    psadbw  m2, [r0+r1]
-    lea     r0, [r0+2*r1]
-    psadbw  m3, [r0]
-    psadbw  m4, [r0+r1]
-    lea     r0, [r0+2*r1]
-    lea     r2, [r2+2*r3]
-    paddw   m0, m2
-    paddw   m3, m4
-    paddw   m0, m3
-    movu    m1, [r2]
-    movu    m2, [r2+r3]
-    lea     r2, [r2+2*r3]
-    movu    m3, [r2]
-    movu    m4, [r2+r3]
-    psadbw  m1, [r0]
-    psadbw  m2, [r0+r1]
-    lea     r0, [r0+2*r1]
-    psadbw  m3, [r0]
-    psadbw  m4, [r0+r1]
-    lea     r0, [r0+2*r1]
-    lea     r2, [r2+2*r3]
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
-    SAD_END_SSE2
-%endmacro
-
-INIT_XMM sse2
-SAD_W16
-INIT_XMM sse3
-SAD_W16
-INIT_XMM sse2, aligned
-SAD_W16
-
-%macro SAD_INC_4x8P_SSE 1
-    movq    m1, [r0]
-    movq    m2, [r0+r1]
-    lea     r0, [r0+2*r1]
-    movq    m3, [r2]
-    movq    m4, [r2+r3]
-    lea     r2, [r2+2*r3]
-    movhps  m1, [r0]
-    movhps  m2, [r0+r1]
-    movhps  m3, [r2]
-    movhps  m4, [r2+r3]
-    lea     r0, [r0+2*r1]
-    psadbw  m1, m3
-    psadbw  m2, m4
-    lea     r2, [r2+2*r3]
-    ACCUM paddw, 0, 1, %1
-    paddw   m0, m2
-%endmacro
-
-INIT_XMM
-;Even on Nehalem, no sizes other than 8x16 benefit from this method.
-cglobal pixel_sad_8x16_sse2, 4,4
-    SAD_INC_4x8P_SSE 0
-    SAD_INC_4x8P_SSE 1
-    SAD_INC_4x8P_SSE 1
-    SAD_INC_4x8P_SSE 1
-    SAD_END_SSE2
-    RET
-
-;-----------------------------------------------------------------------------
-; void pixel_vsad( pixel *src, intptr_t stride );
-;-----------------------------------------------------------------------------
-
-%if ARCH_X86_64 == 0
-INIT_MMX
-cglobal pixel_vsad_mmx2, 3,3
-    mova      m0, [r0]
-    mova      m1, [r0+8]
-    mova      m2, [r0+r1]
-    mova      m3, [r0+r1+8]
-    lea       r0, [r0+r1*2]
-    psadbw    m0, m2
-    psadbw    m1, m3
-    paddw     m0, m1
-    sub      r2d, 2
-    je .end
-.loop:
-    mova      m4, [r0]
-    mova      m5, [r0+8]
-    mova      m6, [r0+r1]
-    mova      m7, [r0+r1+8]
-    lea       r0, [r0+r1*2]
-    psadbw    m2, m4
-    psadbw    m3, m5
-    psadbw    m4, m6
-    psadbw    m5, m7
-    ;max sum: 31*16*255(pixel_max)=126480
-    paddd     m0, m2
-    paddd     m0, m3
-    paddd     m0, m4
-    paddd     m0, m5
-    mova      m2, m6
-    mova      m3, m7
-    sub      r2d, 2
-    jg .loop
-.end:
-    movd     eax, m0
-    RET
-%endif
-
-INIT_XMM
-cglobal pixel_vsad_sse2, 3,3
-    mova      m0, [r0]
-    mova      m1, [r0+r1]
-    lea       r0, [r0+r1*2]
-    psadbw    m0, m1
-    sub      r2d, 2
-    je .end
-.loop:
-    mova      m2, [r0]
-    mova      m3, [r0+r1]
-    lea       r0, [r0+r1*2]
-    psadbw    m1, m2
-    psadbw    m2, m3
-    paddw     m0, m1
-    paddw     m0, m2
-    mova      m1, m3
-    sub      r2d, 2
-    jg .loop
-.end:
-    MOVHL     m1, m0
-    ;max sum: 31*16*255(pixel_max)=126480
-    paddd     m0, m1
-    movd     eax, m0
-    RET
-
-;-----------------------------------------------------------------------------
-; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
-;-----------------------------------------------------------------------------
-
-cglobal intra_sad_x3_4x4_mmx2, 3,3
-    pxor      mm7, mm7
-    movd      mm0, [r1-FDEC_STRIDE]
-    movd      mm1, [r0+FENC_STRIDE*0]
-    movd      mm2, [r0+FENC_STRIDE*2]
-    punpckldq mm0, mm0
-    punpckldq mm1, [r0+FENC_STRIDE*1]
-    punpckldq mm2, [r0+FENC_STRIDE*3]
-    movq      mm6, mm0
-    movq      mm3, mm1
-    psadbw    mm3, mm0
-    psadbw    mm0, mm2
-    paddw     mm0, mm3
-    movd     [r2], mm0 ;V prediction cost
-    movd      mm3, [r1+FDEC_STRIDE*0-4]
-    movd      mm0, [r1+FDEC_STRIDE*1-4]
-    movd      mm4, [r1+FDEC_STRIDE*2-4]
-    movd      mm5, [r1+FDEC_STRIDE*3-4]
-    punpcklbw mm3, mm0
-    punpcklbw mm4, mm5
-    movq      mm5, mm3
-    punpckhwd mm5, mm4
-    punpckhdq mm5, mm6
-    psadbw    mm5, mm7
-    punpckhbw mm3, mm3
-    punpckhbw mm4, mm4
-    punpckhwd mm3, mm3
-    punpckhwd mm4, mm4
-    psraw     mm5, 2
-    pavgw     mm5, mm7
-    punpcklbw mm5, mm5
-    pshufw    mm5, mm5, 0 ;DC prediction
-    movq      mm6, mm5
-    psadbw    mm5, mm1
-    psadbw    mm6, mm2
-    psadbw    mm1, mm3
-    psadbw    mm2, mm4
-    paddw     mm5, mm6
-    paddw     mm1, mm2
-    movd   [r2+8], mm5 ;DC prediction cost
-    movd   [r2+4], mm1 ;H prediction cost
-    RET
-
-;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
-;-----------------------------------------------------------------------------
-
-;m0 = DC
-;m6 = V
-;m7 = H
-;m1 = DC score
-;m2 = V score
-;m3 = H score
-;m5 = pixel row
-;m4 = temp
-
-%macro INTRA_SAD_HVDC_ITER 2
-    movq      m5, [r0+FENC_STRIDE*%1]
-    movq      m4, m5
-    psadbw    m4, m0
-    ACCUM  paddw, 1, 4, %1
-    movq      m4, m5
-    psadbw    m4, m6
-    ACCUM  paddw, 2, 4, %1
-    pshufw    m4, m7, %2
-    psadbw    m5, m4
-    ACCUM  paddw, 3, 5, %1
-%endmacro
-
-INIT_MMX
-cglobal intra_sad_x3_8x8_mmx2, 3,3
-    movq      m7, [r1+7]
-    pxor      m0, m0
-    movq      m6, [r1+16]  ;V prediction
-    pxor      m1, m1
-    psadbw    m0, m7
-    psadbw    m1, m6
-    paddw     m0, m1
-    paddw     m0, [pw_8]
-    psrlw     m0, 4
-    punpcklbw m0, m0
-    pshufw    m0, m0, q0000 ;DC prediction
-    punpckhbw m7, m7
-    INTRA_SAD_HVDC_ITER 0, q3333
-    INTRA_SAD_HVDC_ITER 1, q2222
-    INTRA_SAD_HVDC_ITER 2, q1111
-    INTRA_SAD_HVDC_ITER 3, q0000
-    movq      m7, [r1+7]
-    punpcklbw m7, m7
-    INTRA_SAD_HVDC_ITER 4, q3333
-    INTRA_SAD_HVDC_ITER 5, q2222
-    INTRA_SAD_HVDC_ITER 6, q1111
-    INTRA_SAD_HVDC_ITER 7, q0000
-    movd  [r2+0], m2
-    movd  [r2+4], m3
-    movd  [r2+8], m1
-    RET
-
-;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
-;-----------------------------------------------------------------------------
-
-%macro INTRA_SAD_HV_ITER 1
-%if cpuflag(ssse3)
-    movd        m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
-    movd        m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
-    pshufb      m1, m7
-    pshufb      m3, m7
-%else
-    movq        m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
-    movq        m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
-    punpckhbw   m1, m1
-    punpckhbw   m3, m3
-    pshufw      m1, m1, q3333
-    pshufw      m3, m3, q3333
-%endif
-    movq        m4, [r0 + FENC_STRIDE*(%1+0)]
-    movq        m5, [r0 + FENC_STRIDE*(%1+1)]
-    psadbw      m1, m4
-    psadbw      m3, m5
-    psadbw      m4, m6
-    psadbw      m5, m6
-    paddw       m1, m3
-    paddw       m4, m5
-    ACCUM    paddw, 0, 1, %1
-    ACCUM    paddw, 2, 4, %1
-%endmacro
-
-%macro INTRA_SAD_8x8C 0
-cglobal intra_sad_x3_8x8c, 3,3
-    movq        m6, [r1 - FDEC_STRIDE]
-    add         r1, FDEC_STRIDE*4
-%if cpuflag(ssse3)
-    movq        m7, [pb_3]
-%endif
-    INTRA_SAD_HV_ITER 0
-    INTRA_SAD_HV_ITER 2
-    INTRA_SAD_HV_ITER 4
-    INTRA_SAD_HV_ITER 6
-    movd    [r2+4], m0
-    movd    [r2+8], m2
-    pxor        m7, m7
-    movq        m2, [r1 + FDEC_STRIDE*-4 - 8]
-    movq        m4, [r1 + FDEC_STRIDE*-2 - 8]
-    movq        m3, [r1 + FDEC_STRIDE* 0 - 8]
-    movq        m5, [r1 + FDEC_STRIDE* 2 - 8]
-    punpckhbw   m2, [r1 + FDEC_STRIDE*-3 - 8]
-    punpckhbw   m4, [r1 + FDEC_STRIDE*-1 - 8]
-    punpckhbw   m3, [r1 + FDEC_STRIDE* 1 - 8]
-    punpckhbw   m5, [r1 + FDEC_STRIDE* 3 - 8]
-    punpckhbw   m2, m4
-    punpckhbw   m3, m5
-    psrlq       m2, 32
-    psrlq       m3, 32
-    psadbw      m2, m7 ; s2
-    psadbw      m3, m7 ; s3
-    movq        m1, m6
-    SWAP        0, 6
-    punpckldq   m0, m7
-    punpckhdq   m1, m7
-    psadbw      m0, m7 ; s0
-    psadbw      m1, m7 ; s1
-    punpcklwd   m0, m1
-    punpcklwd   m2, m3
-    punpckldq   m0, m2 ;s0 s1 s2 s3
-    pshufw      m3, m0, q3312 ;s2,s1,s3,s3
-    pshufw      m0, m0, q1310 ;s0,s1,s3,s1
-    paddw       m0, m3
-    psrlw       m0, 2
-    pavgw       m0, m7 ; s0+s2, s1, s3, s1+s3
-%if cpuflag(ssse3)
-    movq2dq   xmm0, m0
-    pshufb    xmm0, [pb_shuf8x8c]
-    movq      xmm1, [r0+FENC_STRIDE*0]
-    movq      xmm2, [r0+FENC_STRIDE*1]
-    movq      xmm3, [r0+FENC_STRIDE*2]
-    movq      xmm4, [r0+FENC_STRIDE*3]
-    movhps    xmm1, [r0+FENC_STRIDE*4]
-    movhps    xmm2, [r0+FENC_STRIDE*5]
-    movhps    xmm3, [r0+FENC_STRIDE*6]
-    movhps    xmm4, [r0+FENC_STRIDE*7]
-    psadbw    xmm1, xmm0
-    psadbw    xmm2, xmm0
-    psadbw    xmm3, xmm0
-    psadbw    xmm4, xmm0
-    paddw     xmm1, xmm2
-    paddw     xmm1, xmm3
-    paddw     xmm1, xmm4
-    MOVHL     xmm0, xmm1
-    paddw     xmm1, xmm0
-    movd      [r2], xmm1
-%else
-    packuswb    m0, m0
-    punpcklbw   m0, m0
-    movq        m1, m0
-    punpcklbw   m0, m0 ; 4x dc0 4x dc1
-    punpckhbw   m1, m1 ; 4x dc2 4x dc3
-    movq        m2, [r0+FENC_STRIDE*0]
-    movq        m3, [r0+FENC_STRIDE*1]
-    movq        m4, [r0+FENC_STRIDE*2]
-    movq        m5, [r0+FENC_STRIDE*3]
-    movq        m6, [r0+FENC_STRIDE*4]
-    movq        m7, [r0+FENC_STRIDE*5]
-    psadbw      m2, m0
-    psadbw      m3, m0
-    psadbw      m4, m0
-    psadbw      m5, m0
-    movq        m0, [r0+FENC_STRIDE*6]
-    psadbw      m6, m1
-    psadbw      m7, m1
-    psadbw      m0, m1
-    psadbw      m1, [r0+FENC_STRIDE*7]
-    paddw       m2, m3
-    paddw       m4, m5
-    paddw       m6, m7
-    paddw       m0, m1
-    paddw       m2, m4
-    paddw       m6, m0
-    paddw       m2, m6
-    movd      [r2], m2
-%endif
-    RET
-%endmacro
-
-INIT_MMX mmx2
-INTRA_SAD_8x8C
-INIT_MMX ssse3
-INTRA_SAD_8x8C
-
-INIT_YMM avx2
-cglobal intra_sad_x3_8x8c, 3,3,7
-    vpbroadcastq m2, [r1 - FDEC_STRIDE]         ; V pred
-    add          r1, FDEC_STRIDE*4-1
-    pxor        xm5, xm5
-    punpckldq   xm3, xm2, xm5                   ; V0 _ V1 _
-    movd        xm0, [r1 + FDEC_STRIDE*-1 - 3]
-    movd        xm1, [r1 + FDEC_STRIDE* 3 - 3]
-    pinsrb      xm0, [r1 + FDEC_STRIDE*-4], 0
-    pinsrb      xm1, [r1 + FDEC_STRIDE* 0], 0
-    pinsrb      xm0, [r1 + FDEC_STRIDE*-3], 1
-    pinsrb      xm1, [r1 + FDEC_STRIDE* 1], 1
-    pinsrb      xm0, [r1 + FDEC_STRIDE*-2], 2
-    pinsrb      xm1, [r1 + FDEC_STRIDE* 2], 2
-    punpcklqdq  xm0, xm1                        ; H0 _ H1 _
-    vinserti128  m3, m3, xm0, 1                 ; V0 V1 H0 H1
-    pshufb      xm0, [hpred_shuf]               ; H00224466 H11335577
-    psadbw       m3, m5                         ; s0 s1 s2 s3
-    vpermq       m4, m3, q3312                  ; s2 s1 s3 s3
-    vpermq       m3, m3, q1310                  ; s0 s1 s3 s1
-    paddw        m3, m4
-    psrlw        m3, 2
-    pavgw        m3, m5                         ; s0+s2 s1 s3 s1+s3
-    pshufb       m3, [pb_shuf8x8c2]             ; DC0 _ DC1 _
-    vpblendd     m3, m3, m2, 11001100b          ; DC0 V DC1 V
-    vinserti128  m1, m3, xm3, 1                 ; DC0 V DC0 V
-    vperm2i128   m6, m3, m3, q0101              ; DC1 V DC1 V
-    vpermq       m0, m0, q3120                  ; H00224466 _ H11335577 _
-    movddup      m2, [r0+FENC_STRIDE*0]
-    movddup      m4, [r0+FENC_STRIDE*2]
-    pshuflw      m3, m0, q0000
-    psadbw       m3, m2
-    psadbw       m2, m1
-    pshuflw      m5, m0, q1111
-    psadbw       m5, m4
-    psadbw       m4, m1
-    paddw        m2, m4
-    paddw        m3, m5
-    movddup      m4, [r0+FENC_STRIDE*4]
-    pshuflw      m5, m0, q2222
-    psadbw       m5, m4
-    psadbw       m4, m6
-    paddw        m2, m4
-    paddw        m3, m5
-    movddup      m4, [r0+FENC_STRIDE*6]
-    pshuflw      m5, m0, q3333
-    psadbw       m5, m4
-    psadbw       m4, m6
-    paddw        m2, m4
-    paddw        m3, m5
-    vextracti128 xm0, m2, 1
-    vextracti128 xm1, m3, 1
-    paddw       xm2, xm0 ; DC V
-    paddw       xm3, xm1 ; H
-    pextrd   [r2+8], xm2, 2 ; V
-    movd     [r2+4], xm3    ; H
-    movd     [r2+0], xm2    ; DC
-    RET
-
-
-;-----------------------------------------------------------------------------
-; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
-;-----------------------------------------------------------------------------
-
-;xmm7: DC prediction    xmm6: H prediction  xmm5: V prediction
-;xmm4: DC pred score    xmm3: H pred score  xmm2: V pred score
-%macro INTRA_SAD16 0
-cglobal intra_sad_x3_16x16, 3,5,8
-    pxor    mm0, mm0
-    pxor    mm1, mm1
-    psadbw  mm0, [r1-FDEC_STRIDE+0]
-    psadbw  mm1, [r1-FDEC_STRIDE+8]
-    paddw   mm0, mm1
-    movd    r3d, mm0
-%if cpuflag(ssse3)
-    mova  m1, [pb_3]
-%endif
-%assign x 0
-%rep 16
-    movzx   r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
-%if (x&3)==3 && x!=15
-    add      r1, FDEC_STRIDE*4
-%endif
-    add     r3d, r4d
-%assign x x+1
-%endrep
-    sub      r1, FDEC_STRIDE*12
-    add     r3d, 16
-    shr     r3d, 5
-    imul    r3d, 0x01010101
-    movd    m7, r3d
-    mova    m5, [r1-FDEC_STRIDE]
-%if mmsize==16
-    pshufd  m7, m7, 0
-%else
-    mova    m1, [r1-FDEC_STRIDE+8]
-    punpckldq m7, m7
-%endif
-    pxor    m4, m4
-    pxor    m3, m3
-    pxor    m2, m2
-    mov     r3d, 15*FENC_STRIDE
-.vloop:
-    SPLATB_LOAD m6, r1+r3*2-1, m1
-    mova    m0, [r0+r3]
-    psadbw  m0, m7
-    paddw   m4, m0
-    mova    m0, [r0+r3]
-    psadbw  m0, m5
-    paddw   m2, m0
-%if mmsize==8
-    mova    m0, [r0+r3]
-    psadbw  m0, m6
-    paddw   m3, m0
-    mova    m0, [r0+r3+8]
-    psadbw  m0, m7
-    paddw   m4, m0
-    mova    m0, [r0+r3+8]
-    psadbw  m0, m1
-    paddw   m2, m0
-    psadbw  m6, [r0+r3+8]
-    paddw   m3, m6
-%else
-    psadbw  m6, [r0+r3]
-    paddw   m3, m6
-%endif
-    add     r3d, -FENC_STRIDE
-    jge .vloop
-%if mmsize==16
-    pslldq  m3, 4
-    por     m3, m2
-    MOVHL   m1, m3
-    paddw   m3, m1
-    movq  [r2+0], m3
-    MOVHL   m1, m4
-    paddw   m4, m1
-%else
-    movd  [r2+0], m2
-    movd  [r2+4], m3
-%endif
-    movd  [r2+8], m4
-    RET
-%endmacro
-
-INIT_MMX mmx2
-INTRA_SAD16
-INIT_XMM sse2
-INTRA_SAD16
-INIT_XMM ssse3
-INTRA_SAD16
-
-INIT_YMM avx2
-cglobal intra_sad_x3_16x16, 3,5,6
-    pxor   xm0, xm0
-    psadbw xm0, [r1-FDEC_STRIDE]
-    MOVHL  xm1, xm0
-    paddw  xm0, xm1
-    movd   r3d, xm0
-%assign x 0
-%rep 16
-    movzx  r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
-%if (x&3)==3 && x!=15
-    add     r1, FDEC_STRIDE*4
-%endif
-    add    r3d, r4d
-%assign x x+1
-%endrep
-    sub     r1, FDEC_STRIDE*12
-    add    r3d, 16
-    shr    r3d, 5
-    movd   xm5, r3d
-    vpbroadcastb xm5, xm5
-    vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
-
-    pxor    m4, m4  ; DC / V accumulator
-    pxor   xm3, xm3 ; H accumulator
-    mov    r3d, 15*FENC_STRIDE
-.vloop:
-    vpbroadcastb  xm2, [r1+r3*2-1]
-    vbroadcasti128 m0, [r0+r3]
-    psadbw  m1, m0, m5
-    psadbw xm0, xm2
-    paddw   m4, m1
-    paddw  xm3, xm0
-    add    r3d, -FENC_STRIDE
-    jge .vloop
-    punpckhqdq m5, m4, m4
-    MOVHL  xm2, xm3
-    paddw   m4, m5      ; DC / V
-    paddw  xm3, xm2     ; H
-    vextracti128 xm2, m4, 1
-    movd  [r2+0], xm2
-    movd  [r2+4], xm3
-    movd  [r2+8], xm4
-    RET
-
-;=============================================================================
-; SAD x3/x4 MMX
-;=============================================================================
-
-%macro SAD_X3_START_1x8P 0
-    movq    mm3,    [r0]
-    movq    mm0,    [r1]
-    movq    mm1,    [r2]
-    movq    mm2,    [r3]
-    psadbw  mm0,    mm3
-    psadbw  mm1,    mm3
-    psadbw  mm2,    mm3
-%endmacro
-
-%macro SAD_X3_1x8P 2
-    movq    mm3,    [r0+%1]
-    movq    mm4,    [r1+%2]
-    movq    mm5,    [r2+%2]
-    movq    mm6,    [r3+%2]
-    psadbw  mm4,    mm3
-    psadbw  mm5,    mm3
-    psadbw  mm6,    mm3
-    paddw   mm0,    mm4
-    paddw   mm1,    mm5
-    paddw   mm2,    mm6
-%endmacro
-
-%macro SAD_X3_START_2x4P 3
-    movd      mm3,  [r0]
-    movd      %1,   [r1]
-    movd      %2,   [r2]
-    movd      %3,   [r3]
-    punpckldq mm3,  [r0+FENC_STRIDE]
-    punpckldq %1,   [r1+r4]
-    punpckldq %2,   [r2+r4]
-    punpckldq %3,   [r3+r4]
-    psadbw    %1,   mm3
-    psadbw    %2,   mm3
-    psadbw    %3,   mm3
-%endmacro
-
-%macro SAD_X3_2x16P 1
-%if %1
-    SAD_X3_START_1x8P
-%else
-    SAD_X3_1x8P 0, 0
-%endif
-    SAD_X3_1x8P 8, 8
-    SAD_X3_1x8P FENC_STRIDE, r4
-    SAD_X3_1x8P FENC_STRIDE+8, r4+8
-    add     r0, 2*FENC_STRIDE
-    lea     r1, [r1+2*r4]
-    lea     r2, [r2+2*r4]
-    lea     r3, [r3+2*r4]
-%endmacro
-
-%macro SAD_X3_2x8P 1
-%if %1
-    SAD_X3_START_1x8P
-%else
-    SAD_X3_1x8P 0, 0
-%endif
-    SAD_X3_1x8P FENC_STRIDE, r4
-    add     r0, 2*FENC_STRIDE
-    lea     r1, [r1+2*r4]
-    lea     r2, [r2+2*r4]
-    lea     r3, [r3+2*r4]
-%endmacro
-
-%macro SAD_X3_2x4P 1
-%if %1
-    SAD_X3_START_2x4P mm0, mm1, mm2
-%else
-    SAD_X3_START_2x4P mm4, mm5, mm6
-    paddw     mm0,  mm4
-    paddw     mm1,  mm5
-    paddw     mm2,  mm6
-%endif
-    add     r0, 2*FENC_STRIDE
-    lea     r1, [r1+2*r4]
-    lea     r2, [r2+2*r4]
-    lea     r3, [r3+2*r4]
-%endmacro
-
-%macro SAD_X4_START_1x8P 0
-    movq    mm7,    [r0]
-    movq    mm0,    [r1]
-    movq    mm1,    [r2]
-    movq    mm2,    [r3]
-    movq    mm3,    [r4]
-    psadbw  mm0,    mm7
-    psadbw  mm1,    mm7
-    psadbw  mm2,    mm7
-    psadbw  mm3,    mm7
-%endmacro
-
-%macro SAD_X4_1x8P 2
-    movq    mm7,    [r0+%1]
-    movq    mm4,    [r1+%2]
-    movq    mm5,    [r2+%2]
-    movq    mm6,    [r3+%2]
-    psadbw  mm4,    mm7
-    psadbw  mm5,    mm7
-    psadbw  mm6,    mm7
-    psadbw  mm7,    [r4+%2]
-    paddw   mm0,    mm4
-    paddw   mm1,    mm5
-    paddw   mm2,    mm6
-    paddw   mm3,    mm7
-%endmacro
-
-%macro SAD_X4_START_2x4P 0
-    movd      mm7,  [r0]
-    movd      mm0,  [r1]
-    movd      mm1,  [r2]
-    movd      mm2,  [r3]
-    movd      mm3,  [r4]
-    punpckldq mm7,  [r0+FENC_STRIDE]
-    punpckldq mm0,  [r1+r5]
-    punpckldq mm1,  [r2+r5]
-    punpckldq mm2,  [r3+r5]
-    punpckldq mm3,  [r4+r5]
-    psadbw    mm0,  mm7
-    psadbw    mm1,  mm7
-    psadbw    mm2,  mm7
-    psadbw    mm3,  mm7
-%endmacro
-
-%macro SAD_X4_INC_2x4P 0
-    movd      mm7,  [r0]
-    movd      mm4,  [r1]
-    movd      mm5,  [r2]
-    punpckldq mm7,  [r0+FENC_STRIDE]
-    punpckldq mm4,  [r1+r5]
-    punpckldq mm5,  [r2+r5]
-    psadbw    mm4,  mm7
-    psadbw    mm5,  mm7
-    paddw     mm0,  mm4
-    paddw     mm1,  mm5
-    movd      mm4,  [r3]
-    movd      mm5,  [r4]
-    punpckldq mm4,  [r3+r5]
-    punpckldq mm5,  [r4+r5]
-    psadbw    mm4,  mm7
-    psadbw    mm5,  mm7
-    paddw     mm2,  mm4
-    paddw     mm3,  mm5
-%endmacro
-
-%macro SAD_X4_2x16P 1
-%if %1
-    SAD_X4_START_1x8P
-%else
-    SAD_X4_1x8P 0, 0
-%endif
-    SAD_X4_1x8P 8, 8
-    SAD_X4_1x8P FENC_STRIDE, r5
-    SAD_X4_1x8P FENC_STRIDE+8, r5+8
-    add     r0, 2*FENC_STRIDE
-    lea     r1, [r1+2*r5]
-    lea     r2, [r2+2*r5]
-    lea     r3, [r3+2*r5]
-    lea     r4, [r4+2*r5]
-%endmacro
-
-%macro SAD_X4_2x8P 1
-%if %1
-    SAD_X4_START_1x8P
-%else
-    SAD_X4_1x8P 0, 0
-%endif
-    SAD_X4_1x8P FENC_STRIDE, r5
-    add     r0, 2*FENC_STRIDE
-    lea     r1, [r1+2*r5]
-    lea     r2, [r2+2*r5]
-    lea     r3, [r3+2*r5]
-    lea     r4, [r4+2*r5]
-%endmacro
-
-%macro SAD_X4_2x4P 1
-%if %1
-    SAD_X4_START_2x4P
-%else
-    SAD_X4_INC_2x4P
-%endif
-    add     r0, 2*FENC_STRIDE
-    lea     r1, [r1+2*r5]
-    lea     r2, [r2+2*r5]
-    lea     r3, [r3+2*r5]
-    lea     r4, [r4+2*r5]
-%endmacro
-
-%macro SAD_X3_END 0
-%if UNIX64
-    movd    [r5+0], mm0
-    movd    [r5+4], mm1
-    movd    [r5+8], mm2
-%else
-    mov     r0, r5mp
-    movd    [r0+0], mm0
-    movd    [r0+4], mm1
-    movd    [r0+8], mm2
-%endif
-    RET
-%endmacro
-
-%macro SAD_X4_END 0
-    mov     r0, r6mp
-    movd    [r0+0], mm0
-    movd    [r0+4], mm1
-    movd    [r0+8], mm2
-    movd    [r0+12], mm3
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X 3
-cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
-    SAD_X%1_2x%2P 1
-%rep %3/2-1
-    SAD_X%1_2x%2P 0
-%endrep
-    SAD_X%1_END
-%endmacro
-
-INIT_MMX
-SAD_X 3, 16, 16
-SAD_X 3, 16,  8
-SAD_X 3,  8, 16
-SAD_X 3,  8,  8
-SAD_X 3,  8,  4
-SAD_X 3,  4,  8
-SAD_X 3,  4,  4
-SAD_X 4, 16, 16
-SAD_X 4, 16,  8
-SAD_X 4,  8, 16
-SAD_X 4,  8,  8
-SAD_X 4,  8,  4
-SAD_X 4,  4,  8
-SAD_X 4,  4,  4
-
-
-
-;=============================================================================
-; SAD x3/x4 XMM
-;=============================================================================
-
-%macro SAD_X3_START_1x16P_SSE2 0
-    mova     m2, [r0]
-%if cpuflag(avx)
-    psadbw   m0, m2, [r1]
-    psadbw   m1, m2, [r2]
-    psadbw   m2, [r3]
-%else
-    movu     m0, [r1]
-    movu     m1, [r2]
-    movu     m3, [r3]
-    psadbw   m0, m2
-    psadbw   m1, m2
-    psadbw   m2, m3
-%endif
-%endmacro
-
-%macro SAD_X3_1x16P_SSE2 2
-    mova     m3, [r0+%1]
-%if cpuflag(avx)
-    psadbw   m4, m3, [r1+%2]
-    psadbw   m5, m3, [r2+%2]
-    psadbw   m3, [r3+%2]
-%else
-    movu     m4, [r1+%2]
-    movu     m5, [r2+%2]
-    movu     m6, [r3+%2]
-    psadbw   m4, m3
-    psadbw   m5, m3
-    psadbw   m3, m6
-%endif
-    paddw    m0, m4
-    paddw    m1, m5
-    paddw    m2, m3
-%endmacro
-
-%if ARCH_X86_64
-    DECLARE_REG_TMP 6
-%else
-    DECLARE_REG_TMP 5
-%endif
-
-%macro SAD_X3_4x16P_SSE2 2
-%if %1==0
-    lea  t0, [r4*3]
-    SAD_X3_START_1x16P_SSE2
-%else
-    SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
-%endif
-    SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
-    SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
-    SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r4]
-    lea  r2, [r2+4*r4]
-    lea  r3, [r3+4*r4]
-%endif
-%endmacro
-
-%macro SAD_X3_START_2x8P_SSE2 0
-    movq     m3, [r0]
-    movq     m0, [r1]
-    movq     m1, [r2]
-    movq     m2, [r3]
-    movhps   m3, [r0+FENC_STRIDE]
-    movhps   m0, [r1+r4]
-    movhps   m1, [r2+r4]
-    movhps   m2, [r3+r4]
-    psadbw   m0, m3
-    psadbw   m1, m3
-    psadbw   m2, m3
-%endmacro
-
-%macro SAD_X3_2x8P_SSE2 4
-    movq     m6, [r0+%1]
-    movq     m3, [r1+%2]
-    movq     m4, [r2+%2]
-    movq     m5, [r3+%2]
-    movhps   m6, [r0+%3]
-    movhps   m3, [r1+%4]
-    movhps   m4, [r2+%4]
-    movhps   m5, [r3+%4]
-    psadbw   m3, m6
-    psadbw   m4, m6
-    psadbw   m5, m6
-    paddw    m0, m3
-    paddw    m1, m4
-    paddw    m2, m5
-%endmacro
-
-%macro SAD_X4_START_2x8P_SSE2 0
-    movq     m4, [r0]
-    movq     m0, [r1]
-    movq     m1, [r2]
-    movq     m2, [r3]
-    movq     m3, [r4]
-    movhps   m4, [r0+FENC_STRIDE]
-    movhps   m0, [r1+r5]
-    movhps   m1, [r2+r5]
-    movhps   m2, [r3+r5]
-    movhps   m3, [r4+r5]
-    psadbw   m0, m4
-    psadbw   m1, m4
-    psadbw   m2, m4
-    psadbw   m3, m4
-%endmacro
-
-%macro SAD_X4_2x8P_SSE2 4
-    movq     m6, [r0+%1]
-    movq     m4, [r1+%2]
-    movq     m5, [r2+%2]
-    movhps   m6, [r0+%3]
-    movhps   m4, [r1+%4]
-    movhps   m5, [r2+%4]
-    psadbw   m4, m6
-    psadbw   m5, m6
-    paddw    m0, m4
-    paddw    m1, m5
-    movq     m4, [r3+%2]
-    movq     m5, [r4+%2]
-    movhps   m4, [r3+%4]
-    movhps   m5, [r4+%4]
-    psadbw   m4, m6
-    psadbw   m5, m6
-    paddw    m2, m4
-    paddw    m3, m5
-%endmacro
-
-%macro SAD_X4_START_1x16P_SSE2 0
-    mova     m3, [r0]
-%if cpuflag(avx)
-    psadbw   m0, m3, [r1]
-    psadbw   m1, m3, [r2]
-    psadbw   m2, m3, [r3]
-    psadbw   m3, [r4]
-%else
-    movu     m0, [r1]
-    movu     m1, [r2]
-    movu     m2, [r3]
-    movu     m4, [r4]
-    psadbw   m0, m3
-    psadbw   m1, m3
-    psadbw   m2, m3
-    psadbw   m3, m4
-%endif
-%endmacro
-
-%macro SAD_X4_1x16P_SSE2 2
-    mova     m6, [r0+%1]
-%if cpuflag(avx)
-    psadbw   m4, m6, [r1+%2]
-    psadbw   m5, m6, [r2+%2]
-%else
-    movu     m4, [r1+%2]
-    movu     m5, [r2+%2]
-    psadbw   m4, m6
-    psadbw   m5, m6
-%endif
-    paddw    m0, m4
-    paddw    m1, m5
-%if cpuflag(avx)
-    psadbw   m4, m6, [r3+%2]
-    psadbw   m5, m6, [r4+%2]
-%else
-    movu     m4, [r3+%2]
-    movu     m5, [r4+%2]
-    psadbw   m4, m6
-    psadbw   m5, m6
-%endif
-    paddw    m2, m4
-    paddw    m3, m5
-%endmacro
-
-%macro SAD_X4_4x16P_SSE2 2
-%if %1==0
-    lea  r6, [r5*3]
-    SAD_X4_START_1x16P_SSE2
-%else
-    SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
-%endif
-    SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
-    SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
-    SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r5]
-    lea  r2, [r2+4*r5]
-    lea  r3, [r3+4*r5]
-    lea  r4, [r4+4*r5]
-%endif
-%endmacro
-
-%macro SAD_X3_4x8P_SSE2 2
-%if %1==0
-    lea  t0, [r4*3]
-    SAD_X3_START_2x8P_SSE2
-%else
-    SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
-%endif
-    SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r4]
-    lea  r2, [r2+4*r4]
-    lea  r3, [r3+4*r4]
-%endif
-%endmacro
-
-%macro SAD_X4_4x8P_SSE2 2
-%if %1==0
-    lea    r6, [r5*3]
-    SAD_X4_START_2x8P_SSE2
-%else
-    SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
-%endif
-    SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r5]
-    lea  r2, [r2+4*r5]
-    lea  r3, [r3+4*r5]
-    lea  r4, [r4+4*r5]
-%endif
-%endmacro
-
-%macro SAD_X3_END_SSE2 0
-    movifnidn r5, r5mp
-%if cpuflag(ssse3)
-    packssdw m0, m1
-    packssdw m2, m2
-    phaddd   m0, m2
-    mova   [r5], m0
-%else
-    movhlps  m3, m0
-    movhlps  m4, m1
-    movhlps  m5, m2
-    paddw    m0, m3
-    paddw    m1, m4
-    paddw    m2, m5
-    movd [r5+0], m0
-    movd [r5+4], m1
-    movd [r5+8], m2
-%endif
-    RET
-%endmacro
-
-%macro SAD_X4_END_SSE2 0
-    mov      r0, r6mp
-%if cpuflag(ssse3)
-    packssdw m0, m1
-    packssdw m2, m3
-    phaddd   m0, m2
-    mova   [r0], m0
-%else
-    psllq    m1, 32
-    psllq    m3, 32
-    paddw    m0, m1
-    paddw    m2, m3
-    movhlps  m1, m0
-    movhlps  m3, m2
-    paddw    m0, m1
-    paddw    m2, m3
-    movq [r0+0], m0
-    movq [r0+8], m2
-%endif
-    RET
-%endmacro
-
-%macro SAD_X4_START_2x8P_SSSE3 0
-    movddup  m4, [r0]
-    movq     m0, [r1]
-    movq     m1, [r3]
-    movhps   m0, [r2]
-    movhps   m1, [r4]
-    movddup  m5, [r0+FENC_STRIDE]
-    movq     m2, [r1+r5]
-    movq     m3, [r3+r5]
-    movhps   m2, [r2+r5]
-    movhps   m3, [r4+r5]
-    psadbw   m0, m4
-    psadbw   m1, m4
-    psadbw   m2, m5
-    psadbw   m3, m5
-    paddw    m0, m2
-    paddw    m1, m3
-%endmacro
-
-%macro SAD_X4_2x8P_SSSE3 4
-    movddup  m6, [r0+%1]
-    movq     m2, [r1+%2]
-    movq     m3, [r3+%2]
-    movhps   m2, [r2+%2]
-    movhps   m3, [r4+%2]
-    movddup  m7, [r0+%3]
-    movq     m4, [r1+%4]
-    movq     m5, [r3+%4]
-    movhps   m4, [r2+%4]
-    movhps   m5, [r4+%4]
-    psadbw   m2, m6
-    psadbw   m3, m6
-    psadbw   m4, m7
-    psadbw   m5, m7
-    paddw    m0, m2
-    paddw    m1, m3
-    paddw    m0, m4
-    paddw    m1, m5
-%endmacro
-
-%macro SAD_X4_4x8P_SSSE3 2
-%if %1==0
-    lea    r6, [r5*3]
-    SAD_X4_START_2x8P_SSSE3
-%else
-    SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
-%endif
-    SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r5]
-    lea  r2, [r2+4*r5]
-    lea  r3, [r3+4*r5]
-    lea  r4, [r4+4*r5]
-%endif
-%endmacro
-
-%macro SAD_X4_END_SSSE3 0
-    mov      r0, r6mp
-    packssdw m0, m1
-    mova   [r0], m0
-    RET
-%endmacro
-
-%macro SAD_X3_START_2x16P_AVX2 0
-    movu    m3, [r0] ; assumes FENC_STRIDE == 16
-    movu   xm0, [r1]
-    movu   xm1, [r2]
-    movu   xm2, [r3]
-    vinserti128  m0, m0, [r1+r4], 1
-    vinserti128  m1, m1, [r2+r4], 1
-    vinserti128  m2, m2, [r3+r4], 1
-    psadbw  m0, m3
-    psadbw  m1, m3
-    psadbw  m2, m3
-%endmacro
-
-%macro SAD_X3_2x16P_AVX2 3
-    movu    m3, [r0+%1] ; assumes FENC_STRIDE == 16
-    movu   xm4, [r1+%2]
-    movu   xm5, [r2+%2]
-    movu   xm6, [r3+%2]
-    vinserti128  m4, m4, [r1+%3], 1
-    vinserti128  m5, m5, [r2+%3], 1
-    vinserti128  m6, m6, [r3+%3], 1
-    psadbw  m4, m3
-    psadbw  m5, m3
-    psadbw  m6, m3
-    paddw   m0, m4
-    paddw   m1, m5
-    paddw   m2, m6
-%endmacro
-
-%macro SAD_X3_4x16P_AVX2 2
-%if %1==0
-    lea  t0, [r4*3]
-    SAD_X3_START_2x16P_AVX2
-%else
-    SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
-%endif
-    SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r4]
-    lea  r2, [r2+4*r4]
-    lea  r3, [r3+4*r4]
-%endif
-%endmacro
-
-%macro SAD_X4_START_2x16P_AVX2 0
-    vbroadcasti128 m4, [r0]
-    vbroadcasti128 m5, [r0+FENC_STRIDE]
-    movu   xm0, [r1]
-    movu   xm1, [r2]
-    movu   xm2, [r1+r5]
-    movu   xm3, [r2+r5]
-    vinserti128 m0, m0, [r3], 1
-    vinserti128 m1, m1, [r4], 1
-    vinserti128 m2, m2, [r3+r5], 1
-    vinserti128 m3, m3, [r4+r5], 1
-    psadbw  m0, m4
-    psadbw  m1, m4
-    psadbw  m2, m5
-    psadbw  m3, m5
-    paddw   m0, m2
-    paddw   m1, m3
-%endmacro
-
-%macro SAD_X4_2x16P_AVX2 4
-    vbroadcasti128 m6, [r0+%1]
-    vbroadcasti128 m7, [r0+%3]
-    movu   xm2, [r1+%2]
-    movu   xm3, [r2+%2]
-    movu   xm4, [r1+%4]
-    movu   xm5, [r2+%4]
-    vinserti128 m2, m2, [r3+%2], 1
-    vinserti128 m3, m3, [r4+%2], 1
-    vinserti128 m4, m4, [r3+%4], 1
-    vinserti128 m5, m5, [r4+%4], 1
-    psadbw  m2, m6
-    psadbw  m3, m6
-    psadbw  m4, m7
-    psadbw  m5, m7
-    paddw   m0, m2
-    paddw   m1, m3
-    paddw   m0, m4
-    paddw   m1, m5
-%endmacro
-
-%macro SAD_X4_4x16P_AVX2 2
-%if %1==0
-    lea  r6, [r5*3]
-    SAD_X4_START_2x16P_AVX2
-%else
-    SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
-%endif
-    SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
-%if %1 != %2-1
-%if (%1&1) != 0
-    add  r0, 8*FENC_STRIDE
-%endif
-    lea  r1, [r1+4*r5]
-    lea  r2, [r2+4*r5]
-    lea  r3, [r3+4*r5]
-    lea  r4, [r4+4*r5]
-%endif
-%endmacro
-
-%macro SAD_X3_END_AVX2 0
-    movifnidn r5, r5mp
-    packssdw  m0, m1        ; 0 0 1 1 0 0 1 1
-    packssdw  m2, m2        ; 2 2 _ _ 2 2 _ _
-    phaddd    m0, m2        ; 0 1 2 _ 0 1 2 _
-    vextracti128 xm1, m0, 1
-    paddd    xm0, xm1       ; 0 1 2 _
-    mova    [r5], xm0
-    RET
-%endmacro
-
-%macro SAD_X4_END_AVX2 0
-    mov       r0, r6mp
-    packssdw  m0, m1        ; 0 0 1 1 2 2 3 3
-    vextracti128 xm1, m0, 1
-    phaddd   xm0, xm1       ; 0 1 2 3
-    mova    [r0], xm0
-    RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X_SSE2 4
-cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
-%assign x 0
-%rep %3/4
-    SAD_X%1_4x%2P_SSE2 x, %3/4
-%assign x x+1
-%endrep
-    SAD_X%1_END_SSE2
-%endmacro
-
-INIT_XMM sse2
-SAD_X_SSE2 3, 16, 16, 7
-SAD_X_SSE2 3, 16,  8, 7
-SAD_X_SSE2 3,  8, 16, 7
-SAD_X_SSE2 3,  8,  8, 7
-SAD_X_SSE2 3,  8,  4, 7
-SAD_X_SSE2 4, 16, 16, 7
-SAD_X_SSE2 4, 16,  8, 7
-SAD_X_SSE2 4,  8, 16, 7
-SAD_X_SSE2 4,  8,  8, 7
-SAD_X_SSE2 4,  8,  4, 7
-
-INIT_XMM sse3
-SAD_X_SSE2 3, 16, 16, 7
-SAD_X_SSE2 3, 16,  8, 7
-SAD_X_SSE2 4, 16, 16, 7
-SAD_X_SSE2 4, 16,  8, 7
-
-%macro SAD_X_SSSE3 3
-cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
-%assign x 0
-%rep %3/4
-    SAD_X%1_4x%2P_SSSE3 x, %3/4
-%assign x x+1
-%endrep
-    SAD_X%1_END_SSSE3
-%endmacro
-
-INIT_XMM ssse3
-SAD_X_SSE2  3, 16, 16, 7
-SAD_X_SSE2  3, 16,  8, 7
-SAD_X_SSE2  4, 16, 16, 7
-SAD_X_SSE2  4, 16,  8, 7
-SAD_X_SSSE3 4,  8, 16
-SAD_X_SSSE3 4,  8,  8
-SAD_X_SSSE3 4,  8,  4
-
-INIT_XMM avx
-SAD_X_SSE2 3, 16, 16, 6
-SAD_X_SSE2 3, 16,  8, 6
-SAD_X_SSE2 4, 16, 16, 7
-SAD_X_SSE2 4, 16,  8, 7
-
-%macro SAD_X_AVX2 4
-cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
-%assign x 0
-%rep %3/4
-    SAD_X%1_4x%2P_AVX2 x, %3/4
-%assign x x+1
-%endrep
-    SAD_X%1_END_AVX2
-%endmacro
-
-INIT_YMM avx2
-SAD_X_AVX2 3, 16, 16, 7
-SAD_X_AVX2 3, 16,  8, 7
-SAD_X_AVX2 4, 16, 16, 8
-SAD_X_AVX2 4, 16,  8, 8
-
-;=============================================================================
-; SAD cacheline split
-;=============================================================================
-
-; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
-; unless the unaligned data spans the border between 2 cachelines, in which
-; case it's really slow. The exact numbers may differ, but all Intel cpus prior
-; to Nehalem have a large penalty for cacheline splits.
-; (8-byte alignment exactly half way between two cachelines is ok though.)
-; LDDQU was supposed to fix this, but it only works on Pentium 4.
-; So in the split case we load aligned data and explicitly perform the
-; alignment between registers. Like on archs that have only aligned loads,
-; except complicated by the fact that PALIGNR takes only an immediate, not
-; a variable alignment.
-; It is also possible to hoist the realignment to the macroblock level (keep
-; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
-; needed for that method makes it often slower.
-
-; sad 16x16 costs on Core2:
-; good offsets: 49 cycles (50/64 of all mvs)
-; cacheline split: 234 cycles (14/64 of all mvs. ammortized: +40 cycles)
-; page split: 3600 cycles (14/4096 of all mvs. ammortized: +11.5 cycles)
-; cache or page split with palignr: 57 cycles (ammortized: +2 cycles)
-
-; computed jump assumes this loop is exactly 80 bytes
-%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
-ALIGN 16
-sad_w16_align%1_sse2:
-    movdqa  xmm1, [r2+16]
-    movdqa  xmm2, [r2+r3+16]
-    movdqa  xmm3, [r2]
-    movdqa  xmm4, [r2+r3]
-    pslldq  xmm1, 16-%1
-    pslldq  xmm2, 16-%1
-    psrldq  xmm3, %1
-    psrldq  xmm4, %1
-    por     xmm1, xmm3
-    por     xmm2, xmm4
-    psadbw  xmm1, [r0]
-    psadbw  xmm2, [r0+r1]
-    paddw   xmm0, xmm1
-    paddw   xmm0, xmm2
-    lea     r0,   [r0+2*r1]
-    lea     r2,   [r2+2*r3]
-    dec     r4
-    jg sad_w16_align%1_sse2
-    ret
-%endmacro
-
-; computed jump assumes this loop is exactly 64 bytes
-%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
-ALIGN 16
-sad_w16_align%1_ssse3:
-    movdqa  xmm1, [r2+16]
-    movdqa  xmm2, [r2+r3+16]
-    palignr xmm1, [r2], %1
-    palignr xmm2, [r2+r3], %1
-    psadbw  xmm1, [r0]
-    psadbw  xmm2, [r0+r1]
-    paddw   xmm0, xmm1
-    paddw   xmm0, xmm2
-    lea     r0,   [r0+2*r1]
-    lea     r2,   [r2+2*r3]
-    dec     r4
-    jg sad_w16_align%1_ssse3
-    ret
-%endmacro
-
-%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal pixel_sad_16x%2_cache64_%1
-    mov     eax, r2m
-    and     eax, 0x37
-    cmp     eax, 0x30
-    jle pixel_sad_16x%2_sse2
-    PROLOGUE 4,6
-    mov     r4d, r2d
-    and     r4d, 15
-%ifidn %1, ssse3
-    shl     r4d, 6  ; code size = 64
-%else
-    lea     r4, [r4*5]
-    shl     r4d, 4  ; code size = 80
-%endif
-%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
-%ifdef PIC
-    lea     r5, [sad_w16_addr]
-    add     r5, r4
-%else
-    lea     r5, [sad_w16_addr + r4]
-%endif
-    and     r2, ~15
-    mov     r4d, %2/2
-    pxor    xmm0, xmm0
-    call    r5
-    MOVHL   xmm1, xmm0
-    paddw   xmm0, xmm1
-    movd    eax,  xmm0
-    RET
-%endmacro
-
-%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
-    mov    eax, r2m
-    and    eax, 0x17|%1|(%4>>1)
-    cmp    eax, 0x10|%1|(%4>>1)
-    jle pixel_sad_%1x%2_mmx2
-    and    eax, 7
-    shl    eax, 3
-    movd   mm6, [sw_64]
-    movd   mm7, eax
-    psubw  mm6, mm7
-    PROLOGUE 4,5
-    and    r2, ~7
-    mov    r4d, %3
-    pxor   mm0, mm0
-%endmacro
-
-%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal pixel_sad_16x%1_cache%2_mmx2
-    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
-.loop:
-    movq   mm1, [r2]
-    movq   mm2, [r2+8]
-    movq   mm3, [r2+16]
-    movq   mm4, mm2
-    psrlq  mm1, mm7
-    psllq  mm2, mm6
-    psllq  mm3, mm6
-    psrlq  mm4, mm7
-    por    mm1, mm2
-    por    mm3, mm4
-    psadbw mm1, [r0]
-    psadbw mm3, [r0+8]
-    paddw  mm0, mm1
-    paddw  mm0, mm3
-    add    r2, r3
-    add    r0, r1
-    dec    r4
-    jg .loop
-    movd   eax, mm0
-    RET
-%endmacro
-
-%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal pixel_sad_8x%1_cache%2_mmx2
-    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
-.loop:
-    movq   mm1, [r2+8]
-    movq   mm2, [r2+r3+8]
-    movq   mm3, [r2]
-    movq   mm4, [r2+r3]
-    psllq  mm1, mm6
-    psllq  mm2, mm6
-    psrlq  mm3, mm7
-    psrlq  mm4, mm7
-    por    mm1, mm3
-    por    mm2, mm4
-    psadbw mm1, [r0]
-    psadbw mm2, [r0+r1]
-    paddw  mm0, mm1
-    paddw  mm0, mm2
-    lea    r2, [r2+2*r3]
-    lea    r0, [r0+2*r1]
-    dec    r4
-    jg .loop
-    movd   eax, mm0
-    RET
-%endmacro
-
-; sad_x3/x4_cache64: check each mv.
-; if they're all within a cacheline, use normal sad_x3/x4.
-; otherwise, send them individually to sad_cache64.
-%macro CHECK_SPLIT 3 ; pix, width, cacheline
-    mov  eax, %1
-    and  eax, 0x17|%2|(%3>>1)
-    cmp  eax, 0x10|%2|(%3>>1)
-    jg .split
-%endmacro
-
-%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal pixel_sad_x3_%1x%2_cache%3_%6
-    CHECK_SPLIT r1m, %1, %3
-    CHECK_SPLIT r2m, %1, %3
-    CHECK_SPLIT r3m, %1, %3
-    jmp pixel_sad_x3_%1x%2_%4
-.split:
-%if ARCH_X86_64
-    PROLOGUE 6,9
-    push r3
-    push r2
-%if WIN64
-    movsxd r4, r4d
-    sub rsp, 40 ; shadow space and alignment
-%endif
-    mov  r2, r1
-    mov  r1, FENC_STRIDE
-    mov  r3, r4
-    mov  r7, r0
-    mov  r8, r5
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8], eax
-%if WIN64
-    mov  r2, [rsp+40+0*8]
-%else
-    pop  r2
-%endif
-    mov  r0, r7
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8+4], eax
-%if WIN64
-    mov  r2, [rsp+40+1*8]
-%else
-    pop  r2
-%endif
-    mov  r0, r7
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8+8], eax
-%if WIN64
-    add  rsp, 40+2*8
-%endif
-    RET
-%else
-    push edi
-    mov  edi, [esp+28]
-    push dword [esp+24]
-    push dword [esp+16]
-    push dword 16
-    push dword [esp+20]
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  ecx, [esp+32]
-    mov  [edi], eax
-    mov  [esp+8], ecx
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  ecx, [esp+36]
-    mov  [edi+4], eax
-    mov  [esp+8], ecx
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [edi+8], eax
-    add  esp, 16
-    pop  edi
-    ret
-%endif
-%endmacro
-
-%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal pixel_sad_x4_%1x%2_cache%3_%6
-    CHECK_SPLIT r1m, %1, %3
-    CHECK_SPLIT r2m, %1, %3
-    CHECK_SPLIT r3m, %1, %3
-    CHECK_SPLIT r4m, %1, %3
-    jmp pixel_sad_x4_%1x%2_%4
-.split:
-%if ARCH_X86_64
-    PROLOGUE 6,9
-    mov  r8,  r6mp
-    push r4
-    push r3
-    push r2
-%if WIN64
-    sub rsp, 32 ; shadow space
-%endif
-    mov  r2, r1
-    mov  r1, FENC_STRIDE
-    mov  r3, r5
-    mov  r7, r0
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8], eax
-%if WIN64
-    mov  r2, [rsp+32+0*8]
-%else
-    pop  r2
-%endif
-    mov  r0, r7
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8+4], eax
-%if WIN64
-    mov  r2, [rsp+32+1*8]
-%else
-    pop  r2
-%endif
-    mov  r0, r7
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8+8], eax
-%if WIN64
-    mov  r2, [rsp+32+2*8]
-%else
-    pop  r2
-%endif
-    mov  r0, r7
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [r8+12], eax
-%if WIN64
-    add  rsp, 32+3*8
-%endif
-    RET
-%else
-    push edi
-    mov  edi, [esp+32]
-    push dword [esp+28]
-    push dword [esp+16]
-    push dword 16
-    push dword [esp+20]
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  ecx, [esp+32]
-    mov  [edi], eax
-    mov  [esp+8], ecx
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  ecx, [esp+36]
-    mov  [edi+4], eax
-    mov  [esp+8], ecx
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  ecx, [esp+40]
-    mov  [edi+8], eax
-    mov  [esp+8], ecx
-    call pixel_sad_%1x%2_cache%3_%5
-    mov  [edi+12], eax
-    add  esp, 16
-    pop  edi
-    ret
-%endif
-%endmacro
-
-%macro SADX34_CACHELINE_FUNC 1+
-    SADX3_CACHELINE_FUNC %1
-    SADX4_CACHELINE_FUNC %1
-%endmacro
-
-
-; instantiate the aligned sads
-
-INIT_MMX
-%if ARCH_X86_64 == 0
-SAD16_CACHELINE_FUNC_MMX2  8, 32
-SAD16_CACHELINE_FUNC_MMX2 16, 32
-SAD8_CACHELINE_FUNC_MMX2   4, 32
-SAD8_CACHELINE_FUNC_MMX2   8, 32
-SAD8_CACHELINE_FUNC_MMX2  16, 32
-SAD16_CACHELINE_FUNC_MMX2  8, 64
-SAD16_CACHELINE_FUNC_MMX2 16, 64
-%endif ; !ARCH_X86_64
-SAD8_CACHELINE_FUNC_MMX2   4, 64
-SAD8_CACHELINE_FUNC_MMX2   8, 64
-SAD8_CACHELINE_FUNC_MMX2  16, 64
-
-%if ARCH_X86_64 == 0
-SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
-SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
-SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
-SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
-SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
-SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
-%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
-SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2
-
-%if ARCH_X86_64 == 0
-SAD16_CACHELINE_FUNC sse2, 8
-SAD16_CACHELINE_FUNC sse2, 16
-%assign i 1
-%rep 15
-SAD16_CACHELINE_LOOP_SSE2 i
-%assign i i+1
-%endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
-SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
-%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmx2, sse2
-
-SAD16_CACHELINE_FUNC ssse3, 8
-SAD16_CACHELINE_FUNC ssse3, 16
-%assign i 1
-%rep 15
-SAD16_CACHELINE_LOOP_SSSE3 i
-%assign i i+1
-%endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
-SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3
-
diff --git a/android/src/main/libenc/jni/libx264/common/x86/sad16-a.asm b/android/src/main/libenc/jni/libx264/common/x86/sad16-a.asm
deleted file mode 100755
index f0886e7..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/sad16-a.asm
+++ /dev/null
@@ -1,727 +0,0 @@
-;*****************************************************************************
-;* sad16-a.asm: x86 high depth sad functions
-;*****************************************************************************
-;* Copyright (C) 2010-2016 x264 project
-;*
-;* Authors: Oskar Arvidsson <oskar@irock.se>
-;*          Henrik Gramner <henrik@gramner.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION .text
-
-cextern pw_1
-cextern pw_4
-cextern pw_8
-
-;=============================================================================
-; SAD MMX
-;=============================================================================
-
-%macro SAD_INC_1x16P_MMX 0
-    movu    m1, [r0+ 0]
-    movu    m2, [r0+ 8]
-    movu    m3, [r0+16]
-    movu    m4, [r0+24]
-    psubw   m1, [r2+ 0]
-    psubw   m2, [r2+ 8]
-    psubw   m3, [r2+16]
-    psubw   m4, [r2+24]
-    ABSW2   m1, m2, m1, m2, m5, m6
-    ABSW2   m3, m4, m3, m4, m7, m5
-    lea     r0, [r0+2*r1]
-    lea     r2, [r2+2*r3]
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
-%endmacro
-
-%macro SAD_INC_2x8P_MMX 0
-    movu    m1, [r0+0]
-    movu    m2, [r0+8]
-    movu    m3, [r0+2*r1+0]
-    movu    m4, [r0+2*r1+8]
-    psubw   m1, [r2+0]
-    psubw   m2, [r2+8]
-    psubw   m3, [r2+2*r3+0]
-    psubw   m4, [r2+2*r3+8]
-    ABSW2   m1, m2, m1, m2, m5, m6
-    ABSW2   m3, m4, m3, m4, m7, m5
-    lea     r0, [r0+4*r1]
-    lea     r2, [r2+4*r3]
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
-%endmacro
-
-%macro SAD_INC_2x4P_MMX 0
-    movu    m1, [r0]
-    movu    m2, [r0+2*r1]
-    psubw   m1, [r2]
-    psubw   m2, [r2+2*r3]
-    ABSW2   m1, m2, m1, m2, m3, m4
-    lea     r0, [r0+4*r1]
-    lea     r2, [r2+4*r3]
-    paddw   m0, m1
-    paddw   m0, m2
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
-%macro SAD_MMX 3
-cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
-    pxor    m0, m0
-%if %2 == 4
-    SAD_INC_%3x%1P_MMX
-    SAD_INC_%3x%1P_MMX
-%else
-    mov    r4d, %2/%3
-.loop:
-    SAD_INC_%3x%1P_MMX
-    dec    r4d
-    jg .loop
-%endif
-%if %1*%2 == 256
-    HADDUW  m0, m1
-%else
-    HADDW   m0, m1
-%endif
-    movd   eax, m0
-    RET
-%endmacro
-
-INIT_MMX mmx2
-SAD_MMX 16, 16, 1
-SAD_MMX 16,  8, 1
-SAD_MMX  8, 16, 2
-SAD_MMX  8,  8, 2
-SAD_MMX  8,  4, 2
-SAD_MMX  4,  8, 2
-SAD_MMX  4,  4, 2
-INIT_MMX ssse3
-SAD_MMX  4,  8, 2
-SAD_MMX  4,  4, 2
-
-;=============================================================================
-; SAD XMM
-;=============================================================================
-
-%macro SAD_INC_2ROW 1
-%if 2*%1 > mmsize
-    movu    m1, [r2+ 0]
-    movu    m2, [r2+16]
-    movu    m3, [r2+2*r3+ 0]
-    movu    m4, [r2+2*r3+16]
-    psubw   m1, [r0+ 0]
-    psubw   m2, [r0+16]
-    psubw   m3, [r0+2*r1+ 0]
-    psubw   m4, [r0+2*r1+16]
-    ABSW2   m1, m2, m1, m2, m5, m6
-    lea     r0, [r0+4*r1]
-    lea     r2, [r2+4*r3]
-    ABSW2   m3, m4, m3, m4, m7, m5
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
-%else
-    movu    m1, [r2]
-    movu    m2, [r2+2*r3]
-    psubw   m1, [r0]
-    psubw   m2, [r0+2*r1]
-    ABSW2   m1, m2, m1, m2, m3, m4
-    lea     r0, [r0+4*r1]
-    lea     r2, [r2+4*r3]
-    paddw   m0, m1
-    paddw   m0, m2
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
-%macro SAD 2
-cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
-    pxor    m0, m0
-%if %2 == 4
-    SAD_INC_2ROW %1
-    SAD_INC_2ROW %1
-%else
-    mov    r4d, %2/2
-.loop:
-    SAD_INC_2ROW %1
-    dec    r4d
-    jg .loop
-%endif
-    HADDW   m0, m1
-    movd   eax, xm0
-    RET
-%endmacro
-
-INIT_XMM sse2
-SAD 16, 16
-SAD 16,  8
-SAD  8, 16
-SAD  8,  8
-SAD  8,  4
-INIT_XMM sse2, aligned
-SAD 16, 16
-SAD 16,  8
-SAD  8, 16
-SAD  8,  8
-INIT_XMM ssse3
-SAD 16, 16
-SAD 16,  8
-SAD  8, 16
-SAD  8,  8
-SAD  8,  4
-INIT_XMM ssse3, aligned
-SAD 16, 16
-SAD 16,  8
-SAD  8, 16
-SAD  8,  8
-INIT_YMM avx2
-SAD 16, 16
-SAD 16,  8
-
-;=============================================================================
-; SAD x3/x4
-;=============================================================================
-
-%macro SAD_X3_INC_P 0
-    add     r0, 4*FENC_STRIDE
-    lea     r1, [r1+4*r4]
-    lea     r2, [r2+4*r4]
-    lea     r3, [r3+4*r4]
-%endmacro
-
-%macro SAD_X3_ONE_START 0
-    mova    m3, [r0]
-    movu    m0, [r1]
-    movu    m1, [r2]
-    movu    m2, [r3]
-    psubw   m0, m3
-    psubw   m1, m3
-    psubw   m2, m3
-    ABSW2   m0, m1, m0, m1, m4, m5
-    ABSW    m2, m2, m6
-%endmacro
-
-%macro SAD_X3_ONE 2
-    mova    m6, [r0+%1]
-    movu    m3, [r1+%2]
-    movu    m4, [r2+%2]
-    movu    m5, [r3+%2]
-    psubw   m3, m6
-    psubw   m4, m6
-    psubw   m5, m6
-    ABSW2   m3, m4, m3, m4, m7, m6
-    ABSW    m5, m5, m6
-    paddw   m0, m3
-    paddw   m1, m4
-    paddw   m2, m5
-%endmacro
-
-%macro SAD_X3_END 2
-%if mmsize == 8 && %1*%2 == 256
-    HADDUW   m0, m3
-    HADDUW   m1, m4
-    HADDUW   m2, m5
-%else
-    HADDW    m0, m3
-    HADDW    m1, m4
-    HADDW    m2, m5
-%endif
-%if UNIX64
-    movd [r5+0], xm0
-    movd [r5+4], xm1
-    movd [r5+8], xm2
-%else
-    mov      r0, r5mp
-    movd [r0+0], xm0
-    movd [r0+4], xm1
-    movd [r0+8], xm2
-%endif
-    RET
-%endmacro
-
-%macro SAD_X4_INC_P 0
-    add     r0, 4*FENC_STRIDE
-    lea     r1, [r1+4*r5]
-    lea     r2, [r2+4*r5]
-    lea     r3, [r3+4*r5]
-    lea     r4, [r4+4*r5]
-%endmacro
-
-%macro SAD_X4_ONE_START 0
-    mova    m4, [r0]
-    movu    m0, [r1]
-    movu    m1, [r2]
-    movu    m2, [r3]
-    movu    m3, [r4]
-    psubw   m0, m4
-    psubw   m1, m4
-    psubw   m2, m4
-    psubw   m3, m4
-    ABSW2   m0, m1, m0, m1, m5, m6
-    ABSW2   m2, m3, m2, m3, m4, m7
-%endmacro
-
-%macro SAD_X4_ONE 2
-    mova    m4, [r0+%1]
-    movu    m5, [r1+%2]
-    movu    m6, [r2+%2]
-%if num_mmregs > 8
-    movu    m7, [r3+%2]
-    movu    m8, [r4+%2]
-    psubw   m5, m4
-    psubw   m6, m4
-    psubw   m7, m4
-    psubw   m8, m4
-    ABSW2   m5, m6, m5, m6, m9, m10
-    ABSW2   m7, m8, m7, m8, m9, m10
-    paddw   m0, m5
-    paddw   m1, m6
-    paddw   m2, m7
-    paddw   m3, m8
-%elif cpuflag(ssse3)
-    movu    m7, [r3+%2]
-    psubw   m5, m4
-    psubw   m6, m4
-    psubw   m7, m4
-    movu    m4, [r4+%2]
-    pabsw   m5, m5
-    psubw   m4, [r0+%1]
-    pabsw   m6, m6
-    pabsw   m7, m7
-    pabsw   m4, m4
-    paddw   m0, m5
-    paddw   m1, m6
-    paddw   m2, m7
-    paddw   m3, m4
-%else ; num_mmregs == 8 && !ssse3
-    psubw   m5, m4
-    psubw   m6, m4
-    ABSW    m5, m5, m7
-    ABSW    m6, m6, m7
-    paddw   m0, m5
-    paddw   m1, m6
-    movu    m5, [r3+%2]
-    movu    m6, [r4+%2]
-    psubw   m5, m4
-    psubw   m6, m4
-    ABSW2   m5, m6, m5, m6, m7, m4
-    paddw   m2, m5
-    paddw   m3, m6
-%endif
-%endmacro
-
-%macro SAD_X4_END 2
-%if mmsize == 8 && %1*%2 == 256
-    HADDUW    m0, m4
-    HADDUW    m1, m5
-    HADDUW    m2, m6
-    HADDUW    m3, m7
-%else
-    HADDW     m0, m4
-    HADDW     m1, m5
-    HADDW     m2, m6
-    HADDW     m3, m7
-%endif
-    mov       r0, r6mp
-    movd [r0+ 0], xm0
-    movd [r0+ 4], xm1
-    movd [r0+ 8], xm2
-    movd [r0+12], xm3
-    RET
-%endmacro
-
-%macro SAD_X_2xNP 4
-    %assign x %3
-%rep %4
-    SAD_X%1_ONE x*mmsize, x*mmsize
-    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
-    %assign x x+1
-%endrep
-%endmacro
-
-%macro PIXEL_VSAD 0
-cglobal pixel_vsad, 3,3,8
-    mova      m0, [r0]
-    mova      m1, [r0+16]
-    mova      m2, [r0+2*r1]
-    mova      m3, [r0+2*r1+16]
-    lea       r0, [r0+4*r1]
-    psubw     m0, m2
-    psubw     m1, m3
-    ABSW2     m0, m1, m0, m1, m4, m5
-    paddw     m0, m1
-    sub      r2d, 2
-    je .end
-.loop:
-    mova      m4, [r0]
-    mova      m5, [r0+16]
-    mova      m6, [r0+2*r1]
-    mova      m7, [r0+2*r1+16]
-    lea       r0, [r0+4*r1]
-    psubw     m2, m4
-    psubw     m3, m5
-    psubw     m4, m6
-    psubw     m5, m7
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    ABSW      m4, m4, m1
-    ABSW      m5, m5, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    paddw     m0, m4
-    paddw     m0, m5
-    mova      m2, m6
-    mova      m3, m7
-    sub r2d, 2
-    jg .loop
-.end:
-%if BIT_DEPTH == 9
-    HADDW     m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
-%else
-    HADDUW    m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
-%endif
-    movd     eax, m0
-    RET
-%endmacro
-INIT_XMM sse2
-PIXEL_VSAD
-INIT_XMM ssse3
-PIXEL_VSAD
-INIT_XMM xop
-PIXEL_VSAD
-
-INIT_YMM avx2
-cglobal pixel_vsad, 3,3
-    mova      m0, [r0]
-    mova      m1, [r0+2*r1]
-    lea       r0, [r0+4*r1]
-    psubw     m0, m1
-    pabsw     m0, m0
-    sub      r2d, 2
-    je .end
-.loop:
-    mova      m2, [r0]
-    mova      m3, [r0+2*r1]
-    lea       r0, [r0+4*r1]
-    psubw     m1, m2
-    psubw     m2, m3
-    pabsw     m1, m1
-    pabsw     m2, m2
-    paddw     m0, m1
-    paddw     m0, m2
-    mova      m1, m3
-    sub      r2d, 2
-    jg .loop
-.end:
-%if BIT_DEPTH == 9
-    HADDW     m0, m1
-%else
-    HADDUW    m0, m1
-%endif
-    movd     eax, xm0
-    RET
-
-;-----------------------------------------------------------------------------
-; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
-;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X 3
-cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
-    %assign regnum %1+1
-    %xdefine STRIDE r %+ regnum
-    mov     r6, %3/2-1
-    SAD_X%1_ONE_START
-    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
-    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
-.loop:
-    SAD_X%1_INC_P
-    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
-    dec     r6
-    jg .loop
-%if %1 == 4
-    mov     r6, r6m
-%endif
-    SAD_X%1_END %2, %3
-%endmacro
-
-INIT_MMX mmx2
-%define XMM_REGS 0
-SAD_X 3, 16, 16
-SAD_X 3, 16,  8
-SAD_X 3,  8, 16
-SAD_X 3,  8,  8
-SAD_X 3,  8,  4
-SAD_X 3,  4,  8
-SAD_X 3,  4,  4
-SAD_X 4, 16, 16
-SAD_X 4, 16,  8
-SAD_X 4,  8, 16
-SAD_X 4,  8,  8
-SAD_X 4,  8,  4
-SAD_X 4,  4,  8
-SAD_X 4,  4,  4
-INIT_MMX ssse3
-SAD_X 3,  4,  8
-SAD_X 3,  4,  4
-SAD_X 4,  4,  8
-SAD_X 4,  4,  4
-INIT_XMM ssse3
-%define XMM_REGS 7
-SAD_X 3, 16, 16
-SAD_X 3, 16,  8
-SAD_X 3,  8, 16
-SAD_X 3,  8,  8
-SAD_X 3,  8,  4
-%define XMM_REGS 9
-SAD_X 4, 16, 16
-SAD_X 4, 16,  8
-SAD_X 4,  8, 16
-SAD_X 4,  8,  8
-SAD_X 4,  8,  4
-INIT_XMM sse2
-%define XMM_REGS 8
-SAD_X 3, 16, 16
-SAD_X 3, 16,  8
-SAD_X 3,  8, 16
-SAD_X 3,  8,  8
-SAD_X 3,  8,  4
-%define XMM_REGS 11
-SAD_X 4, 16, 16
-SAD_X 4, 16,  8
-SAD_X 4,  8, 16
-SAD_X 4,  8,  8
-SAD_X 4,  8,  4
-INIT_XMM xop
-%define XMM_REGS 7
-SAD_X 3, 16, 16
-SAD_X 3, 16,  8
-SAD_X 3,  8, 16
-SAD_X 3,  8,  8
-SAD_X 3,  8,  4
-%define XMM_REGS 9
-SAD_X 4, 16, 16
-SAD_X 4, 16,  8
-SAD_X 4,  8, 16
-SAD_X 4,  8,  8
-SAD_X 4,  8,  4
-INIT_YMM avx2
-%define XMM_REGS 7
-SAD_X 3, 16, 16
-SAD_X 3, 16,  8
-%define XMM_REGS 9
-SAD_X 4, 16, 16
-SAD_X 4, 16,  8
-
-;-----------------------------------------------------------------------------
-; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
-;-----------------------------------------------------------------------------
-
-%macro INTRA_SAD_X3_4x4 0
-cglobal intra_sad_x3_4x4, 3,3,7
-%if cpuflag(ssse3)
-    movddup   m0, [r1-1*FDEC_STRIDEB]
-%else
-    movq      m0, [r1-1*FDEC_STRIDEB]
-    punpcklqdq m0, m0
-%endif
-    movq      m1, [r0+0*FENC_STRIDEB]
-    movq      m2, [r0+2*FENC_STRIDEB]
-    pshuflw   m6, m0, q1032
-    paddw     m6, m0
-    pshuflw   m5, m6, q2301
-    paddw     m6, m5
-    punpcklqdq m6, m6       ; A+B+C+D 8 times
-    movhps    m1, [r0+1*FENC_STRIDEB]
-    movhps    m2, [r0+3*FENC_STRIDEB]
-    psubw     m3, m1, m0
-    psubw     m0, m2
-    ABSW2     m3, m0, m3, m0, m4, m5
-    paddw     m0, m3
-    movd      m3, [r1+0*FDEC_STRIDEB-4]
-    movd      m4, [r1+2*FDEC_STRIDEB-4]
-    movhps    m3, [r1+1*FDEC_STRIDEB-8]
-    movhps    m4, [r1+3*FDEC_STRIDEB-8]
-    pshufhw   m3, m3, q3333
-    pshufhw   m4, m4, q3333
-    pshuflw   m3, m3, q1111 ; FF FF EE EE
-    pshuflw   m4, m4, q1111 ; HH HH GG GG
-    paddw     m5, m3, m4
-    paddw     m6, [pw_4]
-    paddw     m6, m5
-    pshufd    m5, m5, q1032
-    paddw     m5, m6
-    psrlw     m5, 3
-    psubw     m6, m5, m2
-    psubw     m5, m1
-    psubw     m1, m3
-    psubw     m2, m4
-    ABSW2     m5, m6, m5, m6, m3, m4
-    ABSW2     m1, m2, m1, m2, m3, m4
-    paddw     m5, m6
-    paddw     m1, m2
-%if cpuflag(ssse3)
-    phaddw    m0, m1
-    movhlps   m3, m5
-    paddw     m5, m3
-    phaddw    m0, m5
-    pmaddwd   m0, [pw_1]
-    mova    [r2], m0
-%else
-    HADDW     m0, m3
-    HADDW     m1, m3
-    HADDW     m5, m3
-    movd    [r2], m0 ; V prediction cost
-    movd  [r2+4], m1 ; H prediction cost
-    movd  [r2+8], m5 ; DC prediction cost
-%endif
-    RET
-%endmacro
-
-INIT_XMM sse2
-INTRA_SAD_X3_4x4
-INIT_XMM ssse3
-INTRA_SAD_X3_4x4
-INIT_XMM avx
-INTRA_SAD_X3_4x4
-
-;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
-;-----------------------------------------------------------------------------
-
-;m0 = DC
-;m6 = V
-;m7 = H
-;m1 = DC score
-;m2 = V score
-;m3 = H score
-;m5 = temp
-;m4 = pixel row
-
-%macro INTRA_SAD_HVDC_ITER 2
-    mova        m4, [r0+(%1-4)*FENC_STRIDEB]
-    psubw       m4, m0
-    ABSW        m4, m4, m5
-    ACCUM    paddw, 1, 4, %1
-    mova        m4, [r0+(%1-4)*FENC_STRIDEB]
-    psubw       m4, m6
-    ABSW        m4, m4, m5
-    ACCUM    paddw, 2, 4, %1
-    pshufd      m5, m7, %2
-    psubw       m5, [r0+(%1-4)*FENC_STRIDEB]
-    ABSW        m5, m5, m4
-    ACCUM    paddw, 3, 5, %1
-%endmacro
-
-%macro INTRA_SAD_X3_8x8 0
-cglobal intra_sad_x3_8x8, 3,3,8
-    add         r0, 4*FENC_STRIDEB
-    movu        m0, [r1+7*SIZEOF_PIXEL]
-    mova        m6, [r1+16*SIZEOF_PIXEL] ;V prediction
-    mova        m7, m0
-    paddw       m0, m6
-    punpckhwd   m7, m7
-    HADDW       m0, m4
-    paddw       m0, [pw_8]
-    psrlw       m0, 4
-    SPLATW      m0, m0
-    INTRA_SAD_HVDC_ITER 0, q3333
-    INTRA_SAD_HVDC_ITER 1, q2222
-    INTRA_SAD_HVDC_ITER 2, q1111
-    INTRA_SAD_HVDC_ITER 3, q0000
-    movq        m7, [r1+7*SIZEOF_PIXEL]
-    punpcklwd   m7, m7
-    INTRA_SAD_HVDC_ITER 4, q3333
-    INTRA_SAD_HVDC_ITER 5, q2222
-    INTRA_SAD_HVDC_ITER 6, q1111
-    INTRA_SAD_HVDC_ITER 7, q0000
-%if cpuflag(ssse3)
-    phaddw      m2, m3     ; 2 2 2 2 3 3 3 3
-    movhlps     m3, m1
-    paddw       m1, m3     ; 1 1 1 1 _ _ _ _
-    phaddw      m2, m1     ; 2 2 3 3 1 1 _ _
-    pmaddwd     m2, [pw_1] ; 2 3 1 _
-    mova      [r2], m2
-%else
-    HADDW       m2, m4
-    HADDW       m3, m4
-    HADDW       m1, m4
-    movd    [r2+0], m2
-    movd    [r2+4], m3
-    movd    [r2+8], m1
-%endif
-    RET
-%endmacro
-
-INIT_XMM sse2
-INTRA_SAD_X3_8x8
-INIT_XMM ssse3
-INTRA_SAD_X3_8x8
-
-%macro INTRA_SAD_HVDC_ITER_YMM 2
-    mova       xm4, [r0+(%1-4)*FENC_STRIDEB]
-    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
-    pshufd      m5, m7, %2
-    psubw       m5, m4
-    pabsw       m5, m5
-    ACCUM    paddw, 2, 5, %1 ; H
-    psubw       m5, m4, m6
-    psubw       m4, m0
-    pabsw       m5, m5
-    pabsw       m4, m4
-    ACCUM    paddw, 1, 5, %1 ; V
-    ACCUM    paddw, 3, 4, %1 ; DC
-%endmacro
-
-INIT_YMM avx2
-cglobal intra_sad_x3_8x8, 3,3,8
-    add            r0, 4*FENC_STRIDEB
-    movu          xm0, [r1+7*SIZEOF_PIXEL]
-    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
-    vpermq         m7, m0, q0011
-    paddw         xm0, xm6
-    paddw         xm0, [pw_1] ; equal to +8 after HADDW
-    HADDW         xm0, xm4
-    psrld         xm0, 4
-    vpbroadcastw   m0, xm0
-    punpcklwd      m7, m7
-    INTRA_SAD_HVDC_ITER_YMM 0, q3333
-    INTRA_SAD_HVDC_ITER_YMM 1, q2222
-    INTRA_SAD_HVDC_ITER_YMM 2, q1111
-    INTRA_SAD_HVDC_ITER_YMM 3, q0000
-    phaddw         m1, m2     ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
-    punpckhqdq     m2, m3, m3
-    paddw          m3, m2     ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
-    phaddw         m1, m3     ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
-    vextracti128  xm2, m1, 1
-    paddw         xm1, xm2    ; 1 1 2 2 3 3 _ _
-    pmaddwd       xm1, [pw_1] ; 1 2 3 _
-    mova         [r2], xm1
-    RET
diff --git a/android/src/main/libenc/jni/libx264/common/x86/trellis-64.asm b/android/src/main/libenc/jni/libx264/common/x86/trellis-64.asm
deleted file mode 100755
index 0c25914..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/trellis-64.asm
+++ /dev/null
@@ -1,893 +0,0 @@
-;*****************************************************************************
-;* trellis-64.asm: x86_64 trellis quantization
-;*****************************************************************************
-;* Copyright (C) 2012-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-; This is a pretty straight-forward translation of the C code, except:
-; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
-; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
-;   4x parallel, handling 4 node_ctxs of the same coef (even if some of those
-;   nodes are invalid).
-; * Interprocedural register allocation. Eliminates argument-passing overhead
-;   to trellis_coef* subroutines. Also reduces codesize.
-
-; Optimizations that I tried, and rejected because they were not faster:
-; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
-;   Costs too much icache compared to the negligible speedup.
-; * There are only 21 possible sets of live node_ctxs; we could keep track of
-;   exactly which set we're in and feed that (along with abs_level) into a jump
-;   table instead of the switch to select a trellis_coef subroutine. This would
-;   eliminate all branches about which node_ctxs are live, but costs either a
-;   bunch of icache or a bunch of call/ret, and the jump table itself is
-;   unpredictable.
-; * Separate versions of trellis_coef* depending on whether we're doing the 1st
-;   or the 2nd of the two abs_level candidates. This would eliminate some
-;   branches about if(score is better).
-; * Special case more values of coef. I had a coef2 at some intermediate point
-;   in the optimization process, but it didn't end up worthwhile in conjunction
-;   with all the other optimizations.
-; * Unroll or simd writeback. I don't know why this didn't help.
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA
-
-pd_m16: times 4 dd -16
-sq_1: dq 1, 0
-pq_128: times 2 dq 128
-pq_ffffffff: times 2 dq 0xffffffff
-
-cextern pd_8
-cextern pd_0123
-cextern pd_4567
-cextern cabac_entropy
-cextern cabac_transition
-cextern cabac_size_unary
-cextern cabac_transition_unary
-cextern dct4_weight_tab
-cextern dct8_weight_tab
-cextern dct4_weight2_tab
-cextern dct8_weight2_tab
-cextern last_coeff_flag_offset_8x8
-cextern significant_coeff_flag_offset_8x8
-cextern coeff_flag_offset_chroma_422_dc
-
-SECTION .text
-
-%define TRELLIS_SCORE_BIAS 1<<60
-%define SIZEOF_NODE 16
-%define CABAC_SIZE_BITS 8
-%define LAMBDA_BITS 4
-
-%macro SQUARE 2 ; dst, tmp
-    ; could use pmuldq here, to eliminate the abs. but that would involve
-    ; templating a sse4 version of all of trellis, for negligible speedup.
-%if cpuflag(ssse3)
-    pabsd   m%1, m%1
-    pmuludq m%1, m%1
-%elif HIGH_BIT_DEPTH
-    ABSD    m%2, m%1
-    SWAP     %1, %2
-    pmuludq m%1, m%1
-%else
-    pmuludq m%1, m%1
-    pand    m%1, [pq_ffffffff]
-%endif
-%endmacro
-
-%macro LOAD_DUP 2 ; dst, src
-%if cpuflag(ssse3)
-    movddup    %1, %2
-%else
-    movd       %1, %2
-    punpcklqdq %1, %1
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int trellis_cabac_4x4_psy(
-;     const int *unquant_mf, const uint8_t *zigzag, int lambda2,
-;     int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
-;     uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
-;     uint64_t level_state0, uint16_t level_state1,
-;     int b_ac, dctcoef *fenc_dct, int psy_trellis )
-;-----------------------------------------------------------------------------
-%macro TRELLIS 4
-%define num_coefs %2
-%define dc %3
-%define psy %4
-cglobal %1, 4,15,9
-    %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
-    %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
-    SUB  rsp, pad
-    DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
-%if WIN64
-    %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
-%else
-    %define level_statem rsp+stack_offset+32
-%endif
-    %define b_acm r11m ; 4x4 only
-    %define b_interlacedm r11m ; 8x8 only
-    %define i_coefsm1 r11m ; dc only
-    %define fenc_dctm r12m
-    %define psy_trellism r13m
-%if num_coefs == 64
-    shl dword b_interlacedm, 6
-    %define dct_weight1_tab dct8_weight_tab
-    %define dct_weight2_tab dct8_weight2_tab
-%else
-    %define dct_weight1_tab dct4_weight_tab
-    %define dct_weight2_tab dct4_weight2_tab
-%endif
-
-    %define stack rsp
-    %define last_nnzm [stack+0]
-    %define zigzagm   [stack+8]
-    mov     last_nnzm, iid
-    mov     zigzagm,   zigzagq
-%if WIN64 == 0
-    %define orig_coefsm  [stack+16]
-    %define quant_coefsm [stack+24]
-    mov     orig_coefsm,  orig_coefsq
-    mov     quant_coefsm, quant_coefsq
-%endif
-    %define unquant_mfm   [stack+32]
-    %define levelgt1_ctxm [stack+40]
-    %define ssd            stack+48
-    %define cost_siglast   stack+80
-    %define level_tree     stack+96
-
-    ; trellis_node_t is layed out differently than C.
-    ; struct-of-arrays rather than array-of-structs, for simd.
-    %define nodes_curq r7
-    %define nodes_prevq r8
-    %define node_score(x) x*8
-    %define node_level_idx(x) 64+x*4
-    %define node_cabac_state(x) 96+x*4
-    lea nodes_curq, [level_tree + level_tree_size]
-    lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
-    mov        r6, TRELLIS_SCORE_BIAS
-    mov       [nodes_curq + node_score(0)], r6
-    mov dword [nodes_curq + node_level_idx(0)], 0
-    movd      mm0, [level_statem + 0]
-    punpcklbw mm0, [level_statem + 4]
-    punpcklwd mm0, [level_statem + 8]
-    %define level_state_packed mm0 ; version for copying into node.cabac_state
-    pcmpeqb    m7, m7 ; TRELLIS_SCORE_MAX
-    movq [nodes_curq + node_score(1)], m7
-    mova [nodes_curq + node_score(2)], m7
-
-    %define levels_usedq r4
-    %define levels_usedd r4d
-    mov dword [level_tree], 0
-    mov       levels_usedd, 1
-
-    %define abs_levelq r9
-    %define abs_leveld r9d
-    %define abs_coefq r14
-    %define zigzagiq r5
-    %define zigzagid r5d
-
-%if num_coefs == 8
-    mov dword levelgt1_ctxm, 8
-%else
-    mov dword levelgt1_ctxm, 9
-%endif
-%if psy
-    LOAD_DUP m6, psy_trellism
-    %define psy_trellis m6
-%elif dc
-    LOAD_DUP   m6, [unquant_mfq]
-    paddd      m6, m6
-    %define unquant_mf m6
-%endif
-%ifdef PIC
-%if dc == 0
-    mov unquant_mfm, unquant_mfq
-%endif
-    ; Keep a single offset register to PICify all global constants.
-    ; They're all relative to "beginning of this asm file's .text section",
-    ; even tables that aren't in this file.
-    ; (Any address in .text would work, this one was just convenient.)
-    lea r0, [$$]
-    %define GLOBAL +r0-$$
-%else
-    %define GLOBAL
-%endif
-
-    TRELLIS_LOOP 0 ; node_ctx 0..3
-    TRELLIS_LOOP 1 ; node_ctx 1..7
-
-.writeback:
-    ; int level = bnode->level_idx;
-    ; for( int i = b_ac; i <= last_nnz; i++ )
-    ;     dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
-    ;     level = level_tree[level].next;
-    mov    iid, last_nnzm
-    add zigzagq, iiq
-    neg    iiq
-%if num_coefs == 16 && dc == 0
-    mov    r2d, b_acm
-    add    iiq, r2
-%endif
-    %define dctq r10
-    mov    r0d, [nodes_curq + node_level_idx(0) + rax*4]
-.writeback_loop:
-    movzx   r2, byte [zigzagq + iiq]
-%if cpuflag(ssse3)
-    movd    m0, [level_tree + r0*4]
-    movzx   r0, word [level_tree + r0*4]
-    psrld   m0, 16
-    movd    m1, [dctq + r2*SIZEOF_DCTCOEF]
-%if HIGH_BIT_DEPTH
-    psignd  m0, m1
-    movd [dctq + r2*SIZEOF_DCTCOEF], m0
-%else
-    psignw  m0, m1
-    movd   r4d, m0
-    mov  [dctq + r2*SIZEOF_DCTCOEF], r4w
-%endif
-%else
-    mov    r5d, [level_tree + r0*4]
-%if HIGH_BIT_DEPTH
-    mov    r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
-%else
-    movsx  r4d, word [dctq + r2*SIZEOF_DCTCOEF]
-%endif
-    movzx  r0d, r5w
-    sar    r4d, 31
-    shr    r5d, 16
-    xor    r5d, r4d
-    sub    r5d, r4d
-%if HIGH_BIT_DEPTH
-    mov  [dctq + r2*SIZEOF_DCTCOEF], r5d
-%else
-    mov  [dctq + r2*SIZEOF_DCTCOEF], r5w
-%endif
-%endif
-    inc    iiq
-    jle .writeback_loop
-
-    mov eax, 1
-.return:
-    ADD rsp, pad
-    RET
-
-%if num_coefs == 16 && dc == 0
-.return_zero:
-    pxor       m0, m0
-    mova [r10+ 0], m0
-    mova [r10+16], m0
-%if HIGH_BIT_DEPTH
-    mova [r10+32], m0
-    mova [r10+48], m0
-%endif
-    jmp .return
-%endif
-%endmacro ; TRELLIS
-
-
-
-%macro TRELLIS_LOOP 1 ; ctx_hi
-.i_loop%1:
-    ; if( !quant_coefs[i] )
-    mov   r6, quant_coefsm
-%if HIGH_BIT_DEPTH
-    mov   abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
-%else
-    movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
-%endif
-
-    ; int sigindex  = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
-    ;                 num_coefs == 8  ? coeff_flag_offset_chroma_422_dc[i] : i;
-    mov    r10, cabac_state_sigm
-%if num_coefs == 64
-    mov    r6d, b_interlacedm
-%ifdef PIC
-    add    r6d, iid
-    movzx  r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
-%else
-    movzx  r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
-%endif
-    movzx  r10, byte [r10 + r6]
-%elif num_coefs == 8
-    movzx  r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
-    movzx  r10, byte [r10 + r13]
-%else
-    movzx  r10, byte [r10 + iiq]
-%endif
-
-    test  abs_leveld, abs_leveld
-    jnz %%.nonzero_quant_coef
-
-%if %1 == 0
-    ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
-    ;               * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
-    ; nodes_cur[0].score -= cost_sig0;
-    movzx  r10, word [cabac_entropy + r10*2 GLOBAL]
-    imul   r10, lambda2q
-    shr    r10, CABAC_SIZE_BITS - LAMBDA_BITS
-    sub   [nodes_curq + node_score(0)], r10
-%endif
-    ZERO_LEVEL_IDX %1, cur
-    jmp .i_continue%1
-
-%%.nonzero_quant_coef:
-    ; int sign_coef = orig_coefs[zigzag[i]];
-    ; int abs_coef = abs( sign_coef );
-    ; int q = abs( quant_coefs[i] );
-    movzx   zigzagid, byte [zigzagq+iiq]
-    movd    m0, abs_leveld
-    mov     r6, orig_coefsm
-%if HIGH_BIT_DEPTH
-    LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
-%else
-    LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
-    psrad    m1, 16     ; sign_coef
-%endif
-    punpcklqdq m0, m0 ; quant_coef
-%if cpuflag(ssse3)
-    pabsd   m0, m0
-    pabsd   m2, m1 ; abs_coef
-%else
-    pxor    m8, m8
-    pcmpgtd m8, m1 ; sign_mask
-    pxor    m0, m8
-    pxor    m2, m1, m8
-    psubd   m0, m8
-    psubd   m2, m8
-%endif
-    psubd   m0, [sq_1] ; abs_level
-    movd  abs_leveld, m0
-
-    xchg  nodes_curq, nodes_prevq
-
-    ; if( i < num_coefs-1 )
-    ;     int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
-    ;                     num_coefs == 8  ? coeff_flag_offset_chroma_422_dc[i] : i
-    ;     cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
-    ;     cost_sig1       = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
-    ;     cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
-    ;     cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
-%if %1 == 0
-%if dc && num_coefs != 8
-    cmp    iid, i_coefsm1
-%else
-    cmp    iid, num_coefs-1
-%endif
-    je %%.zero_siglast
-%endif
-    movzx  r11, word [cabac_entropy + r10*2 GLOBAL]
-    xor    r10, 1
-    movzx  r12, word [cabac_entropy + r10*2 GLOBAL]
-    mov   [cost_siglast+0], r11d
-    mov    r10, cabac_state_lastm
-%if num_coefs == 64
-    movzx  r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
-    movzx  r10, byte [r10 + r6]
-%elif num_coefs == 8
-    movzx  r10, byte [r10 + r13]
-%else
-    movzx  r10, byte [r10 + iiq]
-%endif
-    movzx  r11, word [cabac_entropy + r10*2 GLOBAL]
-    add    r11, r12
-    mov   [cost_siglast+4], r11d
-%if %1 == 0
-    xor    r10, 1
-    movzx  r10, word [cabac_entropy + r10*2 GLOBAL]
-    add    r10, r12
-    mov   [cost_siglast+8], r10d
-%endif
-%%.skip_siglast:
-
-    ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
-    ; int d = abs_coef - unquant_abs_level;
-    ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
-%if dc
-    pmuludq m0, unquant_mf
-%else
-%ifdef PIC
-    mov    r10, unquant_mfm
-    LOAD_DUP m3, [r10 + zigzagiq*4]
-%else
-    LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
-%endif
-    pmuludq m0, m3
-%endif
-    paddd   m0, [pq_128]
-    psrld   m0, 8 ; unquant_abs_level
-%if psy || dc == 0
-    mova    m4, m0
-%endif
-    psubd   m0, m2
-    SQUARE   0, 3
-%if dc
-    psllq   m0, 8
-%else
-    LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
-    pmuludq m0, m5
-%endif
-
-%if psy
-    test   iid, iid
-    jz %%.dc_rounding
-    ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
-    ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
-    ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
-    ; ssd1[k] -= psy_weight * psy_value;
-    mov     r6, fenc_dctm
-%if HIGH_BIT_DEPTH
-    LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
-%else
-    LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
-    psrad   m3, 16 ; orig_coef
-%endif
-%if cpuflag(ssse3)
-    psignd  m4, m1 ; SIGN(unquant_abs_level, sign_coef)
-%else
-    PSIGN d, m4, m8
-%endif
-    psubd   m3, m1 ; predicted_coef
-    paddd   m4, m3
-%if cpuflag(ssse3)
-    pabsd   m4, m4
-%else
-    ABSD    m3, m4
-    SWAP     4, 3
-%endif
-    LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
-    pmuludq m1, psy_trellis
-    pmuludq m4, m1
-    psubq   m0, m4
-%if %1
-%%.dc_rounding:
-%endif
-%endif
-%if %1 == 0
-    mova [ssd], m0
-%endif
-
-%if dc == 0 && %1 == 0
-    test   iid, iid
-    jnz %%.skip_dc_rounding
-%%.dc_rounding:
-    ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
-    ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
-    ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
-    psrad   m1, 31 ; sign_coef>>31
-    paddd   m4, [pd_8]
-    paddd   m4, m1
-    pand    m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
-    psubd   m4, m2 ; d
-    SQUARE   4, 3
-    pmuludq m4, m5
-    mova [ssd], m4
-%%.skip_dc_rounding:
-%endif
-    mova [ssd+16], m0
-
-    %assign stack_offset_bak stack_offset
-    cmp abs_leveld, 1
-    jl %%.switch_coef0
-%if %1 == 0
-    mov    r10, [ssd] ; trellis_coef* args
-%endif
-    movq   r12, m0
-    ; for( int j = 0; j < 8; j++ )
-    ;     nodes_cur[j].score = TRELLIS_SCORE_MAX;
-%if cpuflag(ssse3)
-    mova [nodes_curq + node_score(0)], m7
-    mova [nodes_curq + node_score(2)], m7
-%else ; avoid store-forwarding stalls on k8/k10
-%if %1 == 0
-    movq [nodes_curq + node_score(0)], m7
-%endif
-    movq [nodes_curq + node_score(1)], m7
-    movq [nodes_curq + node_score(2)], m7
-    movq [nodes_curq + node_score(3)], m7
-%endif
-    mova [nodes_curq + node_score(4)], m7
-    mova [nodes_curq + node_score(6)], m7
-    je %%.switch_coef1
-%%.switch_coefn:
-    call trellis_coefn.entry%1
-    call trellis_coefn.entry%1b
-    jmp .i_continue1
-%%.switch_coef1:
-    call trellis_coef1.entry%1
-    call trellis_coefn.entry%1b
-    jmp .i_continue1
-%%.switch_coef0:
-    call trellis_coef0_%1
-    call trellis_coef1.entry%1b
-
-.i_continue%1:
-    dec iid
-%if num_coefs == 16 && dc == 0
-    cmp iid, b_acm
-%endif
-    jge .i_loop%1
-
-    call trellis_bnode_%1
-%if %1 == 0
-%if num_coefs == 16 && dc == 0
-    jz .return_zero
-%else
-    jz .return
-%endif
-    jmp .writeback
-
-%%.zero_siglast:
-    xor  r6d, r6d
-    mov [cost_siglast+0], r6
-    mov [cost_siglast+8], r6d
-    jmp %%.skip_siglast
-%endif
-%endmacro ; TRELLIS_LOOP
-
-; just a synonym for %if
-%macro IF0 1+
-%endmacro
-%macro IF1 1+
-    %1
-%endmacro
-
-%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
-    ; for( int j = 0; j < 8; j++ )
-    ;     nodes_cur[j].level_idx = levels_used;
-    ;     level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
-    ;     levels_used++;
-    add  levels_usedd, 3
-    and  levels_usedd, ~3 ; allow aligned stores
-    movd       m0, levels_usedd
-    pshufd     m0, m0, 0
-    IF%1 mova  m1, m0
-         paddd m0, [pd_0123]
-    IF%1 paddd m1, [pd_4567]
-         mova  m2, [nodes_%2q + node_level_idx(0)]
-    IF%1 mova  m3, [nodes_%2q + node_level_idx(4)]
-         mova [nodes_curq + node_level_idx(0)], m0
-    IF%1 mova [nodes_curq + node_level_idx(4)], m1
-         mova [level_tree + (levels_usedq+0)*4], m2
-    IF%1 mova [level_tree + (levels_usedq+4)*4], m3
-    add  levels_usedd, (1+%1)*4
-%endmacro
-
-INIT_XMM sse2
-TRELLIS trellis_cabac_4x4, 16, 0, 0
-TRELLIS trellis_cabac_8x8, 64, 0, 0
-TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
-TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
-TRELLIS trellis_cabac_dc, 16, 1, 0
-TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
-INIT_XMM ssse3
-TRELLIS trellis_cabac_4x4, 16, 0, 0
-TRELLIS trellis_cabac_8x8, 64, 0, 0
-TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
-TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
-TRELLIS trellis_cabac_dc, 16, 1, 0
-TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
-
-
-
-%define stack rsp+gprsize
-%define scoreq r14
-%define bitsq r13
-%define bitsd r13d
-
-INIT_XMM
-%macro clocal 1
-    ALIGN 16
-    global mangle(x264_%1)
-    mangle(x264_%1):
-    %1:
-    %assign stack_offset stack_offset_bak+gprsize
-%endmacro
-
-%macro TRELLIS_BNODE 1 ; ctx_hi
-clocal trellis_bnode_%1
-    ; int j = ctx_hi?1:0;
-    ; trellis_node_t *bnode = &nodes_cur[j];
-    ; while( ++j < (ctx_hi?8:4) )
-    ;     if( nodes_cur[j].score < bnode->score )
-    ;         bnode = &nodes_cur[j];
-%assign j %1
-    mov   rax, [nodes_curq + node_score(j)]
-    lea   rax, [rax*8 + j]
-%rep 3+3*%1
-%assign j j+1
-    mov   r11, [nodes_curq + node_score(j)]
-    lea   r11, [r11*8 + j]
-    cmp   rax, r11
-    cmova rax, r11
-%endrep
-    mov   r10, dctm
-    and   eax, 7
-    ret
-%endmacro ; TRELLIS_BNODE
-TRELLIS_BNODE 0
-TRELLIS_BNODE 1
-
-
-%macro TRELLIS_COEF0 1 ; ctx_hi
-clocal trellis_coef0_%1
-    ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
-    mov  r11d, [cost_siglast+0]
-    imul  r11, lambda2q
-    shr   r11, CABAC_SIZE_BITS - LAMBDA_BITS
-    add   r11, [ssd+16]
-%if %1 == 0
-    ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
-    mov  scoreq, [nodes_prevq + node_score(0)]
-    add  scoreq, [ssd]
-    sub  scoreq, r11
-    mov  [nodes_curq + node_score(0)], scoreq
-%endif
-    ; memcpy
-    mov  scoreq, [nodes_prevq + node_score(1)]
-    mov  [nodes_curq + node_score(1)], scoreq
-    mova m1, [nodes_prevq + node_score(2)]
-    mova [nodes_curq + node_score(2)], m1
-%if %1
-    mova m1, [nodes_prevq + node_score(4)]
-    mova [nodes_curq + node_score(4)], m1
-    mova m1, [nodes_prevq + node_score(6)]
-    mova [nodes_curq + node_score(6)], m1
-%endif
-    mov  r6d, [nodes_prevq + node_cabac_state(3)]
-    mov  [nodes_curq + node_cabac_state(3)], r6d
-%if %1
-    mova m1, [nodes_prevq + node_cabac_state(4)]
-    mova [nodes_curq + node_cabac_state(4)], m1
-%endif
-    ZERO_LEVEL_IDX %1, prev
-    ret
-%endmacro ; TRELLIS_COEF0
-TRELLIS_COEF0 0
-TRELLIS_COEF0 1
-
-
-
-%macro START_COEF 1 ; gt1
-    ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
-    mov  scoreq, [nodes_prevq + node_score(j)]
-%if j > 0
-    test scoreq, scoreq
-    js .ctx %+ nextj_if_invalid
-%endif
-
-    ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
-%if j >= 3
-    movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
-    movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
-%else
-    movzx r6d, byte [level_statem + coeff_abs_level1_offs]
-%endif
-%if %1
-    xor   r6d, 1
-%endif
-    movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
-
-    ; n.score += ssd;
-    ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
-%if j == 0
-    add  scoreq, r10
-    add  bitsd, [cost_siglast+8]
-%else
-    add  scoreq, r12
-    add  bitsd, [cost_siglast+4]
-%endif
-%endmacro ; START_COEF
-
-%macro END_COEF 1
-    ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
-    imul bitsq, lambda2q
-    shr  bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
-    add  scoreq, bitsq
-
-    ; if( n.score < nodes_cur[node_ctx].score )
-    ;     SET_LEVEL( n, abs_level );
-    ;     nodes_cur[node_ctx] = n;
-    cmp scoreq, [nodes_curq + node_score(node_ctx)]
-    jae .ctx %+ nextj_if_valid
-    mov [nodes_curq + node_score(node_ctx)], scoreq
-%if j == 2 || (j <= 3 && node_ctx == 4)
-    ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
-    movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
-%elif j >= 3
-    ; if we have updated before, then copy cabac_state from the parent node
-    mov  r6d, [nodes_prevq + node_cabac_state(j)]
-    mov [nodes_curq + node_cabac_state(node_ctx)], r6d
-%endif
-%if j >= 3 ; skip the transition if we're not going to reuse the context
-    mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
-%endif
-%if %1 && node_ctx == 7
-    mov  r6d, levelgt1_ctxm
-    mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
-%endif
-    mov  r6d, [nodes_prevq + node_level_idx(j)]
-%if %1
-    mov r11d, abs_leveld
-    shl r11d, 16
-    or   r6d, r11d
-%else
-    or   r6d, 1<<16
-%endif
-    mov [level_tree + levels_usedq*4], r6d
-    mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
-    inc levels_usedd
-%endmacro ; END_COEF
-
-
-
-%macro COEF1 2
-    %assign j %1
-    %assign nextj_if_valid %1+1
-    %assign nextj_if_invalid %2
-%if j < 4
-    %assign coeff_abs_level1_offs j+1
-%else
-    %assign coeff_abs_level1_offs 0
-%endif
-%if j < 3
-    %assign node_ctx j+1
-%else
-    %assign node_ctx j
-%endif
-.ctx %+ j:
-    START_COEF 0
-    add  bitsd, 1 << CABAC_SIZE_BITS
-    END_COEF 0
-%endmacro ; COEF1
-
-%macro COEFN 2
-    %assign j %1
-    %assign nextj_if_valid %2
-    %assign nextj_if_invalid %2
-%if j < 4
-    %assign coeff_abs_level1_offs j+1
-    %assign coeff_abs_levelgt1_offs 5
-%else
-    %assign coeff_abs_level1_offs 0
-    %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
-%endif
-%if j < 4
-    %assign node_ctx 4
-%elif j < 7
-    %assign node_ctx j+1
-%else
-    %assign node_ctx 7
-%endif
-.ctx %+ j:
-    START_COEF 1
-    ; if( abs_level >= 15 )
-    ;     bits += bs_size_ue_big(...)
-    add  bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
-    ; n.cabac_state[levelgt1_ctx]
-%if j == 7 ; && compiling support for 4:2:2
-    mov    r6d, levelgt1_ctxm
-    %define coeff_abs_levelgt1_offs r6
-%endif
-%if j == 7
-    movzx  r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
-%else
-    movzx  r10, byte [level_statem + coeff_abs_levelgt1_offs]
-%endif
-    ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
-    add   r10d, r1d
-    movzx  r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
-    add  bitsd, r6d
-%if node_ctx == 7
-    movzx  r10, byte [cabac_transition_unary + r10-128 GLOBAL]
-%endif
-    END_COEF 1
-%endmacro ; COEFN
-
-
-
-clocal trellis_coef1
-.entry0b: ; ctx_lo, larger of the two abs_level candidates
-    mov  r10, [ssd+8]
-    sub  r10, r11
-    mov  r12, [ssd+24]
-    sub  r12, r11
-.entry0: ; ctx_lo, smaller of the two abs_level candidates
-    COEF1 0, 4
-    COEF1 1, 4
-    COEF1 2, 4
-    COEF1 3, 4
-.ctx4:
-    rep ret
-.entry1b: ; ctx_hi, larger of the two abs_level candidates
-    mov  r12, [ssd+24]
-    sub  r12, r11
-.entry1: ; ctx_hi, smaller of the two abs_level candidates
-trellis_coef1_hi:
-    COEF1 1, 2
-    COEF1 2, 3
-    COEF1 3, 4
-    COEF1 4, 5
-    COEF1 5, 6
-    COEF1 6, 7
-    COEF1 7, 8
-.ctx8:
-    rep ret
-
-%macro COEFN_PREFIX 1
-    ; int prefix = X264_MIN( abs_level - 1, 14 );
-    mov  r1d, abs_leveld
-    cmp  abs_leveld, 15
-    jge .level_suffix%1
-    xor  r5d, r5d
-.skip_level_suffix%1:
-    shl  r1d, 7
-%endmacro
-
-%macro COEFN_SUFFIX 1
-.level_suffix%1:
-    ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
-    lea  r5d, [abs_levelq-14]
-    bsr  r5d, r5d
-    shl  r5d, CABAC_SIZE_BITS+1
-    add  r5d, 1<<CABAC_SIZE_BITS
-    ; int prefix = X264_MIN( abs_level - 1, 14 );
-    mov  r1d, 15
-    jmp .skip_level_suffix%1
-%endmacro
-
-clocal trellis_coefn
-.entry0b:
-    mov  r10, [ssd+8]
-    mov  r12, [ssd+24]
-    inc  abs_leveld
-.entry0:
-    ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
-    ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
-    ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
-    ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
-    ; The C version has to be fully separate since C doesn't support multiple
-    ; entrypoints. But return-on-first-failure isn't very important here (as
-    ; opposed to coef1), so I might as well reduce codesize.
-    COEFN_PREFIX 0
-    COEFN 0, 1
-    COEFN 1, 2
-    COEFN 2, 3
-    COEFN 3, 8
-.ctx8:
-    mov zigzagq, zigzagm ; unspill since r1 was clobbered
-    ret
-.entry1b:
-    mov  r12, [ssd+24]
-    inc  abs_leveld
-.entry1:
-    COEFN_PREFIX 1
-    COEFN 4, 5
-    COEFN 5, 6
-    COEFN 6, 7
-    COEFN 7, 1
-    jmp .ctx1
-    COEFN_SUFFIX 0
-    COEFN_SUFFIX 1
diff --git a/android/src/main/libenc/jni/libx264/common/x86/util.h b/android/src/main/libenc/jni/libx264/common/x86/util.h
deleted file mode 100755
index 51018df..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/util.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*****************************************************************************
- * util.h: x86 inline asm
- *****************************************************************************
- * Copyright (C) 2008-2016 x264 project
- *
- * Authors: Fiona Glaser <fiona@x264.com>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_X86_UTIL_H
-#define X264_X86_UTIL_H
-
-#ifdef __SSE__
-#include <xmmintrin.h>
-
-#undef M128_ZERO
-#define M128_ZERO ((__m128){0,0,0,0})
-#define x264_union128_t x264_union128_sse_t
-typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
-#if HAVE_VECTOREXT
-typedef uint32_t v4si __attribute__((vector_size (16)));
-#endif
-#endif // __SSE__
-
-#if HAVE_X86_INLINE_ASM && HAVE_MMX
-
-#define x264_median_mv x264_median_mv_mmx2
-static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
-{
-    asm(
-        "movd   %1,    %%mm0 \n"
-        "movd   %2,    %%mm1 \n"
-        "movq   %%mm0, %%mm3 \n"
-        "movd   %3,    %%mm2 \n"
-        "pmaxsw %%mm1, %%mm0 \n"
-        "pminsw %%mm3, %%mm1 \n"
-        "pminsw %%mm2, %%mm0 \n"
-        "pmaxsw %%mm1, %%mm0 \n"
-        "movd   %%mm0, %0    \n"
-        :"=m"(*(x264_union32_t*)dst)
-        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
-    );
-}
-
-#define x264_predictor_difference x264_predictor_difference_mmx2
-static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
-{
-    int sum;
-    static const uint64_t pw_1 = 0x0001000100010001ULL;
-
-    asm(
-        "pxor    %%mm4, %%mm4 \n"
-        "test    $1, %1       \n"
-        "jnz 3f               \n"
-        "movd    -8(%2,%1,4), %%mm0 \n"
-        "movd    -4(%2,%1,4), %%mm3 \n"
-        "psubw   %%mm3, %%mm0 \n"
-        "jmp 2f               \n"
-        "3:                   \n"
-        "dec     %1           \n"
-        "1:                   \n"
-        "movq    -8(%2,%1,4), %%mm0 \n"
-        "psubw   -4(%2,%1,4), %%mm0 \n"
-        "2:                   \n"
-        "sub     $2,    %1    \n"
-        "pxor    %%mm2, %%mm2 \n"
-        "psubw   %%mm0, %%mm2 \n"
-        "pmaxsw  %%mm2, %%mm0 \n"
-        "paddusw %%mm0, %%mm4 \n"
-        "jg 1b                \n"
-        "pmaddwd %4, %%mm4    \n"
-        "pshufw $14, %%mm4, %%mm0 \n"
-        "paddd   %%mm0, %%mm4 \n"
-        "movd    %%mm4, %0    \n"
-        :"=r"(sum), "+r"(i_mvc)
-        :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
-    );
-    return sum;
-}
-
-#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
-static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
-{
-    static const uint64_t pb_2    = 0x0202020202020202ULL;
-    static const uint64_t pb_32   = 0x2020202020202020ULL;
-    static const uint64_t pb_33   = 0x2121212121212121ULL;
-    int amvd;
-    asm(
-        "movd         %1, %%mm0 \n"
-        "movd         %2, %%mm1 \n"
-        "paddusb   %%mm1, %%mm0 \n"
-        "pminub       %5, %%mm0 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "movq      %%mm0, %%mm1 \n"
-        "pcmpgtb      %3, %%mm0 \n"
-        "pcmpgtb      %4, %%mm1 \n"
-        "psubb     %%mm0, %%mm2 \n"
-        "psubb     %%mm1, %%mm2 \n"
-        "movd      %%mm2, %0    \n"
-        :"=r"(amvd)
-        :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
-         "m"(pb_2),"m"(pb_32),"m"(pb_33)
-    );
-    return amvd;
-}
-
-#define x264_predictor_clip x264_predictor_clip_mmx2
-static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
-{
-    static const uint32_t pd_32 = 0x20;
-    intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
-
-    asm(
-        "movq       (%2), %%mm5 \n"
-        "movd         %6, %%mm3 \n"
-        "psllw        $2, %%mm5 \n" // Convert to subpel
-        "pshufw $0xEE, %%mm5, %%mm6 \n"
-        "dec         %k3        \n"
-        "jz 2f                  \n" // if( i_mvc == 1 ) {do the last iteration}
-        "punpckldq %%mm3, %%mm3 \n"
-        "punpckldq %%mm5, %%mm5 \n"
-        "movd         %7, %%mm4 \n"
-        "lea   (%0,%3,4), %3    \n"
-        "1:                     \n"
-        "movq       (%0), %%mm0 \n"
-        "add          $8, %0    \n"
-        "movq      %%mm3, %%mm1 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "pcmpeqd   %%mm0, %%mm1 \n" // mv == pmv
-        "pcmpeqd   %%mm0, %%mm2 \n" // mv == 0
-        "por       %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
-        "pmovmskb  %%mm2, %k2   \n" // (mv == pmv || mv == 0) * 0xf
-        "pmaxsw    %%mm5, %%mm0 \n"
-        "pminsw    %%mm6, %%mm0 \n"
-        "pand      %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
-        "psrlq     %%mm2, %%mm0 \n" // drop mv0 if it's skipped
-        "movq      %%mm0, (%5,%4,4) \n"
-        "and         $24, %k2   \n"
-        "add          $2, %4    \n"
-        "add          $8, %k2   \n"
-        "shr          $4, %k2   \n" // (4-val)>>1
-        "sub          %2, %4    \n" // +1 for each valid motion vector
-        "cmp          %3, %0    \n"
-        "jl 1b                  \n"
-        "jg 3f                  \n" // if( i == i_mvc - 1 ) {do the last iteration}
-
-        /* Do the last iteration */
-        "2:                     \n"
-        "movd       (%0), %%mm0 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "pcmpeqd   %%mm0, %%mm3 \n"
-        "pcmpeqd   %%mm0, %%mm2 \n"
-        "por       %%mm3, %%mm2 \n"
-        "pmovmskb  %%mm2, %k2   \n"
-        "pmaxsw    %%mm5, %%mm0 \n"
-        "pminsw    %%mm6, %%mm0 \n"
-        "movd      %%mm0, (%5,%4,4) \n"
-        "inc          %4        \n"
-        "and          $1, %k2   \n"
-        "sub          %2, %4    \n" // output += !(mv == pmv || mv == 0)
-        "3:                     \n"
-        :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
-        :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
-    );
-    return i;
-}
-
-/* Same as the above, except we do (mv + 2) >> 2 on the input. */
-#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
-static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
-{
-    static const uint64_t pw_2 = 0x0002000200020002ULL;
-    static const uint32_t pd_32 = 0x20;
-    intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
-
-    asm(
-        "movq       (%2), %%mm5 \n"
-        "movq         %6, %%mm7 \n"
-        "movd         %7, %%mm3 \n"
-        "pshufw $0xEE, %%mm5, %%mm6 \n"
-        "dec         %k3        \n"
-        "jz 2f                  \n"
-        "punpckldq %%mm3, %%mm3 \n"
-        "punpckldq %%mm5, %%mm5 \n"
-        "movd         %8, %%mm4 \n"
-        "lea   (%0,%3,4), %3    \n"
-        "1:                     \n"
-        "movq       (%0), %%mm0 \n"
-        "add          $8, %0    \n"
-        "paddw     %%mm7, %%mm0 \n"
-        "psraw        $2, %%mm0 \n"
-        "movq      %%mm3, %%mm1 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "pcmpeqd   %%mm0, %%mm1 \n"
-        "pcmpeqd   %%mm0, %%mm2 \n"
-        "por       %%mm1, %%mm2 \n"
-        "pmovmskb  %%mm2, %k2   \n"
-        "pmaxsw    %%mm5, %%mm0 \n"
-        "pminsw    %%mm6, %%mm0 \n"
-        "pand      %%mm4, %%mm2 \n"
-        "psrlq     %%mm2, %%mm0 \n"
-        "movq      %%mm0, (%5,%4,4) \n"
-        "and         $24, %k2   \n"
-        "add          $2, %4    \n"
-        "add          $8, %k2   \n"
-        "shr          $4, %k2   \n"
-        "sub          %2, %4    \n"
-        "cmp          %3, %0    \n"
-        "jl 1b                  \n"
-        "jg 3f                  \n"
-
-        /* Do the last iteration */
-        "2:                     \n"
-        "movd       (%0), %%mm0 \n"
-        "paddw     %%mm7, %%mm0 \n"
-        "psraw        $2, %%mm0 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "pcmpeqd   %%mm0, %%mm3 \n"
-        "pcmpeqd   %%mm0, %%mm2 \n"
-        "por       %%mm3, %%mm2 \n"
-        "pmovmskb  %%mm2, %k2   \n"
-        "pmaxsw    %%mm5, %%mm0 \n"
-        "pminsw    %%mm6, %%mm0 \n"
-        "movd      %%mm0, (%5,%4,4) \n"
-        "inc          %4        \n"
-        "and          $1, %k2   \n"
-        "sub          %2, %4    \n"
-        "3:                     \n"
-        :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
-        :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
-    );
-    return i;
-}
-
-#endif
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/x86inc.asm b/android/src/main/libenc/jni/libx264/common/x86/x86inc.asm
deleted file mode 100755
index ff150f1..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/x86inc.asm
+++ /dev/null
@@ -1,1535 +0,0 @@
-;*****************************************************************************
-;* x86inc.asm: x264asm abstraction layer
-;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Anton Mitrofanov <BugMaster@narod.ru>
-;*          Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <henrik@gramner.com>
-;*
-;* Permission to use, copy, modify, and/or distribute this software for any
-;* purpose with or without fee is hereby granted, provided that the above
-;* copyright notice and this permission notice appear in all copies.
-;*
-;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-;*****************************************************************************
-
-; This is a header file for the x264ASM assembly language, which uses
-; NASM/YASM syntax combined with a large number of macros to provide easy
-; abstraction between different calling conventions (x86_32, win64, linux64).
-; It also has various other useful features to simplify writing the kind of
-; DSP functions that are most often used in x264.
-
-; Unlike the rest of x264, this file is available under an ISC license, as it
-; has significant usefulness outside of x264 and we want it to be available
-; to the largest audience possible.  Of course, if you modify it for your own
-; purposes to add a new feature, we strongly encourage contributing a patch
-; as this feature might be useful for others as well.  Send patches or ideas
-; to x264-devel@videolan.org .
-
-%ifndef private_prefix
-    %define private_prefix x264
-%endif
-
-%ifndef public_prefix
-    %define public_prefix private_prefix
-%endif
-
-%ifndef STACK_ALIGNMENT
-    %if ARCH_X86_64
-        %define STACK_ALIGNMENT 16
-    %else
-        %define STACK_ALIGNMENT 4
-    %endif
-%endif
-
-%define WIN64  0
-%define UNIX64 0
-%if ARCH_X86_64
-    %ifidn __OUTPUT_FORMAT__,win32
-        %define WIN64  1
-    %elifidn __OUTPUT_FORMAT__,win64
-        %define WIN64  1
-    %elifidn __OUTPUT_FORMAT__,x64
-        %define WIN64  1
-    %else
-        %define UNIX64 1
-    %endif
-%endif
-
-%define FORMAT_ELF 0
-%ifidn __OUTPUT_FORMAT__,elf
-    %define FORMAT_ELF 1
-%elifidn __OUTPUT_FORMAT__,elf32
-    %define FORMAT_ELF 1
-%elifidn __OUTPUT_FORMAT__,elf64
-    %define FORMAT_ELF 1
-%endif
-
-%ifdef PREFIX
-    %define mangle(x) _ %+ x
-%else
-    %define mangle(x) x
-%endif
-
-%macro SECTION_RODATA 0-1 16
-    SECTION .rodata align=%1
-%endmacro
-
-%if WIN64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%endif
-%ifdef PIC
-    default rel
-%endif
-
-%ifdef __NASM_VER__
-    %use smartalign
-%endif
-
-; Macros to eliminate most code duplication between x86_32 and x86_64:
-; Currently this works only for leaf functions which load all their arguments
-; into registers at the start, and make no other use of the stack. Luckily that
-; covers most of x264's asm.
-
-; PROLOGUE:
-; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used. pushes callee-saved regs if needed.
-; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = (optional) stack size to be allocated. The stack will be aligned before
-;      allocating the specified stack size. If the required stack alignment is
-;      larger than the known stack alignment the stack will be manually aligned
-;      and an extra register will be allocated to hold the original stack
-;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
-;      register as stack pointer, request a negative stack size.
-; %4+/%5+ = list of names to define to registers
-; PROLOGUE can also be invoked by adding the same options to cglobal
-
-; e.g.
-; cglobal foo, 2,3,7,0x40, dst, src, tmp
-; declares a function (foo) that automatically loads two arguments (dst and
-; src) into registers, uses one additional register (tmp) plus 7 vector
-; registers (m0-m6) and allocates 0x40 bytes of stack space.
-
-; TODO Some functions can use some args directly from the stack. If they're the
-; last args then you can just not declare them, but if they're in the middle
-; we need more flexible macro.
-
-; RET:
-; Pops anything that was pushed by PROLOGUE, and returns.
-
-; REP_RET:
-; Use this instead of RET if it's a branch target.
-
-; registers:
-; rN and rNq are the native-size register holding function argument N
-; rNd, rNw, rNb are dword, word, and byte size
-; rNh is the high 8 bits of the word size
-; rNm is the original location of arg N (a register or on the stack), dword
-; rNmp is native size
-
-%macro DECLARE_REG 2-3
-    %define r%1q %2
-    %define r%1d %2d
-    %define r%1w %2w
-    %define r%1b %2b
-    %define r%1h %2h
-    %define %2q %2
-    %if %0 == 2
-        %define r%1m  %2d
-        %define r%1mp %2
-    %elif ARCH_X86_64 ; memory
-        %define r%1m [rstk + stack_offset + %3]
-        %define r%1mp qword r %+ %1 %+ m
-    %else
-        %define r%1m [rstk + stack_offset + %3]
-        %define r%1mp dword r %+ %1 %+ m
-    %endif
-    %define r%1  %2
-%endmacro
-
-%macro DECLARE_REG_SIZE 3
-    %define r%1q r%1
-    %define e%1q r%1
-    %define r%1d e%1
-    %define e%1d e%1
-    %define r%1w %1
-    %define e%1w %1
-    %define r%1h %3
-    %define e%1h %3
-    %define r%1b %2
-    %define e%1b %2
-    %if ARCH_X86_64 == 0
-        %define r%1 e%1
-    %endif
-%endmacro
-
-DECLARE_REG_SIZE ax, al, ah
-DECLARE_REG_SIZE bx, bl, bh
-DECLARE_REG_SIZE cx, cl, ch
-DECLARE_REG_SIZE dx, dl, dh
-DECLARE_REG_SIZE si, sil, null
-DECLARE_REG_SIZE di, dil, null
-DECLARE_REG_SIZE bp, bpl, null
-
-; t# defines for when per-arch register allocation is more complex than just function arguments
-
-%macro DECLARE_REG_TMP 1-*
-    %assign %%i 0
-    %rep %0
-        CAT_XDEFINE t, %%i, r%1
-        %assign %%i %%i+1
-        %rotate 1
-    %endrep
-%endmacro
-
-%macro DECLARE_REG_TMP_SIZE 0-*
-    %rep %0
-        %define t%1q t%1 %+ q
-        %define t%1d t%1 %+ d
-        %define t%1w t%1 %+ w
-        %define t%1h t%1 %+ h
-        %define t%1b t%1 %+ b
-        %rotate 1
-    %endrep
-%endmacro
-
-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
-
-%if ARCH_X86_64
-    %define gprsize 8
-%else
-    %define gprsize 4
-%endif
-
-%macro PUSH 1
-    push %1
-    %ifidn rstk, rsp
-        %assign stack_offset stack_offset+gprsize
-    %endif
-%endmacro
-
-%macro POP 1
-    pop %1
-    %ifidn rstk, rsp
-        %assign stack_offset stack_offset-gprsize
-    %endif
-%endmacro
-
-%macro PUSH_IF_USED 1-*
-    %rep %0
-        %if %1 < regs_used
-            PUSH r%1
-        %endif
-        %rotate 1
-    %endrep
-%endmacro
-
-%macro POP_IF_USED 1-*
-    %rep %0
-        %if %1 < regs_used
-            pop r%1
-        %endif
-        %rotate 1
-    %endrep
-%endmacro
-
-%macro LOAD_IF_USED 1-*
-    %rep %0
-        %if %1 < num_args
-            mov r%1, r %+ %1 %+ mp
-        %endif
-        %rotate 1
-    %endrep
-%endmacro
-
-%macro SUB 2
-    sub %1, %2
-    %ifidn %1, rstk
-        %assign stack_offset stack_offset+(%2)
-    %endif
-%endmacro
-
-%macro ADD 2
-    add %1, %2
-    %ifidn %1, rstk
-        %assign stack_offset stack_offset-(%2)
-    %endif
-%endmacro
-
-%macro movifnidn 2
-    %ifnidn %1, %2
-        mov %1, %2
-    %endif
-%endmacro
-
-%macro movsxdifnidn 2
-    %ifnidn %1, %2
-        movsxd %1, %2
-    %endif
-%endmacro
-
-%macro ASSERT 1
-    %if (%1) == 0
-        %error assertion ``%1'' failed
-    %endif
-%endmacro
-
-%macro DEFINE_ARGS 0-*
-    %ifdef n_arg_names
-        %assign %%i 0
-        %rep n_arg_names
-            CAT_UNDEF arg_name %+ %%i, q
-            CAT_UNDEF arg_name %+ %%i, d
-            CAT_UNDEF arg_name %+ %%i, w
-            CAT_UNDEF arg_name %+ %%i, h
-            CAT_UNDEF arg_name %+ %%i, b
-            CAT_UNDEF arg_name %+ %%i, m
-            CAT_UNDEF arg_name %+ %%i, mp
-            CAT_UNDEF arg_name, %%i
-            %assign %%i %%i+1
-        %endrep
-    %endif
-
-    %xdefine %%stack_offset stack_offset
-    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
-    %assign %%i 0
-    %rep %0
-        %xdefine %1q r %+ %%i %+ q
-        %xdefine %1d r %+ %%i %+ d
-        %xdefine %1w r %+ %%i %+ w
-        %xdefine %1h r %+ %%i %+ h
-        %xdefine %1b r %+ %%i %+ b
-        %xdefine %1m r %+ %%i %+ m
-        %xdefine %1mp r %+ %%i %+ mp
-        CAT_XDEFINE arg_name, %%i, %1
-        %assign %%i %%i+1
-        %rotate 1
-    %endrep
-    %xdefine stack_offset %%stack_offset
-    %assign n_arg_names %0
-%endmacro
-
-%define required_stack_alignment ((mmsize + 15) & ~15)
-
-%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
-    %ifnum %1
-        %if %1 != 0
-            %assign %%pad 0
-            %assign stack_size %1
-            %if stack_size < 0
-                %assign stack_size -stack_size
-            %endif
-            %if WIN64
-                %assign %%pad %%pad + 32 ; shadow space
-                %if mmsize != 8
-                    %assign xmm_regs_used %2
-                    %if xmm_regs_used > 8
-                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
-                    %endif
-                %endif
-            %endif
-            %if required_stack_alignment <= STACK_ALIGNMENT
-                ; maintain the current stack alignment
-                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
-                SUB rsp, stack_size_padded
-            %else
-                %assign %%reg_num (regs_used - 1)
-                %xdefine rstk r %+ %%reg_num
-                ; align stack, and save original stack location directly above
-                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
-                ; stack in a single instruction (i.e. mov rsp, rstk or mov
-                ; rsp, [rsp+stack_size_padded])
-                %if %1 < 0 ; need to store rsp on stack
-                    %xdefine rstkm [rsp + stack_size + %%pad]
-                    %assign %%pad %%pad + gprsize
-                %else ; can keep rsp in rstk during whole function
-                    %xdefine rstkm rstk
-                %endif
-                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
-                mov rstk, rsp
-                and rsp, ~(required_stack_alignment-1)
-                sub rsp, stack_size_padded
-                movifnidn rstkm, rstk
-            %endif
-            WIN64_PUSH_XMM
-        %endif
-    %endif
-%endmacro
-
-%macro SETUP_STACK_POINTER 1
-    %ifnum %1
-        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
-            %if %1 > 0
-                %assign regs_used (regs_used + 1)
-            %endif
-            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
-                ; Ensure that we don't clobber any registers containing arguments
-                %assign regs_used 5 + UNIX64 * 3
-            %endif
-        %endif
-    %endif
-%endmacro
-
-%macro DEFINE_ARGS_INTERNAL 3+
-    %ifnum %2
-        DEFINE_ARGS %3
-    %elif %1 == 4
-        DEFINE_ARGS %2
-    %elif %1 > 4
-        DEFINE_ARGS %2, %3
-    %endif
-%endmacro
-
-%if WIN64 ; Windows x64 ;=================================================
-
-DECLARE_REG 0,  rcx
-DECLARE_REG 1,  rdx
-DECLARE_REG 2,  R8
-DECLARE_REG 3,  R9
-DECLARE_REG 4,  R10, 40
-DECLARE_REG 5,  R11, 48
-DECLARE_REG 6,  rax, 56
-DECLARE_REG 7,  rdi, 64
-DECLARE_REG 8,  rsi, 72
-DECLARE_REG 9,  rbx, 80
-DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
-
-%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
-    %assign num_args %1
-    %assign regs_used %2
-    ASSERT regs_used >= num_args
-    SETUP_STACK_POINTER %4
-    ASSERT regs_used <= 15
-    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
-    ALLOC_STACK %4, %3
-    %if mmsize != 8 && stack_size == 0
-        WIN64_SPILL_XMM %3
-    %endif
-    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS_INTERNAL %0, %4, %5
-%endmacro
-
-%macro WIN64_PUSH_XMM 0
-    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
-        movaps [rstk + stack_offset +  8], xmm6
-    %endif
-    %if xmm_regs_used > 7
-        movaps [rstk + stack_offset + 24], xmm7
-    %endif
-    %if xmm_regs_used > 8
-        %assign %%i 8
-        %rep xmm_regs_used-8
-            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
-            %assign %%i %%i+1
-        %endrep
-    %endif
-%endmacro
-
-%macro WIN64_SPILL_XMM 1
-    %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
-        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
-        %assign %%pad (xmm_regs_used-8)*16 + 32
-        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
-        SUB rsp, stack_size_padded
-    %endif
-    WIN64_PUSH_XMM
-%endmacro
-
-%macro WIN64_RESTORE_XMM_INTERNAL 1
-    %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
-            %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
-        %endrep
-    %endif
-    %if stack_size_padded > 0
-        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
-            mov rsp, rstkm
-        %else
-            add %1, stack_size_padded
-            %assign %%pad_size stack_size_padded
-        %endif
-    %endif
-    %if xmm_regs_used > 7
-        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
-    %endif
-    %if xmm_regs_used > 6
-        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
-    %endif
-%endmacro
-
-%macro WIN64_RESTORE_XMM 1
-    WIN64_RESTORE_XMM_INTERNAL %1
-    %assign stack_offset (stack_offset-stack_size_padded)
-    %assign xmm_regs_used 0
-%endmacro
-
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
-
-%macro RET 0
-    WIN64_RESTORE_XMM_INTERNAL rsp
-    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
-        vzeroupper
-    %endif
-    AUTO_REP_RET
-%endmacro
-
-%elif ARCH_X86_64 ; *nix x64 ;=============================================
-
-DECLARE_REG 0,  rdi
-DECLARE_REG 1,  rsi
-DECLARE_REG 2,  rdx
-DECLARE_REG 3,  rcx
-DECLARE_REG 4,  R8
-DECLARE_REG 5,  R9
-DECLARE_REG 6,  rax, 8
-DECLARE_REG 7,  R10, 16
-DECLARE_REG 8,  R11, 24
-DECLARE_REG 9,  rbx, 32
-DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
-
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
-    %assign num_args %1
-    %assign regs_used %2
-    ASSERT regs_used >= num_args
-    SETUP_STACK_POINTER %4
-    ASSERT regs_used <= 15
-    PUSH_IF_USED 9, 10, 11, 12, 13, 14
-    ALLOC_STACK %4
-    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS_INTERNAL %0, %4, %5
-%endmacro
-
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
-
-%macro RET 0
-    %if stack_size_padded > 0
-        %if required_stack_alignment > STACK_ALIGNMENT
-            mov rsp, rstkm
-        %else
-            add rsp, stack_size_padded
-        %endif
-    %endif
-    POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
-        vzeroupper
-    %endif
-    AUTO_REP_RET
-%endmacro
-
-%else ; X86_32 ;==============================================================
-
-DECLARE_REG 0, eax, 4
-DECLARE_REG 1, ecx, 8
-DECLARE_REG 2, edx, 12
-DECLARE_REG 3, ebx, 16
-DECLARE_REG 4, esi, 20
-DECLARE_REG 5, edi, 24
-DECLARE_REG 6, ebp, 28
-%define rsp esp
-
-%macro DECLARE_ARG 1-*
-    %rep %0
-        %define r%1m [rstk + stack_offset + 4*%1 + 4]
-        %define r%1mp dword r%1m
-        %rotate 1
-    %endrep
-%endmacro
-
-DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
-
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
-    %assign num_args %1
-    %assign regs_used %2
-    ASSERT regs_used >= num_args
-    %if num_args > 7
-        %assign num_args 7
-    %endif
-    %if regs_used > 7
-        %assign regs_used 7
-    %endif
-    SETUP_STACK_POINTER %4
-    ASSERT regs_used <= 7
-    PUSH_IF_USED 3, 4, 5, 6
-    ALLOC_STACK %4
-    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-    DEFINE_ARGS_INTERNAL %0, %4, %5
-%endmacro
-
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
-
-%macro RET 0
-    %if stack_size_padded > 0
-        %if required_stack_alignment > STACK_ALIGNMENT
-            mov rsp, rstkm
-        %else
-            add rsp, stack_size_padded
-        %endif
-    %endif
-    POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
-        vzeroupper
-    %endif
-    AUTO_REP_RET
-%endmacro
-
-%endif ;======================================================================
-
-%if WIN64 == 0
-    %macro WIN64_SPILL_XMM 1
-    %endmacro
-    %macro WIN64_RESTORE_XMM 1
-    %endmacro
-    %macro WIN64_PUSH_XMM 0
-    %endmacro
-%endif
-
-; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
-; a branch or a branch target. So switch to a 2-byte form of ret in that case.
-; We can automatically detect "follows a branch", but not a branch target.
-; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
-%macro REP_RET 0
-    %if has_epilogue
-        RET
-    %else
-        rep ret
-    %endif
-    annotate_function_size
-%endmacro
-
-%define last_branch_adr $$
-%macro AUTO_REP_RET 0
-    %if notcpuflag(ssse3)
-        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
-    %endif
-    ret
-    annotate_function_size
-%endmacro
-
-%macro BRANCH_INSTR 0-*
-    %rep %0
-        %macro %1 1-2 %1
-            %2 %1
-            %if notcpuflag(ssse3)
-                %%branch_instr equ $
-                %xdefine last_branch_adr %%branch_instr
-            %endif
-        %endmacro
-        %rotate 1
-    %endrep
-%endmacro
-
-BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
-
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
-    %if has_epilogue
-        call %1
-        RET
-    %elif %2
-        jmp %1
-    %endif
-    annotate_function_size
-%endmacro
-
-;=============================================================================
-; arch-independent part
-;=============================================================================
-
-%assign function_align 16
-
-; Begin a function.
-; Applies any symbol mangling needed for C linkage, and sets up a define such that
-; subsequent uses of the function name automatically refer to the mangled version.
-; Appends cpuflags to the function name if cpuflags has been specified.
-; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
-; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
-%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
-    cglobal_internal 1, %1 %+ SUFFIX, %2
-%endmacro
-%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
-    cglobal_internal 0, %1 %+ SUFFIX, %2
-%endmacro
-%macro cglobal_internal 2-3+
-    annotate_function_size
-    %if %1
-        %xdefine %%FUNCTION_PREFIX private_prefix
-        %xdefine %%VISIBILITY hidden
-    %else
-        %xdefine %%FUNCTION_PREFIX public_prefix
-        %xdefine %%VISIBILITY
-    %endif
-    %ifndef cglobaled_%2
-        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
-        %xdefine %2.skip_prologue %2 %+ .skip_prologue
-        CAT_XDEFINE cglobaled_, %2, 1
-    %endif
-    %xdefine current_function %2
-    %xdefine current_function_section __SECT__
-    %if FORMAT_ELF
-        global %2:function %%VISIBILITY
-    %else
-        global %2
-    %endif
-    align function_align
-    %2:
-    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
-    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
-    %assign stack_offset 0      ; stack pointer offset relative to the return address
-    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
-    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
-    %ifnidn %3, ""
-        PROLOGUE %3
-    %endif
-%endmacro
-
-%macro cextern 1
-    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
-    CAT_XDEFINE cglobaled_, %1, 1
-    extern %1
-%endmacro
-
-; like cextern, but without the prefix
-%macro cextern_naked 1
-    %ifdef PREFIX
-        %xdefine %1 mangle(%1)
-    %endif
-    CAT_XDEFINE cglobaled_, %1, 1
-    extern %1
-%endmacro
-
-%macro const 1-2+
-    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
-    %if FORMAT_ELF
-        global %1:data hidden
-    %else
-        global %1
-    %endif
-    %1: %2
-%endmacro
-
-; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
-%if FORMAT_ELF
-    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
-%endif
-
-; Tell debuggers how large the function was.
-; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
-; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
-; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
-; then its size might be unspecified.
-%macro annotate_function_size 0
-    %ifdef __YASM_VER__
-        %ifdef current_function
-            %if FORMAT_ELF
-                current_function_section
-                %%ecf equ $
-                size current_function %%ecf - current_function
-                __SECT__
-            %endif
-        %endif
-    %endif
-%endmacro
-
-; cpuflags
-
-%assign cpuflags_mmx      (1<<0)
-%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
-%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
-%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
-%assign cpuflags_sse2     (1<<5) | cpuflags_sse
-%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
-%assign cpuflags_avx      (1<<11)| cpuflags_sse42
-%assign cpuflags_xop      (1<<12)| cpuflags_avx
-%assign cpuflags_fma4     (1<<13)| cpuflags_avx
-%assign cpuflags_fma3     (1<<14)| cpuflags_avx
-%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32  (1<<16)
-%assign cpuflags_cache64  (1<<17)
-%assign cpuflags_slowctz  (1<<18)
-%assign cpuflags_lzcnt    (1<<19)
-%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<21)
-%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
-
-; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
-%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
-%define notcpuflag(x) (cpuflag(x) ^ 1)
-
-; Takes an arbitrary number of cpuflags from the above list.
-; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
-; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
-%macro INIT_CPUFLAGS 0-*
-    %xdefine SUFFIX
-    %undef cpuname
-    %assign cpuflags 0
-
-    %if %0 >= 1
-        %rep %0
-            %ifdef cpuname
-                %xdefine cpuname cpuname %+ _%1
-            %else
-                %xdefine cpuname %1
-            %endif
-            %assign cpuflags cpuflags | cpuflags_%1
-            %rotate 1
-        %endrep
-        %xdefine SUFFIX _ %+ cpuname
-
-        %if cpuflag(avx)
-            %assign avx_enabled 1
-        %endif
-        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
-            %define mova movaps
-            %define movu movups
-            %define movnta movntps
-        %endif
-        %if cpuflag(aligned)
-            %define movu mova
-        %elif cpuflag(sse3) && notcpuflag(ssse3)
-            %define movu lddqu
-        %endif
-    %endif
-
-    %if ARCH_X86_64 || cpuflag(sse2)
-        %ifdef __NASM_VER__
-            ALIGNMODE k8
-        %else
-            CPU amdnop
-        %endif
-    %else
-        %ifdef __NASM_VER__
-            ALIGNMODE nop
-        %else
-            CPU basicnop
-        %endif
-    %endif
-%endmacro
-
-; Merge mmx and sse*
-; m# is a simd register of the currently selected size
-; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
-; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
-
-%macro CAT_XDEFINE 3
-    %xdefine %1%2 %3
-%endmacro
-
-%macro CAT_UNDEF 2
-    %undef %1%2
-%endmacro
-
-%macro INIT_MMX 0-1+
-    %assign avx_enabled 0
-    %define RESET_MM_PERMUTATION INIT_MMX %1
-    %define mmsize 8
-    %define num_mmregs 8
-    %define mova movq
-    %define movu movq
-    %define movh movd
-    %define movnta movntq
-    %assign %%i 0
-    %rep 8
-        CAT_XDEFINE m, %%i, mm %+ %%i
-        CAT_XDEFINE nnmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
-    %rep 8
-        CAT_UNDEF m, %%i
-        CAT_UNDEF nnmm, %%i
-        %assign %%i %%i+1
-    %endrep
-    INIT_CPUFLAGS %1
-%endmacro
-
-%macro INIT_XMM 0-1+
-    %assign avx_enabled 0
-    %define RESET_MM_PERMUTATION INIT_XMM %1
-    %define mmsize 16
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
-    %define mova movdqa
-    %define movu movdqu
-    %define movh movq
-    %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, xmm %+ %%i
-        CAT_XDEFINE nnxmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
-    INIT_CPUFLAGS %1
-%endmacro
-
-%macro INIT_YMM 0-1+
-    %assign avx_enabled 1
-    %define RESET_MM_PERMUTATION INIT_YMM %1
-    %define mmsize 32
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
-    %define mova movdqa
-    %define movu movdqu
-    %undef movh
-    %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, ymm %+ %%i
-        CAT_XDEFINE nnymm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
-    INIT_CPUFLAGS %1
-%endmacro
-
-INIT_XMM
-
-%macro DECLARE_MMCAST 1
-    %define  mmmm%1   mm%1
-    %define  mmxmm%1  mm%1
-    %define  mmymm%1  mm%1
-    %define xmmmm%1   mm%1
-    %define xmmxmm%1 xmm%1
-    %define xmmymm%1 xmm%1
-    %define ymmmm%1   mm%1
-    %define ymmxmm%1 xmm%1
-    %define ymmymm%1 ymm%1
-    %define xm%1 xmm %+ m%1
-    %define ym%1 ymm %+ m%1
-%endmacro
-
-%assign i 0
-%rep 16
-    DECLARE_MMCAST i
-    %assign i i+1
-%endrep
-
-; I often want to use macros that permute their arguments. e.g. there's no
-; efficient way to implement butterfly or transpose or dct without swapping some
-; arguments.
-;
-; I would like to not have to manually keep track of the permutations:
-; If I insert a permutation in the middle of a function, it should automatically
-; change everything that follows. For more complex macros I may also have multiple
-; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
-;
-; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
-; permutes its arguments. It's equivalent to exchanging the contents of the
-; registers, except that this way you exchange the register names instead, so it
-; doesn't cost any cycles.
-
-%macro PERMUTE 2-* ; takes a list of pairs to swap
-    %rep %0/2
-        %xdefine %%tmp%2 m%2
-        %rotate 2
-    %endrep
-    %rep %0/2
-        %xdefine m%1 %%tmp%2
-        CAT_XDEFINE nn, m%1, %1
-        %rotate 2
-    %endrep
-%endmacro
-
-%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
-    %ifnum %1 ; SWAP 0, 1, ...
-        SWAP_INTERNAL_NUM %1, %2
-    %else ; SWAP m0, m1, ...
-        SWAP_INTERNAL_NAME %1, %2
-    %endif
-%endmacro
-
-%macro SWAP_INTERNAL_NUM 2-*
-    %rep %0-1
-        %xdefine %%tmp m%1
-        %xdefine m%1 m%2
-        %xdefine m%2 %%tmp
-        CAT_XDEFINE nn, m%1, %1
-        CAT_XDEFINE nn, m%2, %2
-        %rotate 1
-    %endrep
-%endmacro
-
-%macro SWAP_INTERNAL_NAME 2-*
-    %xdefine %%args nn %+ %1
-    %rep %0-1
-        %xdefine %%args %%args, nn %+ %2
-        %rotate 1
-    %endrep
-    SWAP_INTERNAL_NUM %%args
-%endmacro
-
-; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
-; calls to that function will automatically load the permutation, so values can
-; be returned in mmregs.
-%macro SAVE_MM_PERMUTATION 0-1
-    %if %0
-        %xdefine %%f %1_m
-    %else
-        %xdefine %%f current_function %+ _m
-    %endif
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE %%f, %%i, m %+ %%i
-        %assign %%i %%i+1
-    %endrep
-%endmacro
-
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
-    %ifdef %1_m0
-        %assign %%i 0
-        %rep num_mmregs
-            CAT_XDEFINE m, %%i, %1_m %+ %%i
-            CAT_XDEFINE nn, m %+ %%i, %%i
-            %assign %%i %%i+1
-        %endrep
-    %endif
-%endmacro
-
-; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
-%macro call 1
-    call_internal %1 %+ SUFFIX, %1
-%endmacro
-%macro call_internal 2
-    %xdefine %%i %2
-    %ifndef cglobaled_%2
-        %ifdef cglobaled_%1
-            %xdefine %%i %1
-        %endif
-    %endif
-    call %%i
-    LOAD_MM_PERMUTATION %%i
-%endmacro
-
-; Substitutions that reduce instruction size but are functionally equivalent
-%macro add 2
-    %ifnum %2
-        %if %2==128
-            sub %1, -128
-        %else
-            add %1, %2
-        %endif
-    %else
-        add %1, %2
-    %endif
-%endmacro
-
-%macro sub 2
-    %ifnum %2
-        %if %2==128
-            add %1, -128
-        %else
-            sub %1, %2
-        %endif
-    %else
-        sub %1, %2
-    %endif
-%endmacro
-
-;=============================================================================
-; AVX abstraction layer
-;=============================================================================
-
-%assign i 0
-%rep 16
-    %if i < 8
-        CAT_XDEFINE sizeofmm, i, 8
-    %endif
-    CAT_XDEFINE sizeofxmm, i, 16
-    CAT_XDEFINE sizeofymm, i, 32
-    %assign i i+1
-%endrep
-%undef i
-
-%macro CHECK_AVX_INSTR_EMU 3-*
-    %xdefine %%opcode %1
-    %xdefine %%dst %2
-    %rep %0-2
-        %ifidn %%dst, %3
-            %error non-avx emulation of ``%%opcode'' is not supported
-        %endif
-        %rotate 1
-    %endrep
-%endmacro
-
-;%1 == instruction
-;%2 == minimal instruction set
-;%3 == 1 if float, 0 if int
-;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
-;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-;%6+: operands
-%macro RUN_AVX_INSTR 6-9+
-    %ifnum sizeof%7
-        %assign __sizeofreg sizeof%7
-    %elifnum sizeof%6
-        %assign __sizeofreg sizeof%6
-    %else
-        %assign __sizeofreg mmsize
-    %endif
-    %assign __emulate_avx 0
-    %if avx_enabled && __sizeofreg >= 16
-        %xdefine __instr v%1
-    %else
-        %xdefine __instr %1
-        %if %0 >= 8+%4
-            %assign __emulate_avx 1
-        %endif
-    %endif
-    %ifnidn %2, fnord
-        %ifdef cpuname
-            %if notcpuflag(%2)
-                %error use of ``%1'' %2 instruction in cpuname function: current_function
-            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
-                %error use of ``%1'' sse2 instruction in cpuname function: current_function
-            %endif
-        %endif
-    %endif
-
-    %if __emulate_avx
-        %xdefine __src1 %7
-        %xdefine __src2 %8
-        %if %5 && %4 == 0
-            %ifnidn %6, %7
-                %ifidn %6, %8
-                    %xdefine __src1 %8
-                    %xdefine __src2 %7
-                %elifnnum sizeof%8
-                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
-                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
-                    ; So, if the instruction is commutative with a memory arg, swap them.
-                    %xdefine __src1 %8
-                    %xdefine __src2 %7
-                %endif
-            %endif
-        %endif
-        %ifnidn %6, __src1
-            %if %0 >= 9
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
-            %else
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
-            %endif
-            %if __sizeofreg == 8
-                MOVQ %6, __src1
-            %elif %3
-                MOVAPS %6, __src1
-            %else
-                MOVDQA %6, __src1
-            %endif
-        %endif
-        %if %0 >= 9
-            %1 %6, __src2, %9
-        %else
-            %1 %6, __src2
-        %endif
-    %elif %0 >= 9
-        __instr %6, %7, %8, %9
-    %elif %0 == 8
-        __instr %6, %7, %8
-    %elif %0 == 7
-        __instr %6, %7
-    %else
-        __instr %6
-    %endif
-%endmacro
-
-;%1 == instruction
-;%2 == minimal instruction set
-;%3 == 1 if float, 0 if int
-;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
-;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-5 fnord, 0, 255, 0
-    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
-        %ifidn %2, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
-        %elifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
-        %elifidn %4, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
-        %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
-        %else
-            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
-        %endif
-    %endmacro
-%endmacro
-
-; Instructions with both VEX and non-VEX encodings
-; Non-destructive instructions are written without parameters
-AVX_INSTR addpd, sse2, 1, 0, 1
-AVX_INSTR addps, sse, 1, 0, 1
-AVX_INSTR addsd, sse2, 1, 0, 0
-AVX_INSTR addss, sse, 1, 0, 0
-AVX_INSTR addsubpd, sse3, 1, 0, 0
-AVX_INSTR addsubps, sse3, 1, 0, 0
-AVX_INSTR aesdec, fnord, 0, 0, 0
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
-AVX_INSTR aesenc, fnord, 0, 0, 0
-AVX_INSTR aesenclast, fnord, 0, 0, 0
-AVX_INSTR aesimc
-AVX_INSTR aeskeygenassist
-AVX_INSTR andnpd, sse2, 1, 0, 0
-AVX_INSTR andnps, sse, 1, 0, 0
-AVX_INSTR andpd, sse2, 1, 0, 1
-AVX_INSTR andps, sse, 1, 0, 1
-AVX_INSTR blendpd, sse4, 1, 1, 0
-AVX_INSTR blendps, sse4, 1, 1, 0
-AVX_INSTR blendvpd, sse4 ; can't be emulated
-AVX_INSTR blendvps, sse4 ; can't be emulated
-AVX_INSTR cmppd, sse2, 1, 1, 0
-AVX_INSTR cmpps, sse, 1, 1, 0
-AVX_INSTR cmpsd, sse2, 1, 1, 0
-AVX_INSTR cmpss, sse, 1, 1, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
-AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
-AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
-AVX_INSTR cvtsi2ss, sse, 1, 0, 0
-AVX_INSTR cvtss2sd, sse2, 1, 0, 0
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
-AVX_INSTR divpd, sse2, 1, 0, 0
-AVX_INSTR divps, sse, 1, 0, 0
-AVX_INSTR divsd, sse2, 1, 0, 0
-AVX_INSTR divss, sse, 1, 0, 0
-AVX_INSTR dppd, sse4, 1, 1, 0
-AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
-AVX_INSTR haddpd, sse3, 1, 0, 0
-AVX_INSTR haddps, sse3, 1, 0, 0
-AVX_INSTR hsubpd, sse3, 1, 0, 0
-AVX_INSTR hsubps, sse3, 1, 0, 0
-AVX_INSTR insertps, sse4, 1, 1, 0
-AVX_INSTR lddqu, sse3
-AVX_INSTR ldmxcsr, sse
-AVX_INSTR maskmovdqu, sse2
-AVX_INSTR maxpd, sse2, 1, 0, 1
-AVX_INSTR maxps, sse, 1, 0, 1
-AVX_INSTR maxsd, sse2, 1, 0, 0
-AVX_INSTR maxss, sse, 1, 0, 0
-AVX_INSTR minpd, sse2, 1, 0, 1
-AVX_INSTR minps, sse, 1, 0, 1
-AVX_INSTR minsd, sse2, 1, 0, 0
-AVX_INSTR minss, sse, 1, 0, 0
-AVX_INSTR movapd, sse2
-AVX_INSTR movaps, sse
-AVX_INSTR movd, mmx
-AVX_INSTR movddup, sse3
-AVX_INSTR movdqa, sse2
-AVX_INSTR movdqu, sse2
-AVX_INSTR movhlps, sse, 1, 0, 0
-AVX_INSTR movhpd, sse2, 1, 0, 0
-AVX_INSTR movhps, sse, 1, 0, 0
-AVX_INSTR movlhps, sse, 1, 0, 0
-AVX_INSTR movlpd, sse2, 1, 0, 0
-AVX_INSTR movlps, sse, 1, 0, 0
-AVX_INSTR movmskpd, sse2
-AVX_INSTR movmskps, sse
-AVX_INSTR movntdq, sse2
-AVX_INSTR movntdqa, sse4
-AVX_INSTR movntpd, sse2
-AVX_INSTR movntps, sse
-AVX_INSTR movq, mmx
-AVX_INSTR movsd, sse2, 1, 0, 0
-AVX_INSTR movshdup, sse3
-AVX_INSTR movsldup, sse3
-AVX_INSTR movss, sse, 1, 0, 0
-AVX_INSTR movupd, sse2
-AVX_INSTR movups, sse
-AVX_INSTR mpsadbw, sse4, 0, 1, 0
-AVX_INSTR mulpd, sse2, 1, 0, 1
-AVX_INSTR mulps, sse, 1, 0, 1
-AVX_INSTR mulsd, sse2, 1, 0, 0
-AVX_INSTR mulss, sse, 1, 0, 0
-AVX_INSTR orpd, sse2, 1, 0, 1
-AVX_INSTR orps, sse, 1, 0, 1
-AVX_INSTR pabsb, ssse3
-AVX_INSTR pabsd, ssse3
-AVX_INSTR pabsw, ssse3
-AVX_INSTR packsswb, mmx, 0, 0, 0
-AVX_INSTR packssdw, mmx, 0, 0, 0
-AVX_INSTR packuswb, mmx, 0, 0, 0
-AVX_INSTR packusdw, sse4, 0, 0, 0
-AVX_INSTR paddb, mmx, 0, 0, 1
-AVX_INSTR paddw, mmx, 0, 0, 1
-AVX_INSTR paddd, mmx, 0, 0, 1
-AVX_INSTR paddq, sse2, 0, 0, 1
-AVX_INSTR paddsb, mmx, 0, 0, 1
-AVX_INSTR paddsw, mmx, 0, 0, 1
-AVX_INSTR paddusb, mmx, 0, 0, 1
-AVX_INSTR paddusw, mmx, 0, 0, 1
-AVX_INSTR palignr, ssse3, 0, 1, 0
-AVX_INSTR pand, mmx, 0, 0, 1
-AVX_INSTR pandn, mmx, 0, 0, 0
-AVX_INSTR pavgb, mmx2, 0, 0, 1
-AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4 ; can't be emulated
-AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
-AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pcmpestri, sse42
-AVX_INSTR pcmpestrm, sse42
-AVX_INSTR pcmpistri, sse42
-AVX_INSTR pcmpistrm, sse42
-AVX_INSTR pcmpeqb, mmx, 0, 0, 1
-AVX_INSTR pcmpeqw, mmx, 0, 0, 1
-AVX_INSTR pcmpeqd, mmx, 0, 0, 1
-AVX_INSTR pcmpeqq, sse4, 0, 0, 1
-AVX_INSTR pcmpgtb, mmx, 0, 0, 0
-AVX_INSTR pcmpgtw, mmx, 0, 0, 0
-AVX_INSTR pcmpgtd, mmx, 0, 0, 0
-AVX_INSTR pcmpgtq, sse42, 0, 0, 0
-AVX_INSTR pextrb, sse4
-AVX_INSTR pextrd, sse4
-AVX_INSTR pextrq, sse4
-AVX_INSTR pextrw, mmx2
-AVX_INSTR phaddw, ssse3, 0, 0, 0
-AVX_INSTR phaddd, ssse3, 0, 0, 0
-AVX_INSTR phaddsw, ssse3, 0, 0, 0
-AVX_INSTR phminposuw, sse4
-AVX_INSTR phsubw, ssse3, 0, 0, 0
-AVX_INSTR phsubd, ssse3, 0, 0, 0
-AVX_INSTR phsubsw, ssse3, 0, 0, 0
-AVX_INSTR pinsrb, sse4, 0, 1, 0
-AVX_INSTR pinsrd, sse4, 0, 1, 0
-AVX_INSTR pinsrq, sse4, 0, 1, 0
-AVX_INSTR pinsrw, mmx2, 0, 1, 0
-AVX_INSTR pmaddwd, mmx, 0, 0, 1
-AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
-AVX_INSTR pmaxsb, sse4, 0, 0, 1
-AVX_INSTR pmaxsw, mmx2, 0, 0, 1
-AVX_INSTR pmaxsd, sse4, 0, 0, 1
-AVX_INSTR pmaxub, mmx2, 0, 0, 1
-AVX_INSTR pmaxuw, sse4, 0, 0, 1
-AVX_INSTR pmaxud, sse4, 0, 0, 1
-AVX_INSTR pminsb, sse4, 0, 0, 1
-AVX_INSTR pminsw, mmx2, 0, 0, 1
-AVX_INSTR pminsd, sse4, 0, 0, 1
-AVX_INSTR pminub, mmx2, 0, 0, 1
-AVX_INSTR pminuw, sse4, 0, 0, 1
-AVX_INSTR pminud, sse4, 0, 0, 1
-AVX_INSTR pmovmskb, mmx2
-AVX_INSTR pmovsxbw, sse4
-AVX_INSTR pmovsxbd, sse4
-AVX_INSTR pmovsxbq, sse4
-AVX_INSTR pmovsxwd, sse4
-AVX_INSTR pmovsxwq, sse4
-AVX_INSTR pmovsxdq, sse4
-AVX_INSTR pmovzxbw, sse4
-AVX_INSTR pmovzxbd, sse4
-AVX_INSTR pmovzxbq, sse4
-AVX_INSTR pmovzxwd, sse4
-AVX_INSTR pmovzxwq, sse4
-AVX_INSTR pmovzxdq, sse4
-AVX_INSTR pmuldq, sse4, 0, 0, 1
-AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
-AVX_INSTR pmulhuw, mmx2, 0, 0, 1
-AVX_INSTR pmulhw, mmx, 0, 0, 1
-AVX_INSTR pmullw, mmx, 0, 0, 1
-AVX_INSTR pmulld, sse4, 0, 0, 1
-AVX_INSTR pmuludq, sse2, 0, 0, 1
-AVX_INSTR por, mmx, 0, 0, 1
-AVX_INSTR psadbw, mmx2, 0, 0, 1
-AVX_INSTR pshufb, ssse3, 0, 0, 0
-AVX_INSTR pshufd, sse2
-AVX_INSTR pshufhw, sse2
-AVX_INSTR pshuflw, sse2
-AVX_INSTR psignb, ssse3, 0, 0, 0
-AVX_INSTR psignw, ssse3, 0, 0, 0
-AVX_INSTR psignd, ssse3, 0, 0, 0
-AVX_INSTR psllw, mmx, 0, 0, 0
-AVX_INSTR pslld, mmx, 0, 0, 0
-AVX_INSTR psllq, mmx, 0, 0, 0
-AVX_INSTR pslldq, sse2, 0, 0, 0
-AVX_INSTR psraw, mmx, 0, 0, 0
-AVX_INSTR psrad, mmx, 0, 0, 0
-AVX_INSTR psrlw, mmx, 0, 0, 0
-AVX_INSTR psrld, mmx, 0, 0, 0
-AVX_INSTR psrlq, mmx, 0, 0, 0
-AVX_INSTR psrldq, sse2, 0, 0, 0
-AVX_INSTR psubb, mmx, 0, 0, 0
-AVX_INSTR psubw, mmx, 0, 0, 0
-AVX_INSTR psubd, mmx, 0, 0, 0
-AVX_INSTR psubq, sse2, 0, 0, 0
-AVX_INSTR psubsb, mmx, 0, 0, 0
-AVX_INSTR psubsw, mmx, 0, 0, 0
-AVX_INSTR psubusb, mmx, 0, 0, 0
-AVX_INSTR psubusw, mmx, 0, 0, 0
-AVX_INSTR ptest, sse4
-AVX_INSTR punpckhbw, mmx, 0, 0, 0
-AVX_INSTR punpckhwd, mmx, 0, 0, 0
-AVX_INSTR punpckhdq, mmx, 0, 0, 0
-AVX_INSTR punpckhqdq, sse2, 0, 0, 0
-AVX_INSTR punpcklbw, mmx, 0, 0, 0
-AVX_INSTR punpcklwd, mmx, 0, 0, 0
-AVX_INSTR punpckldq, mmx, 0, 0, 0
-AVX_INSTR punpcklqdq, sse2, 0, 0, 0
-AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse
-AVX_INSTR rcpss, sse, 1, 0, 0
-AVX_INSTR roundpd, sse4
-AVX_INSTR roundps, sse4
-AVX_INSTR roundsd, sse4, 1, 1, 0
-AVX_INSTR roundss, sse4, 1, 1, 0
-AVX_INSTR rsqrtps, sse
-AVX_INSTR rsqrtss, sse, 1, 0, 0
-AVX_INSTR shufpd, sse2, 1, 1, 0
-AVX_INSTR shufps, sse, 1, 1, 0
-AVX_INSTR sqrtpd, sse2
-AVX_INSTR sqrtps, sse
-AVX_INSTR sqrtsd, sse2, 1, 0, 0
-AVX_INSTR sqrtss, sse, 1, 0, 0
-AVX_INSTR stmxcsr, sse
-AVX_INSTR subpd, sse2, 1, 0, 0
-AVX_INSTR subps, sse, 1, 0, 0
-AVX_INSTR subsd, sse2, 1, 0, 0
-AVX_INSTR subss, sse, 1, 0, 0
-AVX_INSTR ucomisd, sse2
-AVX_INSTR ucomiss, sse
-AVX_INSTR unpckhpd, sse2, 1, 0, 0
-AVX_INSTR unpckhps, sse, 1, 0, 0
-AVX_INSTR unpcklpd, sse2, 1, 0, 0
-AVX_INSTR unpcklps, sse, 1, 0, 0
-AVX_INSTR xorpd, sse2, 1, 0, 1
-AVX_INSTR xorps, sse, 1, 0, 1
-
-; 3DNow instructions, for sharing code between AVX, SSE and 3DN
-AVX_INSTR pfadd, 3dnow, 1, 0, 1
-AVX_INSTR pfsub, 3dnow, 1, 0, 0
-AVX_INSTR pfmul, 3dnow, 1, 0, 1
-
-; base-4 constants for shuffles
-%assign i 0
-%rep 256
-    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
-    %if j < 10
-        CAT_XDEFINE q000, j, i
-    %elif j < 100
-        CAT_XDEFINE q00, j, i
-    %elif j < 1000
-        CAT_XDEFINE q0, j, i
-    %else
-        CAT_XDEFINE q, j, i
-    %endif
-    %assign i i+1
-%endrep
-%undef i
-%undef j
-
-%macro FMA_INSTR 3
-    %macro %1 4-7 %1, %2, %3
-        %if cpuflag(xop)
-            v%5 %1, %2, %3, %4
-        %elifnidn %1, %4
-            %6 %1, %2, %3
-            %7 %1, %4
-        %else
-            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
-        %endif
-    %endmacro
-%endmacro
-
-FMA_INSTR  pmacsww,  pmullw, paddw
-FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
-FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
-FMA_INSTR pmadcswd, pmaddwd, paddd
-
-; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
-; FMA3 is only possible if dst is the same as one of the src registers.
-; Either src2 or src3 can be a memory operand.
-%macro FMA4_INSTR 2-*
-    %push fma4_instr
-    %xdefine %$prefix %1
-    %rep %0 - 1
-        %macro %$prefix%2 4-6 %$prefix, %2
-            %if notcpuflag(fma3) && notcpuflag(fma4)
-                %error use of ``%5%6'' fma instruction in cpuname function: current_function
-            %elif cpuflag(fma4)
-                v%5%6 %1, %2, %3, %4
-            %elifidn %1, %2
-                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
-                %ifnum sizeof%3
-                    v%{5}213%6 %2, %3, %4
-                %else
-                    v%{5}132%6 %2, %4, %3
-                %endif
-            %elifidn %1, %3
-                v%{5}213%6 %3, %2, %4
-            %elifidn %1, %4
-                v%{5}231%6 %4, %2, %3
-            %else
-                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
-            %endif
-        %endmacro
-        %rotate 1
-    %endrep
-    %pop
-%endmacro
-
-FMA4_INSTR fmadd,    pd, ps, sd, ss
-FMA4_INSTR fmaddsub, pd, ps
-FMA4_INSTR fmsub,    pd, ps, sd, ss
-FMA4_INSTR fmsubadd, pd, ps
-FMA4_INSTR fnmadd,   pd, ps, sd, ss
-FMA4_INSTR fnmsub,   pd, ps, sd, ss
-
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
-%ifdef __YASM_VER__
-    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
-        %macro vpbroadcastq 2
-            %if sizeof%1 == 16
-                movddup %1, %2
-            %else
-                vbroadcastsd %1, %2
-            %endif
-        %endmacro
-    %endif
-%endif
diff --git a/android/src/main/libenc/jni/libx264/common/x86/x86util.asm b/android/src/main/libenc/jni/libx264/common/x86/x86util.asm
deleted file mode 100755
index 57d4356..0000000
--- a/android/src/main/libenc/jni/libx264/common/x86/x86util.asm
+++ /dev/null
@@ -1,886 +0,0 @@
-;*****************************************************************************
-;* x86util.asm: x86 utility macros
-;*****************************************************************************
-;* Copyright (C) 2008-2016 x264 project
-;*
-;* Authors: Holger Lubitz <holger@lubitz.org>
-;*          Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%assign FENC_STRIDE 16
-%assign FDEC_STRIDE 32
-
-%assign SIZEOF_PIXEL 1
-%assign SIZEOF_DCTCOEF 2
-%define pixel byte
-%define vpbroadcastdct vpbroadcastw
-%define vpbroadcastpix vpbroadcastb
-%if HIGH_BIT_DEPTH
-    %assign SIZEOF_PIXEL 2
-    %assign SIZEOF_DCTCOEF 4
-    %define pixel word
-    %define vpbroadcastdct vpbroadcastd
-    %define vpbroadcastpix vpbroadcastw
-%endif
-
-%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
-%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE
-
-%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
-
-%macro FIX_STRIDES 1-*
-%if HIGH_BIT_DEPTH
-%rep %0
-    add %1, %1
-    %rotate 1
-%endrep
-%endif
-%endmacro
-
-
-%macro SBUTTERFLY 4
-%ifidn %1, dqqq
-    vperm2i128  m%4, m%2, m%3, q0301 ; punpckh
-    vinserti128 m%2, m%2, xm%3, 1    ; punpckl
-%elif avx_enabled && mmsize >= 16
-    punpckh%1 m%4, m%2, m%3
-    punpckl%1 m%2, m%3
-%else
-    mova      m%4, m%2
-    punpckl%1 m%2, m%3
-    punpckh%1 m%4, m%3
-%endif
-    SWAP %3, %4
-%endmacro
-
-%macro SBUTTERFLY2 4
-    punpckl%1 m%4, m%2, m%3
-    punpckh%1 m%2, m%2, m%3
-    SWAP %2, %4, %3
-%endmacro
-
-%macro TRANSPOSE4x4W 5
-    SBUTTERFLY wd, %1, %2, %5
-    SBUTTERFLY wd, %3, %4, %5
-    SBUTTERFLY dq, %1, %3, %5
-    SBUTTERFLY dq, %2, %4, %5
-    SWAP %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5
-    SBUTTERFLY wd,  %1, %2, %5
-    SBUTTERFLY wd,  %3, %4, %5
-    SBUTTERFLY dq,  %1, %3, %5
-    SBUTTERFLY dq,  %2, %4, %5
-    SBUTTERFLY qdq, %1, %2, %5
-    SBUTTERFLY qdq, %3, %4, %5
-%endmacro
-
-%macro TRANSPOSE4x4D 5
-    SBUTTERFLY dq,  %1, %2, %5
-    SBUTTERFLY dq,  %3, %4, %5
-    SBUTTERFLY qdq, %1, %3, %5
-    SBUTTERFLY qdq, %2, %4, %5
-    SWAP %2, %3
-%endmacro
-
-%macro TRANSPOSE8x8W 9-11
-%if ARCH_X86_64
-    SBUTTERFLY wd,  %1, %2, %9
-    SBUTTERFLY wd,  %3, %4, %9
-    SBUTTERFLY wd,  %5, %6, %9
-    SBUTTERFLY wd,  %7, %8, %9
-    SBUTTERFLY dq,  %1, %3, %9
-    SBUTTERFLY dq,  %2, %4, %9
-    SBUTTERFLY dq,  %5, %7, %9
-    SBUTTERFLY dq,  %6, %8, %9
-    SBUTTERFLY qdq, %1, %5, %9
-    SBUTTERFLY qdq, %2, %6, %9
-    SBUTTERFLY qdq, %3, %7, %9
-    SBUTTERFLY qdq, %4, %8, %9
-    SWAP %2, %5
-    SWAP %4, %7
-%else
-; in:  m0..m7, unless %11 in which case m6 is in %9
-; out: m0..m7, unless %11 in which case m4 is in %10
-; spills into %9 and %10
-%if %0<11
-    movdqa %9, m%7
-%endif
-    SBUTTERFLY wd,  %1, %2, %7
-    movdqa %10, m%2
-    movdqa m%7, %9
-    SBUTTERFLY wd,  %3, %4, %2
-    SBUTTERFLY wd,  %5, %6, %2
-    SBUTTERFLY wd,  %7, %8, %2
-    SBUTTERFLY dq,  %1, %3, %2
-    movdqa %9, m%3
-    movdqa m%2, %10
-    SBUTTERFLY dq,  %2, %4, %3
-    SBUTTERFLY dq,  %5, %7, %3
-    SBUTTERFLY dq,  %6, %8, %3
-    SBUTTERFLY qdq, %1, %5, %3
-    SBUTTERFLY qdq, %2, %6, %3
-    movdqa %10, m%2
-    movdqa m%3, %9
-    SBUTTERFLY qdq, %3, %7, %2
-    SBUTTERFLY qdq, %4, %8, %2
-    SWAP %2, %5
-    SWAP %4, %7
-%if %0<11
-    movdqa m%5, %10
-%endif
-%endif
-%endmacro
-
-%macro WIDEN_SXWD 2
-    punpckhwd m%2, m%1
-    psrad     m%2, 16
-%if cpuflag(sse4)
-    pmovsxwd  m%1, m%1
-%else
-    punpcklwd m%1, m%1
-    psrad     m%1, 16
-%endif
-%endmacro
-
-%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
-%if cpuflag(ssse3)
-    pabsw   %1, %2
-%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
-    pxor    %1, %1
-    pcmpgtw %1, %2
-    pxor    %2, %1
-    psubw   %2, %1
-    SWAP    %1, %2
-%elifidn %1, %2
-    pxor    %3, %3
-    psubw   %3, %1
-    pmaxsw  %1, %3
-%elifid %2
-    pxor    %1, %1
-    psubw   %1, %2
-    pmaxsw  %1, %2
-%elif %0 == 2
-    pxor    %1, %1
-    psubw   %1, %2
-    pmaxsw  %1, %2
-%else
-    mova    %1, %2
-    pxor    %3, %3
-    psubw   %3, %1
-    pmaxsw  %1, %3
-%endif
-%endmacro
-
-%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
-%if cpuflag(ssse3)
-    pabsw   %1, %3
-    pabsw   %2, %4
-%elifidn %1, %3
-    pxor    %5, %5
-    pxor    %6, %6
-    psubw   %5, %1
-    psubw   %6, %2
-    pmaxsw  %1, %5
-    pmaxsw  %2, %6
-%else
-    pxor    %1, %1
-    pxor    %2, %2
-    psubw   %1, %3
-    psubw   %2, %4
-    pmaxsw  %1, %3
-    pmaxsw  %2, %4
-%endif
-%endmacro
-
-%macro ABSB 2
-%if cpuflag(ssse3)
-    pabsb   %1, %1
-%else
-    pxor    %2, %2
-    psubb   %2, %1
-    pminub  %1, %2
-%endif
-%endmacro
-
-%macro ABSD 2-3
-%if cpuflag(ssse3)
-    pabsd   %1, %2
-%else
-    %define %%s %2
-%if %0 == 3
-    mova    %3, %2
-    %define %%s %3
-%endif
-    pxor     %1, %1
-    pcmpgtd  %1, %%s
-    pxor    %%s, %1
-    psubd   %%s, %1
-    SWAP     %1, %%s
-%endif
-%endmacro
-
-%macro PSIGN 3-4
-%if cpuflag(ssse3) && %0 == 4
-    psign%1 %2, %3, %4
-%elif cpuflag(ssse3)
-    psign%1 %2, %3
-%elif %0 == 4
-    pxor    %2, %3, %4
-    psub%1  %2, %4
-%else
-    pxor    %2, %3
-    psub%1  %2, %3
-%endif
-%endmacro
-
-%define PSIGNW PSIGN w,
-%define PSIGND PSIGN d,
-
-%macro SPLATB_LOAD 3
-%if cpuflag(ssse3)
-    movd      %1, [%2-3]
-    pshufb    %1, %3
-%else
-    movd      %1, [%2-3] ;to avoid crossing a cacheline
-    punpcklbw %1, %1
-    SPLATW    %1, %1, 3
-%endif
-%endmacro
-
-%imacro SPLATW 2-3 0
-%if cpuflag(avx2) && %3 == 0
-    vpbroadcastw %1, %2
-%else
-    PSHUFLW      %1, %2, (%3)*q1111
-%if mmsize == 16
-    punpcklqdq   %1, %1
-%endif
-%endif
-%endmacro
-
-%imacro SPLATD 2-3 0
-%if mmsize == 16
-    pshufd %1, %2, (%3)*q1111
-%else
-    pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
-%endif
-%endmacro
-
-%macro CLIPW 3 ;(dst, min, max)
-    pmaxsw %1, %2
-    pminsw %1, %3
-%endmacro
-
-%macro MOVHL 2 ; dst, src
-%ifidn %1, %2
-    punpckhqdq %1, %2
-%elif cpuflag(avx)
-    punpckhqdq %1, %2, %2
-%elif cpuflag(sse4)
-    pshufd     %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
-%else
-    movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
-%endif
-%endmacro
-
-%macro HADDD 2 ; sum junk
-%if sizeof%1 == 32
-%define %2 xmm%2
-    vextracti128 %2, %1, 1
-%define %1 xmm%1
-    paddd   %1, %2
-%endif
-%if mmsize >= 16
-    MOVHL   %2, %1
-    paddd   %1, %2
-%endif
-%if cpuflag(xop) && sizeof%1 == 16
-    vphadddq %1, %1
-%else
-    PSHUFLW %2, %1, q0032
-    paddd   %1, %2
-%endif
-%undef %1
-%undef %2
-%endmacro
-
-%macro HADDW 2 ; reg, tmp
-%if cpuflag(xop) && sizeof%1 == 16
-    vphaddwq  %1, %1
-    MOVHL     %2, %1
-    paddd     %1, %2
-%else
-    pmaddwd   %1, [pw_1]
-    HADDD     %1, %2
-%endif
-%endmacro
-
-%macro HADDUWD 2
-%if cpuflag(xop) && sizeof%1 == 16
-    vphadduwd %1, %1
-%else
-    psrld %2, %1, 16
-    pslld %1, 16
-    psrld %1, 16
-    paddd %1, %2
-%endif
-%endmacro
-
-%macro HADDUW 2
-%if cpuflag(xop) && sizeof%1 == 16
-    vphadduwq %1, %1
-    MOVHL     %2, %1
-    paddd     %1, %2
-%else
-    HADDUWD   %1, %2
-    HADDD     %1, %2
-%endif
-%endmacro
-
-%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
-; AVX2 version uses a precalculated extra input that
-; can be re-used across calls
-%if sizeof%1==32
-                                 ; %3 = abcdefgh ijklmnop (lower address)
-                                 ; %2 = ABCDEFGH IJKLMNOP (higher address)
-;   vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
-%if %4 < 16
-    palignr    %1, %5, %3, %4    ; %1 = bcdefghi jklmnopA
-%else
-    palignr    %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO
-%endif
-%elif cpuflag(ssse3)
-    %if %0==5
-        palignr %1, %2, %3, %4
-    %else
-        palignr %1, %2, %3
-    %endif
-%else
-    %define %%dst %1
-    %if %0==5
-        %ifnidn %1, %2
-            mova %%dst, %2
-        %endif
-        %rotate 1
-    %endif
-    %ifnidn %4, %2
-        mova %4, %2
-    %endif
-    %if mmsize==8
-        psllq  %%dst, (8-%3)*8
-        psrlq  %4, %3*8
-    %else
-        pslldq %%dst, 16-%3
-        psrldq %4, %3
-    %endif
-    por %%dst, %4
-%endif
-%endmacro
-
-%macro PSHUFLW 1+
-    %if mmsize == 8
-        pshufw %1
-    %else
-        pshuflw %1
-    %endif
-%endmacro
-
-; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
-; values shifted in are undefined
-; faster if dst==src
-%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
-%define PSRLPIX PSXLPIX r,  1, ;dst, src, shift
-%macro PSXLPIX 5
-    %if mmsize == 8
-        %if %5&1
-            ps%1lq %3, %4, %5*8
-        %else
-            pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff
-        %endif
-    %else
-        ps%1ldq %3, %4, %5*2
-    %endif
-%endmacro
-
-%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
-%ifnum %5
-    pand   m%3, m%5, m%4 ; src .. y6 .. y4
-    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
-%else
-    mova   m%1, %5
-    pand   m%3, m%1, m%4 ; src .. y6 .. y4
-    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
-%endif
-    psrlw  m%2, 8        ; dst .. y7 .. y5
-    psrlw  m%4, 8        ; src .. y7 .. y5
-%endmacro
-
-%macro SUMSUB_BA 3-4
-%if %0==3
-    padd%1  m%2, m%3
-    padd%1  m%3, m%3
-    psub%1  m%3, m%2
-%elif avx_enabled
-    padd%1  m%4, m%2, m%3
-    psub%1  m%3, m%2
-    SWAP    %2, %4
-%else
-    mova    m%4, m%2
-    padd%1  m%2, m%3
-    psub%1  m%3, m%4
-%endif
-%endmacro
-
-%macro SUMSUB_BADC 5-6
-%if %0==6
-    SUMSUB_BA %1, %2, %3, %6
-    SUMSUB_BA %1, %4, %5, %6
-%else
-    padd%1  m%2, m%3
-    padd%1  m%4, m%5
-    padd%1  m%3, m%3
-    padd%1  m%5, m%5
-    psub%1  m%3, m%2
-    psub%1  m%5, m%4
-%endif
-%endmacro
-
-%macro HADAMARD4_V 4+
-    SUMSUB_BADC w, %1, %2, %3, %4
-    SUMSUB_BADC w, %1, %3, %2, %4
-%endmacro
-
-%macro HADAMARD8_V 8+
-    SUMSUB_BADC w, %1, %2, %3, %4
-    SUMSUB_BADC w, %5, %6, %7, %8
-    SUMSUB_BADC w, %1, %3, %2, %4
-    SUMSUB_BADC w, %5, %7, %6, %8
-    SUMSUB_BADC w, %1, %5, %2, %6
-    SUMSUB_BADC w, %3, %7, %4, %8
-%endmacro
-
-%macro TRANS_SSE2 5-6
-; TRANSPOSE2x2
-; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
-; %2: ord/unord (for compat with sse4, unused)
-; %3/%4: source regs
-; %5/%6: tmp regs
-%ifidn %1, d
-%define mask [mask_10]
-%define shift 16
-%elifidn %1, q
-%define mask [mask_1100]
-%define shift 32
-%endif
-%if %0==6 ; less dependency if we have two tmp
-    mova   m%5, mask   ; ff00
-    mova   m%6, m%4    ; x5x4
-    psll%1 m%4, shift  ; x4..
-    pand   m%6, m%5    ; x5..
-    pandn  m%5, m%3    ; ..x0
-    psrl%1 m%3, shift  ; ..x1
-    por    m%4, m%5    ; x4x0
-    por    m%3, m%6    ; x5x1
-%else ; more dependency, one insn less. sometimes faster, sometimes not
-    mova   m%5, m%4    ; x5x4
-    psll%1 m%4, shift  ; x4..
-    pxor   m%4, m%3    ; (x4^x1)x0
-    pand   m%4, mask   ; (x4^x1)..
-    pxor   m%3, m%4    ; x4x0
-    psrl%1 m%4, shift  ; ..(x1^x4)
-    pxor   m%5, m%4    ; x5x1
-    SWAP   %4, %3, %5
-%endif
-%endmacro
-
-%macro TRANS_SSE4 5-6 ; see above
-%ifidn %1, d
-%ifidn %2, ord
-    psrl%1  m%5, m%3, 16
-    pblendw m%5, m%4, q2222
-    psll%1  m%4, 16
-    pblendw m%4, m%3, q1111
-    SWAP     %3, %5
-%else
-%if avx_enabled
-    pblendw m%5, m%3, m%4, q2222
-    SWAP     %3, %5
-%else
-    mova    m%5, m%3
-    pblendw m%3, m%4, q2222
-%endif
-    psll%1  m%4, 16
-    psrl%1  m%5, 16
-    por     m%4, m%5
-%endif
-%elifidn %1, q
-    shufps m%5, m%3, m%4, q3131
-    shufps m%3, m%3, m%4, q2020
-    SWAP    %4, %5
-%endif
-%endmacro
-
-%macro TRANS_XOP 5-6
-%ifidn %1, d
-    vpperm m%5, m%3, m%4, [transd_shuf1]
-    vpperm m%3, m%3, m%4, [transd_shuf2]
-%elifidn %1, q
-    shufps m%5, m%3, m%4, q3131
-    shufps m%3, m%4, q2020
-%endif
-    SWAP    %4, %5
-%endmacro
-
-%macro HADAMARD 5-6
-; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
-; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
-; %3/%4: regs
-; %5(%6): tmpregs
-%if %1!=0 ; have to reorder stuff for horizontal op
-    %ifidn %2, sumsub
-        %define ORDER ord
-        ; sumsub needs order because a-b != b-a unless a=b
-    %else
-        %define ORDER unord
-        ; if we just max, order doesn't matter (allows pblendw+or in sse4)
-    %endif
-    %if %1==1
-        TRANS d, ORDER, %3, %4, %5, %6
-    %elif %1==2
-        %if mmsize==8
-            SBUTTERFLY dq, %3, %4, %5
-        %else
-            TRANS q, ORDER, %3, %4, %5, %6
-        %endif
-    %elif %1==4
-        SBUTTERFLY qdq, %3, %4, %5
-    %elif %1==8
-        SBUTTERFLY dqqq, %3, %4, %5
-    %endif
-%endif
-%ifidn %2, sumsub
-    SUMSUB_BA w, %3, %4, %5
-%else
-    %ifidn %2, amax
-        %if %0==6
-            ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
-        %else
-            ABSW m%3, m%3, m%5
-            ABSW m%4, m%4, m%5
-        %endif
-    %endif
-    pmaxsw m%3, m%4
-%endif
-%endmacro
-
-
-%macro HADAMARD2_2D 6-7 sumsub
-    HADAMARD 0, sumsub, %1, %2, %5
-    HADAMARD 0, sumsub, %3, %4, %5
-    SBUTTERFLY %6, %1, %2, %5
-%ifnum %7
-    HADAMARD 0, amax, %1, %2, %5, %7
-%else
-    HADAMARD 0, %7, %1, %2, %5
-%endif
-    SBUTTERFLY %6, %3, %4, %5
-%ifnum %7
-    HADAMARD 0, amax, %3, %4, %5, %7
-%else
-    HADAMARD 0, %7, %3, %4, %5
-%endif
-%endmacro
-
-%macro HADAMARD4_2D 5-6 sumsub
-    HADAMARD2_2D %1, %2, %3, %4, %5, wd
-    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
-    SWAP %2, %3
-%endmacro
-
-%macro HADAMARD4_2D_SSE 5-6 sumsub
-    HADAMARD  0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
-    HADAMARD  0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
-    SBUTTERFLY   wd, %1, %2, %5     ; %1: m0 1+0 %2: m1 1+0
-    SBUTTERFLY   wd, %3, %4, %5     ; %3: m0 3+2 %4: m1 3+2
-    HADAMARD2_2D %1, %3, %2, %4, %5, dq
-    SBUTTERFLY  qdq, %1, %2, %5
-    HADAMARD  0, %6, %1, %2, %5     ; 2nd H m1/m0 row 0+1
-    SBUTTERFLY  qdq, %3, %4, %5
-    HADAMARD  0, %6, %3, %4, %5     ; 2nd H m1/m0 row 2+3
-%endmacro
-
-%macro HADAMARD8_2D 9-10 sumsub
-    HADAMARD2_2D %1, %2, %3, %4, %9, wd
-    HADAMARD2_2D %5, %6, %7, %8, %9, wd
-    HADAMARD2_2D %1, %3, %2, %4, %9, dq
-    HADAMARD2_2D %5, %7, %6, %8, %9, dq
-    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
-    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
-%ifnidn %10, amax
-    SWAP %2, %5
-    SWAP %4, %7
-%endif
-%endmacro
-
-; doesn't include the "pmaddubsw hmul_8p" pass
-%macro HADAMARD8_2D_HMUL 10
-    HADAMARD4_V %1, %2, %3, %4, %9
-    HADAMARD4_V %5, %6, %7, %8, %9
-    SUMSUB_BADC w, %1, %5, %2, %6, %9
-    HADAMARD 2, sumsub, %1, %5, %9, %10
-    HADAMARD 2, sumsub, %2, %6, %9, %10
-    SUMSUB_BADC w, %3, %7, %4, %8, %9
-    HADAMARD 2, sumsub, %3, %7, %9, %10
-    HADAMARD 2, sumsub, %4, %8, %9, %10
-    HADAMARD 1, amax, %1, %5, %9, %10
-    HADAMARD 1, amax, %2, %6, %9, %5
-    HADAMARD 1, amax, %3, %7, %9, %5
-    HADAMARD 1, amax, %4, %8, %9, %5
-%endmacro
-
-%macro SUMSUB2_AB 4
-%if cpuflag(xop)
-    pmacs%1%1 m%4, m%3, [p%1_m2], m%2
-    pmacs%1%1 m%2, m%2, [p%1_2], m%3
-%elifnum %3
-    psub%1  m%4, m%2, m%3
-    psub%1  m%4, m%3
-    padd%1  m%2, m%2
-    padd%1  m%2, m%3
-%else
-    mova    m%4, m%2
-    padd%1  m%2, m%2
-    padd%1  m%2, %3
-    psub%1  m%4, %3
-    psub%1  m%4, %3
-%endif
-%endmacro
-
-%macro SUMSUBD2_AB 5
-%ifnum %4
-    psra%1  m%5, m%2, 1  ; %3: %3>>1
-    psra%1  m%4, m%3, 1  ; %2: %2>>1
-    padd%1  m%4, m%2     ; %3: %3>>1+%2
-    psub%1  m%5, m%3     ; %2: %2>>1-%3
-    SWAP     %2, %5
-    SWAP     %3, %4
-%else
-    mova    %5, m%2
-    mova    %4, m%3
-    psra%1  m%3, 1  ; %3: %3>>1
-    psra%1  m%2, 1  ; %2: %2>>1
-    padd%1  m%3, %5 ; %3: %3>>1+%2
-    psub%1  m%2, %4 ; %2: %2>>1-%3
-%endif
-%endmacro
-
-%macro DCT4_1D 5
-%ifnum %5
-    SUMSUB_BADC w, %4, %1, %3, %2, %5
-    SUMSUB_BA   w, %3, %4, %5
-    SUMSUB2_AB  w, %1, %2, %5
-    SWAP %1, %3, %4, %5, %2
-%else
-    SUMSUB_BADC w, %4, %1, %3, %2
-    SUMSUB_BA   w, %3, %4
-    mova     [%5], m%2
-    SUMSUB2_AB  w, %1, [%5], %2
-    SWAP %1, %3, %4, %2
-%endif
-%endmacro
-
-%macro IDCT4_1D 6-7
-%ifnum %6
-    SUMSUBD2_AB %1, %3, %5, %7, %6
-    ; %3: %3>>1-%5 %5: %3+%5>>1
-    SUMSUB_BA   %1, %4, %2, %7
-    ; %4: %2+%4 %2: %2-%4
-    SUMSUB_BADC %1, %5, %4, %3, %2, %7
-    ; %5: %2+%4 + (%3+%5>>1)
-    ; %4: %2+%4 - (%3+%5>>1)
-    ; %3: %2-%4 + (%3>>1-%5)
-    ; %2: %2-%4 - (%3>>1-%5)
-%else
-%ifidn %1, w
-    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
-%else
-    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
-%endif
-    SUMSUB_BA   %1, %4, %2
-    SUMSUB_BADC %1, %5, %4, %3, %2
-%endif
-    SWAP %2, %5, %4
-    ; %2: %2+%4 + (%3+%5>>1) row0
-    ; %3: %2-%4 + (%3>>1-%5) row1
-    ; %4: %2-%4 - (%3>>1-%5) row2
-    ; %5: %2+%4 - (%3+%5>>1) row3
-%endmacro
-
-
-%macro LOAD_DIFF 5-6 1
-%if HIGH_BIT_DEPTH
-%if %6 ; %5 aligned?
-    mova       %1, %4
-    psubw      %1, %5
-%else
-    movu       %1, %4
-    movu       %2, %5
-    psubw      %1, %2
-%endif
-%else ; !HIGH_BIT_DEPTH
-%ifidn %3, none
-    movh       %1, %4
-    movh       %2, %5
-    punpcklbw  %1, %2
-    punpcklbw  %2, %2
-    psubw      %1, %2
-%else
-    movh       %1, %4
-    punpcklbw  %1, %3
-    movh       %2, %5
-    punpcklbw  %2, %3
-    psubw      %1, %2
-%endif
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
-%if BIT_DEPTH == 8 && cpuflag(ssse3)
-    movh       m%2, [%8+%1*FDEC_STRIDE]
-    movh       m%1, [%7+%1*FENC_STRIDE]
-    punpcklbw  m%1, m%2
-    movh       m%3, [%8+%2*FDEC_STRIDE]
-    movh       m%2, [%7+%2*FENC_STRIDE]
-    punpcklbw  m%2, m%3
-    movh       m%4, [%8+%3*FDEC_STRIDE]
-    movh       m%3, [%7+%3*FENC_STRIDE]
-    punpcklbw  m%3, m%4
-    movh       m%5, [%8+%4*FDEC_STRIDE]
-    movh       m%4, [%7+%4*FENC_STRIDE]
-    punpcklbw  m%4, m%5
-    pmaddubsw  m%1, m%6
-    pmaddubsw  m%2, m%6
-    pmaddubsw  m%3, m%6
-    pmaddubsw  m%4, m%6
-%else
-    LOAD_DIFF  m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
-    LOAD_DIFF  m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
-    LOAD_DIFF  m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
-    LOAD_DIFF  m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
-%endif
-%endmacro
-
-%macro STORE_DCT 6
-    movq   [%5+%6+ 0], m%1
-    movq   [%5+%6+ 8], m%2
-    movq   [%5+%6+16], m%3
-    movq   [%5+%6+24], m%4
-    movhps [%5+%6+32], m%1
-    movhps [%5+%6+40], m%2
-    movhps [%5+%6+48], m%3
-    movhps [%5+%6+56], m%4
-%endmacro
-
-%macro STORE_IDCT 4
-    movhps [r0-4*FDEC_STRIDE], %1
-    movh   [r0-3*FDEC_STRIDE], %1
-    movhps [r0-2*FDEC_STRIDE], %2
-    movh   [r0-1*FDEC_STRIDE], %2
-    movhps [r0+0*FDEC_STRIDE], %3
-    movh   [r0+1*FDEC_STRIDE], %3
-    movhps [r0+2*FDEC_STRIDE], %4
-    movh   [r0+3*FDEC_STRIDE], %4
-%endmacro
-
-%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
-    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9],      %11
-    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3],   %11
-    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
-    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5],   %11
-%if %10
-    lea %8, [%8+4*r1]
-    lea %9, [%9+4*r3]
-%endif
-%endmacro
-
-; 2xdst, 2xtmp, 2xsrcrow
-%macro LOAD_DIFF16x2_AVX2 6
-    pmovzxbw m%1, [r1+%5*FENC_STRIDE]
-    pmovzxbw m%2, [r1+%6*FENC_STRIDE]
-    pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
-    pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
-    psubw    m%1, m%3
-    psubw    m%2, m%4
-%endmacro
-
-%macro DIFFx2 6-7
-    movh       %3, %5
-    punpcklbw  %3, %4
-    psraw      %1, 6
-    paddsw     %1, %3
-    movh       %3, %6
-    punpcklbw  %3, %4
-    psraw      %2, 6
-    paddsw     %2, %3
-    packuswb   %2, %1
-%endmacro
-
-; (high depth) in: %1, %2, min to clip, max to clip, mem128
-; in: %1, tmp, %3, mem64
-%macro STORE_DIFF 4-5
-%if HIGH_BIT_DEPTH
-    psrad      %1, 6
-    psrad      %2, 6
-    packssdw   %1, %2
-    paddw      %1, %5
-    CLIPW      %1, %3, %4
-    mova       %5, %1
-%else
-    movh       %2, %4
-    punpcklbw  %2, %3
-    psraw      %1, 6
-    paddsw     %1, %2
-    packuswb   %1, %1
-    movh       %4, %1
-%endif
-%endmacro
-
-%macro SHUFFLE_MASK_W 8
-    %rep 8
-        %if %1>=0x80
-            db %1, %1
-        %else
-            db %1*2
-            db %1*2+1
-        %endif
-        %rotate 1
-    %endrep
-%endmacro
-
-; instruction, accum, input, iteration (zero to swap, nonzero to add)
-%macro ACCUM 4
-%if %4
-    %1        m%2, m%3
-%else
-    SWAP       %2, %3
-%endif
-%endmacro
diff --git a/android/src/main/libenc/jni/libx264/config.guess b/android/src/main/libenc/jni/libx264/config.guess
deleted file mode 100755
index 375fa2a..0000000
--- a/android/src/main/libenc/jni/libx264/config.guess
+++ /dev/null
@@ -1,1540 +0,0 @@
-#! /bin/sh
-# Attempt to guess a canonical system name.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011, 2012 Free Software Foundation, Inc.
-
-timestamp='2012-09-25'
-
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-
-# Originally written by Per Bothner.  Please send patches (context
-# diff format) to <config-patches@gnu.org> and include a ChangeLog
-# entry.
-#
-# This script attempts to guess a canonical system name similar to
-# config.sub.  If it succeeds, it prints the system name on stdout, and
-# exits with 0.  Otherwise, it exits with 1.
-#
-# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
-
-me=`echo "$0" | sed -e 's,.*/,,'`
-
-usage="\
-Usage: $0 [OPTION]
-
-Output the configuration name of the system \`$me' is run on.
-
-Operation modes:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.guess ($timestamp)
-
-Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-Free Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help" >&2
-       exit 1 ;;
-    * )
-       break ;;
-  esac
-done
-
-if test $# != 0; then
-  echo "$me: too many arguments$help" >&2
-  exit 1
-fi
-
-trap 'exit 1' 1 2 15
-
-# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
-# compiler to aid in system detection is discouraged as it requires
-# temporary files to be created and, as you can see below, it is a
-# headache to deal with in a portable fashion.
-
-# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
-# use `HOST_CC' if defined, but it is deprecated.
-
-# Portable tmp directory creation inspired by the Autoconf team.
-
-set_cc_for_build='
-trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
-trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
-: ${TMPDIR=/tmp} ;
- { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
- { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
- { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
- { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
-dummy=$tmp/dummy ;
-tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
-case $CC_FOR_BUILD,$HOST_CC,$CC in
- ,,)    echo "int x;" > $dummy.c ;
-	for c in cc gcc c89 c99 ; do
-	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
-	     CC_FOR_BUILD="$c"; break ;
-	  fi ;
-	done ;
-	if test x"$CC_FOR_BUILD" = x ; then
-	  CC_FOR_BUILD=no_compiler_found ;
-	fi
-	;;
- ,,*)   CC_FOR_BUILD=$CC ;;
- ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
-esac ; set_cc_for_build= ;'
-
-# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 1994-08-24)
-if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
-	PATH=$PATH:/.attbin ; export PATH
-fi
-
-UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
-UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
-UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
-UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
-
-# Note: order is significant - the case branches are not exclusive.
-
-case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
-    *:NetBSD:*:*)
-	# NetBSD (nbsd) targets should (where applicable) match one or
-	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
-	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
-	# switched to ELF, *-*-netbsd* would select the old
-	# object file format.  This provides both forward
-	# compatibility and a consistent mechanism for selecting the
-	# object file format.
-	#
-	# Note: NetBSD doesn't particularly care about the vendor
-	# portion of the name.  We always set it to "unknown".
-	sysctl="sysctl -n hw.machine_arch"
-	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
-	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
-	case "${UNAME_MACHINE_ARCH}" in
-	    armeb) machine=armeb-unknown ;;
-	    arm*) machine=arm-unknown ;;
-	    sh3el) machine=shl-unknown ;;
-	    sh3eb) machine=sh-unknown ;;
-	    sh5el) machine=sh5le-unknown ;;
-	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
-	esac
-	# The Operating System including object format, if it has switched
-	# to ELF recently, or will in the future.
-	case "${UNAME_MACHINE_ARCH}" in
-	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
-		eval $set_cc_for_build
-		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
-			| grep -q __ELF__
-		then
-		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
-		    # Return netbsd for either.  FIX?
-		    os=netbsd
-		else
-		    os=netbsdelf
-		fi
-		;;
-	    *)
-		os=netbsd
-		;;
-	esac
-	# The OS release
-	# Debian GNU/NetBSD machines have a different userland, and
-	# thus, need a distinct triplet. However, they do not need
-	# kernel version information, so it can be replaced with a
-	# suitable tag, in the style of linux-gnu.
-	case "${UNAME_VERSION}" in
-	    Debian*)
-		release='-gnu'
-		;;
-	    *)
-		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-		;;
-	esac
-	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
-	# contains redundant information, the shorter form:
-	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-	echo "${machine}-${os}${release}"
-	exit ;;
-    *:Bitrig:*:*)
-	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
-	echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
-	exit ;;
-    *:OpenBSD:*:*)
-	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
-	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
-	exit ;;
-    *:ekkoBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
-	exit ;;
-    *:SolidBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
-	exit ;;
-    macppc:MirBSD:*:*)
-	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
-	exit ;;
-    *:MirBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
-	exit ;;
-    alpha:OSF1:*:*)
-	case $UNAME_RELEASE in
-	*4.0)
-		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
-		;;
-	*5.*)
-		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
-		;;
-	esac
-	# According to Compaq, /usr/sbin/psrinfo has been available on
-	# OSF/1 and Tru64 systems produced since 1995.  I hope that
-	# covers most systems running today.  This code pipes the CPU
-	# types through head -n 1, so we only detect the type of CPU 0.
-	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
-	case "$ALPHA_CPU_TYPE" in
-	    "EV4 (21064)")
-		UNAME_MACHINE="alpha" ;;
-	    "EV4.5 (21064)")
-		UNAME_MACHINE="alpha" ;;
-	    "LCA4 (21066/21068)")
-		UNAME_MACHINE="alpha" ;;
-	    "EV5 (21164)")
-		UNAME_MACHINE="alphaev5" ;;
-	    "EV5.6 (21164A)")
-		UNAME_MACHINE="alphaev56" ;;
-	    "EV5.6 (21164PC)")
-		UNAME_MACHINE="alphapca56" ;;
-	    "EV5.7 (21164PC)")
-		UNAME_MACHINE="alphapca57" ;;
-	    "EV6 (21264)")
-		UNAME_MACHINE="alphaev6" ;;
-	    "EV6.7 (21264A)")
-		UNAME_MACHINE="alphaev67" ;;
-	    "EV6.8CB (21264C)")
-		UNAME_MACHINE="alphaev68" ;;
-	    "EV6.8AL (21264B)")
-		UNAME_MACHINE="alphaev68" ;;
-	    "EV6.8CX (21264D)")
-		UNAME_MACHINE="alphaev68" ;;
-	    "EV6.9A (21264/EV69A)")
-		UNAME_MACHINE="alphaev69" ;;
-	    "EV7 (21364)")
-		UNAME_MACHINE="alphaev7" ;;
-	    "EV7.9 (21364A)")
-		UNAME_MACHINE="alphaev79" ;;
-	esac
-	# A Pn.n version is a patched version.
-	# A Vn.n version is a released version.
-	# A Tn.n version is a released field test version.
-	# A Xn.n version is an unreleased experimental baselevel.
-	# 1.2 uses "1.2" for uname -r.
-	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
-	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
-	exitcode=$?
-	trap '' 0
-	exit $exitcode ;;
-    Alpha\ *:Windows_NT*:*)
-	# How do we know it's Interix rather than the generic POSIX subsystem?
-	# Should we change UNAME_MACHINE based on the output of uname instead
-	# of the specific Alpha model?
-	echo alpha-pc-interix
-	exit ;;
-    21064:Windows_NT:50:3)
-	echo alpha-dec-winnt3.5
-	exit ;;
-    Amiga*:UNIX_System_V:4.0:*)
-	echo m68k-unknown-sysv4
-	exit ;;
-    *:[Aa]miga[Oo][Ss]:*:*)
-	echo ${UNAME_MACHINE}-unknown-amigaos
-	exit ;;
-    *:[Mm]orph[Oo][Ss]:*:*)
-	echo ${UNAME_MACHINE}-unknown-morphos
-	exit ;;
-    *:OS/390:*:*)
-	echo i370-ibm-openedition
-	exit ;;
-    *:z/VM:*:*)
-	echo s390-ibm-zvmoe
-	exit ;;
-    *:OS400:*:*)
-	echo powerpc-ibm-os400
-	exit ;;
-    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
-	echo arm-acorn-riscix${UNAME_RELEASE}
-	exit ;;
-    arm*:riscos:*:*|arm*:RISCOS:*:*)
-	echo arm-unknown-riscos
-	exit ;;
-    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
-	echo hppa1.1-hitachi-hiuxmpp
-	exit ;;
-    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
-	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
-	if test "`(/bin/universe) 2>/dev/null`" = att ; then
-		echo pyramid-pyramid-sysv3
-	else
-		echo pyramid-pyramid-bsd
-	fi
-	exit ;;
-    NILE*:*:*:dcosx)
-	echo pyramid-pyramid-svr4
-	exit ;;
-    DRS?6000:unix:4.0:6*)
-	echo sparc-icl-nx6
-	exit ;;
-    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
-	case `/usr/bin/uname -p` in
-	    sparc) echo sparc-icl-nx7; exit ;;
-	esac ;;
-    s390x:SunOS:*:*)
-	echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4H:SunOS:5.*:*)
-	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
-	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
-	echo i386-pc-auroraux${UNAME_RELEASE}
-	exit ;;
-    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
-	eval $set_cc_for_build
-	SUN_ARCH="i386"
-	# If there is a compiler, see if it is configured for 64-bit objects.
-	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
-	# This test works for both compilers.
-	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
-	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_64BIT_ARCH >/dev/null
-	    then
-		SUN_ARCH="x86_64"
-	    fi
-	fi
-	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4*:SunOS:6*:*)
-	# According to config.sub, this is the proper way to canonicalize
-	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
-	# it's likely to be more like Solaris than SunOS4.
-	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    sun4*:SunOS:*:*)
-	case "`/usr/bin/arch -k`" in
-	    Series*|S4*)
-		UNAME_RELEASE=`uname -v`
-		;;
-	esac
-	# Japanese Language versions have a version number like `4.1.3-JL'.
-	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
-	exit ;;
-    sun3*:SunOS:*:*)
-	echo m68k-sun-sunos${UNAME_RELEASE}
-	exit ;;
-    sun*:*:4.2BSD:*)
-	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
-	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
-	case "`/bin/arch`" in
-	    sun3)
-		echo m68k-sun-sunos${UNAME_RELEASE}
-		;;
-	    sun4)
-		echo sparc-sun-sunos${UNAME_RELEASE}
-		;;
-	esac
-	exit ;;
-    aushp:SunOS:*:*)
-	echo sparc-auspex-sunos${UNAME_RELEASE}
-	exit ;;
-    # The situation for MiNT is a little confusing.  The machine name
-    # can be virtually everything (everything which is not
-    # "atarist" or "atariste" at least should have a processor
-    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
-    # to the lowercase version "mint" (or "freemint").  Finally
-    # the system name "TOS" denotes a system which is actually not
-    # MiNT.  But MiNT is downward compatible to TOS, so this should
-    # be no problem.
-    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint${UNAME_RELEASE}
-	exit ;;
-    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint${UNAME_RELEASE}
-	exit ;;
-    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
-	echo m68k-atari-mint${UNAME_RELEASE}
-	exit ;;
-    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
-	echo m68k-milan-mint${UNAME_RELEASE}
-	exit ;;
-    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
-	echo m68k-hades-mint${UNAME_RELEASE}
-	exit ;;
-    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
-	echo m68k-unknown-mint${UNAME_RELEASE}
-	exit ;;
-    m68k:machten:*:*)
-	echo m68k-apple-machten${UNAME_RELEASE}
-	exit ;;
-    powerpc:machten:*:*)
-	echo powerpc-apple-machten${UNAME_RELEASE}
-	exit ;;
-    RISC*:Mach:*:*)
-	echo mips-dec-mach_bsd4.3
-	exit ;;
-    RISC*:ULTRIX:*:*)
-	echo mips-dec-ultrix${UNAME_RELEASE}
-	exit ;;
-    VAX*:ULTRIX*:*:*)
-	echo vax-dec-ultrix${UNAME_RELEASE}
-	exit ;;
-    2020:CLIX:*:* | 2430:CLIX:*:*)
-	echo clipper-intergraph-clix${UNAME_RELEASE}
-	exit ;;
-    mips:*:*:UMIPS | mips:*:*:RISCos)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-#ifdef __cplusplus
-#include <stdio.h>  /* for printf() prototype */
-	int main (int argc, char *argv[]) {
-#else
-	int main (argc, argv) int argc; char *argv[]; {
-#endif
-	#if defined (host_mips) && defined (MIPSEB)
-	#if defined (SYSTYPE_SYSV)
-	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_SVR4)
-	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
-	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
-	#endif
-	#endif
-	  exit (-1);
-	}
-EOF
-	$CC_FOR_BUILD -o $dummy $dummy.c &&
-	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
-	  SYSTEM_NAME=`$dummy $dummyarg` &&
-	    { echo "$SYSTEM_NAME"; exit; }
-	echo mips-mips-riscos${UNAME_RELEASE}
-	exit ;;
-    Motorola:PowerMAX_OS:*:*)
-	echo powerpc-motorola-powermax
-	exit ;;
-    Motorola:*:4.3:PL8-*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:Power_UNIX:*:*)
-	echo powerpc-harris-powerunix
-	exit ;;
-    m88k:CX/UX:7*:*)
-	echo m88k-harris-cxux7
-	exit ;;
-    m88k:*:4*:R4*)
-	echo m88k-motorola-sysv4
-	exit ;;
-    m88k:*:3*:R3*)
-	echo m88k-motorola-sysv3
-	exit ;;
-    AViiON:dgux:*:*)
-	# DG/UX returns AViiON for all architectures
-	UNAME_PROCESSOR=`/usr/bin/uname -p`
-	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
-	then
-	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
-	       [ ${TARGET_BINARY_INTERFACE}x = x ]
-	    then
-		echo m88k-dg-dgux${UNAME_RELEASE}
-	    else
-		echo m88k-dg-dguxbcs${UNAME_RELEASE}
-	    fi
-	else
-	    echo i586-dg-dgux${UNAME_RELEASE}
-	fi
-	exit ;;
-    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
-	echo m88k-dolphin-sysv3
-	exit ;;
-    M88*:*:R3*:*)
-	# Delta 88k system running SVR3
-	echo m88k-motorola-sysv3
-	exit ;;
-    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
-	echo m88k-tektronix-sysv3
-	exit ;;
-    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
-	echo m68k-tektronix-bsd
-	exit ;;
-    *:IRIX*:*:*)
-	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
-	exit ;;
-    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
-	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
-    i*86:AIX:*:*)
-	echo i386-ibm-aix
-	exit ;;
-    ia64:AIX:*:*)
-	if [ -x /usr/bin/oslevel ] ; then
-		IBM_REV=`/usr/bin/oslevel`
-	else
-		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
-	fi
-	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
-	exit ;;
-    *:AIX:2:3)
-	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-		eval $set_cc_for_build
-		sed 's/^		//' << EOF >$dummy.c
-		#include <sys/systemcfg.h>
-
-		main()
-			{
-			if (!__power_pc())
-				exit(1);
-			puts("powerpc-ibm-aix3.2.5");
-			exit(0);
-			}
-EOF
-		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
-		then
-			echo "$SYSTEM_NAME"
-		else
-			echo rs6000-ibm-aix3.2.5
-		fi
-	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
-		echo rs6000-ibm-aix3.2.4
-	else
-		echo rs6000-ibm-aix3.2
-	fi
-	exit ;;
-    *:AIX:*:[4567])
-	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
-	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
-		IBM_ARCH=rs6000
-	else
-		IBM_ARCH=powerpc
-	fi
-	if [ -x /usr/bin/oslevel ] ; then
-		IBM_REV=`/usr/bin/oslevel`
-	else
-		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
-	fi
-	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
-	exit ;;
-    *:AIX:*:*)
-	echo rs6000-ibm-aix
-	exit ;;
-    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
-	echo romp-ibm-bsd4.4
-	exit ;;
-    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
-	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
-	exit ;;                             # report: romp-ibm BSD 4.3
-    *:BOSX:*:*)
-	echo rs6000-bull-bosx
-	exit ;;
-    DPX/2?00:B.O.S.:*:*)
-	echo m68k-bull-sysv3
-	exit ;;
-    9000/[34]??:4.3bsd:1.*:*)
-	echo m68k-hp-bsd
-	exit ;;
-    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
-	echo m68k-hp-bsd4.4
-	exit ;;
-    9000/[34678]??:HP-UX:*:*)
-	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-	case "${UNAME_MACHINE}" in
-	    9000/31? )            HP_ARCH=m68000 ;;
-	    9000/[34]?? )         HP_ARCH=m68k ;;
-	    9000/[678][0-9][0-9])
-		if [ -x /usr/bin/getconf ]; then
-		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
-		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
-		    case "${sc_cpu_version}" in
-		      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
-		      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
-		      532)                      # CPU_PA_RISC2_0
-			case "${sc_kernel_bits}" in
-			  32) HP_ARCH="hppa2.0n" ;;
-			  64) HP_ARCH="hppa2.0w" ;;
-			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
-			esac ;;
-		    esac
-		fi
-		if [ "${HP_ARCH}" = "" ]; then
-		    eval $set_cc_for_build
-		    sed 's/^		//' << EOF >$dummy.c
-
-		#define _HPUX_SOURCE
-		#include <stdlib.h>
-		#include <unistd.h>
-
-		int main ()
-		{
-		#if defined(_SC_KERNEL_BITS)
-		    long bits = sysconf(_SC_KERNEL_BITS);
-		#endif
-		    long cpu  = sysconf (_SC_CPU_VERSION);
-
-		    switch (cpu)
-			{
-			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
-			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
-			case CPU_PA_RISC2_0:
-		#if defined(_SC_KERNEL_BITS)
-			    switch (bits)
-				{
-				case 64: puts ("hppa2.0w"); break;
-				case 32: puts ("hppa2.0n"); break;
-				default: puts ("hppa2.0"); break;
-				} break;
-		#else  /* !defined(_SC_KERNEL_BITS) */
-			    puts ("hppa2.0"); break;
-		#endif
-			default: puts ("hppa1.0"); break;
-			}
-		    exit (0);
-		}
-EOF
-		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
-		    test -z "$HP_ARCH" && HP_ARCH=hppa
-		fi ;;
-	esac
-	if [ ${HP_ARCH} = "hppa2.0w" ]
-	then
-	    eval $set_cc_for_build
-
-	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
-	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
-	    # generating 64-bit code.  GNU and HP use different nomenclature:
-	    #
-	    # $ CC_FOR_BUILD=cc ./config.guess
-	    # => hppa2.0w-hp-hpux11.23
-	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
-	    # => hppa64-hp-hpux11.23
-
-	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
-		grep -q __LP64__
-	    then
-		HP_ARCH="hppa2.0w"
-	    else
-		HP_ARCH="hppa64"
-	    fi
-	fi
-	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
-	exit ;;
-    ia64:HP-UX:*:*)
-	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-	echo ia64-hp-hpux${HPUX_REV}
-	exit ;;
-    3050*:HI-UX:*:*)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#include <unistd.h>
-	int
-	main ()
-	{
-	  long cpu = sysconf (_SC_CPU_VERSION);
-	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
-	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
-	     results, however.  */
-	  if (CPU_IS_PA_RISC (cpu))
-	    {
-	      switch (cpu)
-		{
-		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
-		  default: puts ("hppa-hitachi-hiuxwe2"); break;
-		}
-	    }
-	  else if (CPU_IS_HP_MC68K (cpu))
-	    puts ("m68k-hitachi-hiuxwe2");
-	  else puts ("unknown-hitachi-hiuxwe2");
-	  exit (0);
-	}
-EOF
-	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
-		{ echo "$SYSTEM_NAME"; exit; }
-	echo unknown-hitachi-hiuxwe2
-	exit ;;
-    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
-	echo hppa1.1-hp-bsd
-	exit ;;
-    9000/8??:4.3bsd:*:*)
-	echo hppa1.0-hp-bsd
-	exit ;;
-    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
-	echo hppa1.0-hp-mpeix
-	exit ;;
-    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
-	echo hppa1.1-hp-osf
-	exit ;;
-    hp8??:OSF1:*:*)
-	echo hppa1.0-hp-osf
-	exit ;;
-    i*86:OSF1:*:*)
-	if [ -x /usr/sbin/sysversion ] ; then
-	    echo ${UNAME_MACHINE}-unknown-osf1mk
-	else
-	    echo ${UNAME_MACHINE}-unknown-osf1
-	fi
-	exit ;;
-    parisc*:Lites*:*:*)
-	echo hppa1.1-hp-lites
-	exit ;;
-    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
-	echo c1-convex-bsd
-	exit ;;
-    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
-	echo c34-convex-bsd
-	exit ;;
-    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
-	echo c38-convex-bsd
-	exit ;;
-    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
-	echo c4-convex-bsd
-	exit ;;
-    CRAY*Y-MP:*:*:*)
-	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*[A-Z]90:*:*:*)
-	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
-	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
-	      -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*TS:*:*:*)
-	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*T3E:*:*:*)
-	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*SV1:*:*:*)
-	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    *:UNICOS/mp:*:*)
-	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
-	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
-	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
-	FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
-	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    5000:UNIX_System_V:4.*:*)
-	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
-	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
-	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
-	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
-	exit ;;
-    sparc*:BSD/OS:*:*)
-	echo sparc-unknown-bsdi${UNAME_RELEASE}
-	exit ;;
-    *:BSD/OS:*:*)
-	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
-	exit ;;
-    *:FreeBSD:*:*)
-	UNAME_PROCESSOR=`/usr/bin/uname -p`
-	case ${UNAME_PROCESSOR} in
-	    amd64)
-		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
-	    *)
-		echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
-	esac
-	exit ;;
-    i*:CYGWIN*:*)
-	echo ${UNAME_MACHINE}-pc-cygwin
-	exit ;;
-    *:MINGW64*:*)
-	echo ${UNAME_MACHINE}-pc-mingw64
-	exit ;;
-    *:MINGW*:*)
-	echo ${UNAME_MACHINE}-pc-mingw32
-	exit ;;
-    i*:MSYS*:*)
-	echo ${UNAME_MACHINE}-pc-msys
-	exit ;;
-    i*:windows32*:*)
-	# uname -m includes "-pc" on this system.
-	echo ${UNAME_MACHINE}-mingw32
-	exit ;;
-    i*:PW*:*)
-	echo ${UNAME_MACHINE}-pc-pw32
-	exit ;;
-    *:Interix*:*)
-	case ${UNAME_MACHINE} in
-	    x86)
-		echo i586-pc-interix${UNAME_RELEASE}
-		exit ;;
-	    authenticamd | genuineintel | EM64T)
-		echo x86_64-unknown-interix${UNAME_RELEASE}
-		exit ;;
-	    IA64)
-		echo ia64-unknown-interix${UNAME_RELEASE}
-		exit ;;
-	esac ;;
-    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
-	echo i${UNAME_MACHINE}-pc-mks
-	exit ;;
-    8664:Windows_NT:*)
-	echo x86_64-pc-mks
-	exit ;;
-    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
-	# How do we know it's Interix rather than the generic POSIX subsystem?
-	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
-	# UNAME_MACHINE based on the output of uname instead of i386?
-	echo i586-pc-interix
-	exit ;;
-    i*:UWIN*:*)
-	echo ${UNAME_MACHINE}-pc-uwin
-	exit ;;
-    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
-	echo x86_64-unknown-cygwin
-	exit ;;
-    p*:CYGWIN*:*)
-	echo powerpcle-unknown-cygwin
-	exit ;;
-    prep*:SunOS:5.*:*)
-	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit ;;
-    *:GNU:*:*)
-	# the GNU system
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
-	exit ;;
-    *:GNU/*:*:*)
-	# other systems with GNU libc and userland
-	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
-	exit ;;
-    i*86:Minix:*:*)
-	echo ${UNAME_MACHINE}-pc-minix
-	exit ;;
-    aarch64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    aarch64_be:Linux:*:*)
-	UNAME_MACHINE=aarch64_be
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    alpha:Linux:*:*)
-	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
-	  EV5)   UNAME_MACHINE=alphaev5 ;;
-	  EV56)  UNAME_MACHINE=alphaev56 ;;
-	  PCA56) UNAME_MACHINE=alphapca56 ;;
-	  PCA57) UNAME_MACHINE=alphapca56 ;;
-	  EV6)   UNAME_MACHINE=alphaev6 ;;
-	  EV67)  UNAME_MACHINE=alphaev67 ;;
-	  EV68*) UNAME_MACHINE=alphaev68 ;;
-	esac
-	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
-	exit ;;
-    arm*:Linux:*:*)
-	eval $set_cc_for_build
-	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
-	    | grep -q __ARM_EABI__
-	then
-	    echo ${UNAME_MACHINE}-unknown-linux-gnu
-	else
-	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
-		| grep -q __ARM_PCS_VFP
-	    then
-		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
-	    else
-		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
-	    fi
-	fi
-	exit ;;
-    avr32*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    cris:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-gnu
-	exit ;;
-    crisv32:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-gnu
-	exit ;;
-    frv:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    hexagon:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    i*86:Linux:*:*)
-	LIBC=gnu
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#ifdef __dietlibc__
-	LIBC=dietlibc
-	#endif
-EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
-	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
-	exit ;;
-    ia64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    m32r*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    m68*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    mips:Linux:*:* | mips64:Linux:*:*)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#undef CPU
-	#undef ${UNAME_MACHINE}
-	#undef ${UNAME_MACHINE}el
-	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	CPU=${UNAME_MACHINE}el
-	#else
-	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	CPU=${UNAME_MACHINE}
-	#else
-	CPU=
-	#endif
-	#endif
-EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
-	;;
-    or32:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    padre:Linux:*:*)
-	echo sparc-unknown-linux-gnu
-	exit ;;
-    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-gnu
-	exit ;;
-    parisc:Linux:*:* | hppa:Linux:*:*)
-	# Look for CPU level
-	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
-	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
-	  *)    echo hppa-unknown-linux-gnu ;;
-	esac
-	exit ;;
-    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-gnu
-	exit ;;
-    ppc64le:Linux:*:*)
-	echo powerpc64le-unknown-linux-gnu
-	exit ;;
-    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-gnu
-	exit ;;
-    s390:Linux:*:* | s390x:Linux:*:*)
-	echo ${UNAME_MACHINE}-ibm-linux
-	exit ;;
-    sh64*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    sh*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    vax:Linux:*:*)
-	echo ${UNAME_MACHINE}-dec-linux-gnu
-	exit ;;
-    x86_64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    xtensa*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
-	exit ;;
-    i*86:DYNIX/ptx:4*:*)
-	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
-	# earlier versions are messed up and put the nodename in both
-	# sysname and nodename.
-	echo i386-sequent-sysv4
-	exit ;;
-    i*86:UNIX_SV:4.2MP:2.*)
-	# Unixware is an offshoot of SVR4, but it has its own version
-	# number series starting with 2...
-	# I am not positive that other SVR4 systems won't match this,
-	# I just have to hope.  -- rms.
-	# Use sysv4.2uw... so that sysv4* matches it.
-	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
-	exit ;;
-    i*86:OS/2:*:*)
-	# If we were able to find `uname', then EMX Unix compatibility
-	# is probably installed.
-	echo ${UNAME_MACHINE}-pc-os2-emx
-	exit ;;
-    i*86:XTS-300:*:STOP)
-	echo ${UNAME_MACHINE}-unknown-stop
-	exit ;;
-    i*86:atheos:*:*)
-	echo ${UNAME_MACHINE}-unknown-atheos
-	exit ;;
-    i*86:syllable:*:*)
-	echo ${UNAME_MACHINE}-pc-syllable
-	exit ;;
-    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
-	echo i386-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    i*86:*DOS:*:*)
-	echo ${UNAME_MACHINE}-pc-msdosdjgpp
-	exit ;;
-    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
-	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
-	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
-	else
-		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
-	fi
-	exit ;;
-    i*86:*:5:[678]*)
-	# UnixWare 7.x, OpenUNIX and OpenServer 6.
-	case `/bin/uname -X | grep "^Machine"` in
-	    *486*)	     UNAME_MACHINE=i486 ;;
-	    *Pentium)	     UNAME_MACHINE=i586 ;;
-	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
-	esac
-	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
-	exit ;;
-    i*86:*:3.2:*)
-	if test -f /usr/options/cb.name; then
-		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
-		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
-	elif /bin/uname -X 2>/dev/null >/dev/null ; then
-		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
-		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
-		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
-			&& UNAME_MACHINE=i586
-		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
-	else
-		echo ${UNAME_MACHINE}-pc-sysv32
-	fi
-	exit ;;
-    pc:*:*:*)
-	# Left here for compatibility:
-	# uname -m prints for DJGPP always 'pc', but it prints nothing about
-	# the processor, so we play safe by assuming i586.
-	# Note: whatever this is, it MUST be the same as what config.sub
-	# prints for the "djgpp" host, or else GDB configury will decide that
-	# this is a cross-build.
-	echo i586-pc-msdosdjgpp
-	exit ;;
-    Intel:Mach:3*:*)
-	echo i386-pc-mach3
-	exit ;;
-    paragon:*:*:*)
-	echo i860-intel-osf1
-	exit ;;
-    i860:*:4.*:*) # i860-SVR4
-	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
-	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
-	else # Add other i860-SVR4 vendors below as they are discovered.
-	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
-	fi
-	exit ;;
-    mini*:CTIX:SYS*5:*)
-	# "miniframe"
-	echo m68010-convergent-sysv
-	exit ;;
-    mc68k:UNIX:SYSTEM5:3.51m)
-	echo m68k-convergent-sysv
-	exit ;;
-    M680?0:D-NIX:5.3:*)
-	echo m68k-diab-dnix
-	exit ;;
-    M68*:*:R3V[5678]*:*)
-	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
-    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
-	OS_REL=''
-	test -r /etc/.relid \
-	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
-    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4; exit; } ;;
-    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
-	OS_REL='.3'
-	test -r /etc/.relid \
-	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	    && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
-	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
-    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
-	echo m68k-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    mc68030:UNIX_System_V:4.*:*)
-	echo m68k-atari-sysv4
-	exit ;;
-    TSUNAMI:LynxOS:2.*:*)
-	echo sparc-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    rs6000:LynxOS:2.*:*)
-	echo rs6000-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
-	echo powerpc-unknown-lynxos${UNAME_RELEASE}
-	exit ;;
-    SM[BE]S:UNIX_SV:*:*)
-	echo mips-dde-sysv${UNAME_RELEASE}
-	exit ;;
-    RM*:ReliantUNIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    RM*:SINIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    *:SINIX-*:*:*)
-	if uname -p 2>/dev/null >/dev/null ; then
-		UNAME_MACHINE=`(uname -p) 2>/dev/null`
-		echo ${UNAME_MACHINE}-sni-sysv4
-	else
-		echo ns32k-sni-sysv
-	fi
-	exit ;;
-    PENTIUM:*:4.0*:*)	# Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-			# says <Richard.M.Bartel@ccMail.Census.GOV>
-	echo i586-unisys-sysv4
-	exit ;;
-    *:UNIX_System_V:4*:FTX*)
-	# From Gerald Hewes <hewes@openmarket.com>.
-	# How about differentiating between stratus architectures? -djm
-	echo hppa1.1-stratus-sysv4
-	exit ;;
-    *:*:*:FTX*)
-	# From seanf@swdc.stratus.com.
-	echo i860-stratus-sysv4
-	exit ;;
-    i*86:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo ${UNAME_MACHINE}-stratus-vos
-	exit ;;
-    *:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo hppa1.1-stratus-vos
-	exit ;;
-    mc68*:A/UX:*:*)
-	echo m68k-apple-aux${UNAME_RELEASE}
-	exit ;;
-    news*:NEWS-OS:6*:*)
-	echo mips-sony-newsos6
-	exit ;;
-    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
-	if [ -d /usr/nec ]; then
-		echo mips-nec-sysv${UNAME_RELEASE}
-	else
-		echo mips-unknown-sysv${UNAME_RELEASE}
-	fi
-	exit ;;
-    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
-	echo powerpc-be-beos
-	exit ;;
-    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
-	echo powerpc-apple-beos
-	exit ;;
-    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
-	echo i586-pc-beos
-	exit ;;
-    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
-	echo i586-pc-haiku
-	exit ;;
-    x86_64:Haiku:*:*)
-	echo x86_64-unknown-haiku
-	exit ;;
-    SX-4:SUPER-UX:*:*)
-	echo sx4-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-5:SUPER-UX:*:*)
-	echo sx5-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-6:SUPER-UX:*:*)
-	echo sx6-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-7:SUPER-UX:*:*)
-	echo sx7-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-8:SUPER-UX:*:*)
-	echo sx8-nec-superux${UNAME_RELEASE}
-	exit ;;
-    SX-8R:SUPER-UX:*:*)
-	echo sx8r-nec-superux${UNAME_RELEASE}
-	exit ;;
-    Power*:Rhapsody:*:*)
-	echo powerpc-apple-rhapsody${UNAME_RELEASE}
-	exit ;;
-    *:Rhapsody:*:*)
-	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
-	exit ;;
-    *:Darwin:*:*)
-	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
-	case $UNAME_PROCESSOR in
-	    i386)
-		eval $set_cc_for_build
-		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
-		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
-		      grep IS_64BIT_ARCH >/dev/null
-		  then
-		      UNAME_PROCESSOR="x86_64"
-		  fi
-		fi ;;
-	    unknown) UNAME_PROCESSOR=powerpc ;;
-	esac
-	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
-	exit ;;
-    *:procnto*:*:* | *:QNX:[0123456789]*:*)
-	UNAME_PROCESSOR=`uname -p`
-	if test "$UNAME_PROCESSOR" = "x86"; then
-		UNAME_PROCESSOR=i386
-		UNAME_MACHINE=pc
-	fi
-	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
-	exit ;;
-    *:QNX:*:4*)
-	echo i386-pc-qnx
-	exit ;;
-    NEO-?:NONSTOP_KERNEL:*:*)
-	echo neo-tandem-nsk${UNAME_RELEASE}
-	exit ;;
-    NSE-*:NONSTOP_KERNEL:*:*)
-	echo nse-tandem-nsk${UNAME_RELEASE}
-	exit ;;
-    NSR-?:NONSTOP_KERNEL:*:*)
-	echo nsr-tandem-nsk${UNAME_RELEASE}
-	exit ;;
-    *:NonStop-UX:*:*)
-	echo mips-compaq-nonstopux
-	exit ;;
-    BS2000:POSIX*:*:*)
-	echo bs2000-siemens-sysv
-	exit ;;
-    DS/*:UNIX_System_V:*:*)
-	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
-	exit ;;
-    *:Plan9:*:*)
-	# "uname -m" is not consistent, so use $cputype instead. 386
-	# is converted to i386 for consistency with other x86
-	# operating systems.
-	if test "$cputype" = "386"; then
-	    UNAME_MACHINE=i386
-	else
-	    UNAME_MACHINE="$cputype"
-	fi
-	echo ${UNAME_MACHINE}-unknown-plan9
-	exit ;;
-    *:TOPS-10:*:*)
-	echo pdp10-unknown-tops10
-	exit ;;
-    *:TENEX:*:*)
-	echo pdp10-unknown-tenex
-	exit ;;
-    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
-	echo pdp10-dec-tops20
-	exit ;;
-    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
-	echo pdp10-xkl-tops20
-	exit ;;
-    *:TOPS-20:*:*)
-	echo pdp10-unknown-tops20
-	exit ;;
-    *:ITS:*:*)
-	echo pdp10-unknown-its
-	exit ;;
-    SEI:*:*:SEIUX)
-	echo mips-sei-seiux${UNAME_RELEASE}
-	exit ;;
-    *:DragonFly:*:*)
-	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
-	exit ;;
-    *:*VMS:*:*)
-	UNAME_MACHINE=`(uname -p) 2>/dev/null`
-	case "${UNAME_MACHINE}" in
-	    A*) echo alpha-dec-vms ; exit ;;
-	    I*) echo ia64-dec-vms ; exit ;;
-	    V*) echo vax-dec-vms ; exit ;;
-	esac ;;
-    *:XENIX:*:SysV)
-	echo i386-pc-xenix
-	exit ;;
-    i*86:skyos:*:*)
-	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
-	exit ;;
-    i*86:rdos:*:*)
-	echo ${UNAME_MACHINE}-pc-rdos
-	exit ;;
-    i*86:AROS:*:*)
-	echo ${UNAME_MACHINE}-pc-aros
-	exit ;;
-    x86_64:VMkernel:*:*)
-	echo ${UNAME_MACHINE}-unknown-esx
-	exit ;;
-esac
-
-eval $set_cc_for_build
-cat >$dummy.c <<EOF
-#ifdef _SEQUENT_
-# include <sys/types.h>
-# include <sys/utsname.h>
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-	"4"
-#else
-	""
-#endif
-	); exit (0);
-#endif
-#endif
-
-#if defined (__arm) && defined (__acorn) && defined (__unix)
-  printf ("arm-acorn-riscix\n"); exit (0);
-#endif
-
-#if defined (hp300) && !defined (hpux)
-  printf ("m68k-hp-bsd\n"); exit (0);
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
-  if (version < 4)
-    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  else
-    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-    struct utsname un;
-
-    uname(&un);
-
-    if (strncmp(un.version, "V2", 2) == 0) {
-	printf ("i386-sequent-ptx2\n"); exit (0);
-    }
-    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-	printf ("i386-sequent-ptx1\n"); exit (0);
-    }
-    printf ("i386-sequent-ptx\n"); exit (0);
-
-#endif
-
-#if defined (vax)
-# if !defined (ultrix)
-#  include <sys/param.h>
-#  if defined (BSD)
-#   if BSD == 43
-      printf ("vax-dec-bsd4.3\n"); exit (0);
-#   else
-#    if BSD == 199006
-      printf ("vax-dec-bsd4.3reno\n"); exit (0);
-#    else
-      printf ("vax-dec-bsd\n"); exit (0);
-#    endif
-#   endif
-#  else
-    printf ("vax-dec-bsd\n"); exit (0);
-#  endif
-# else
-    printf ("vax-dec-ultrix\n"); exit (0);
-# endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
-	{ echo "$SYSTEM_NAME"; exit; }
-
-# Apollos put the system type in the environment.
-
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
-
-# Convex versions that predate uname can use getsysinfo(1)
-
-if [ -x /usr/convex/getsysinfo ]
-then
-    case `getsysinfo -f cpu_type` in
-    c1*)
-	echo c1-convex-bsd
-	exit ;;
-    c2*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    c34*)
-	echo c34-convex-bsd
-	exit ;;
-    c38*)
-	echo c38-convex-bsd
-	exit ;;
-    c4*)
-	echo c4-convex-bsd
-	exit ;;
-    esac
-fi
-
-cat >&2 <<EOF
-$0: unable to guess system type
-
-This script, last modified $timestamp, has failed to recognize
-the operating system you are using. It is advised that you
-download the most up to date version of the config scripts from
-
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
-and
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
-
-If the version you run ($0) is already up to date, please
-send the following data and any information you think might be
-pertinent to <config-patches@gnu.org> in order to provide the needed
-information to handle your system.
-
-config.guess timestamp = $timestamp
-
-uname -m = `(uname -m) 2>/dev/null || echo unknown`
-uname -r = `(uname -r) 2>/dev/null || echo unknown`
-uname -s = `(uname -s) 2>/dev/null || echo unknown`
-uname -v = `(uname -v) 2>/dev/null || echo unknown`
-
-/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
-/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
-
-hostinfo               = `(hostinfo) 2>/dev/null`
-/bin/universe          = `(/bin/universe) 2>/dev/null`
-/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
-/bin/arch              = `(/bin/arch) 2>/dev/null`
-/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
-/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
-
-UNAME_MACHINE = ${UNAME_MACHINE}
-UNAME_RELEASE = ${UNAME_RELEASE}
-UNAME_SYSTEM  = ${UNAME_SYSTEM}
-UNAME_VERSION = ${UNAME_VERSION}
-EOF
-
-exit 1
-
-# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/android/src/main/libenc/jni/libx264/config.sub b/android/src/main/libenc/jni/libx264/config.sub
deleted file mode 100755
index 8df5511..0000000
--- a/android/src/main/libenc/jni/libx264/config.sub
+++ /dev/null
@@ -1,1793 +0,0 @@
-#! /bin/sh
-# Configuration validation subroutine script.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011, 2012 Free Software Foundation, Inc.
-
-timestamp='2012-12-06'
-
-# This file is (in principle) common to ALL GNU software.
-# The presence of a machine in this file suggests that SOME GNU software
-# can handle that machine.  It does not imply ALL GNU software can.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-
-# Please send patches to <config-patches@gnu.org>.  Submit a context
-# diff and a properly formatted GNU ChangeLog entry.
-#
-# Configuration subroutine to validate and canonicalize a configuration type.
-# Supply the specified configuration type as an argument.
-# If it is invalid, we print an error message on stderr and exit with code 1.
-# Otherwise, we print the canonical config type on stdout and succeed.
-
-# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
-
-# This file is supposed to be the same for all GNU packages
-# and recognize all the CPU types, system types and aliases
-# that are meaningful with *any* GNU software.
-# Each package is responsible for reporting which valid configurations
-# it does not support.  The user should be able to distinguish
-# a failure to support a valid configuration from a meaningless
-# configuration.
-
-# The goal of this file is to map all the various variations of a given
-# machine specification into a single specification in the form:
-#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
-# or in some cases, the newer four-part form:
-#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
-# It is wrong to echo any other type of specification.
-
-me=`echo "$0" | sed -e 's,.*/,,'`
-
-usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS
-       $0 [OPTION] ALIAS
-
-Canonicalize a configuration name.
-
-Operation modes:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.sub ($timestamp)
-
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-Free Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help"
-       exit 1 ;;
-
-    *local*)
-       # First pass through any local machine types.
-       echo $1
-       exit ;;
-
-    * )
-       break ;;
-  esac
-done
-
-case $# in
- 0) echo "$me: missing argument$help" >&2
-    exit 1;;
- 1) ;;
- *) echo "$me: too many arguments$help" >&2
-    exit 1;;
-esac
-
-# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
-# Here we must recognize all the valid KERNEL-OS combinations.
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
-case $maybe_os in
-  nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
-  linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | \
-  kopensolaris*-gnu* | \
-  storm-chaos* | os2-emx* | rtmk-nova*)
-    os=-$maybe_os
-    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
-    ;;
-  android-linux)
-    os=-linux-android
-    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown
-    ;;
-  *)
-    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
-    if [ $basic_machine != $1 ]
-    then os=`echo $1 | sed 's/.*-/-/'`
-    else os=; fi
-    ;;
-esac
-
-### Let's recognize common machines as not being operating systems so
-### that things like config.sub decstation-3100 work.  We also
-### recognize some manufacturers as not being operating systems, so we
-### can provide default operating systems below.
-case $os in
-	-sun*os*)
-		# Prevent following clause from handling this invalid input.
-		;;
-	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
-	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
-	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
-	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple | -axis | -knuth | -cray | -microblaze*)
-		os=
-		basic_machine=$1
-		;;
-	-bluegene*)
-		os=-cnk
-		;;
-	-sim | -cisco | -oki | -wec | -winbond)
-		os=
-		basic_machine=$1
-		;;
-	-scout)
-		;;
-	-wrs)
-		os=-vxworks
-		basic_machine=$1
-		;;
-	-chorusos*)
-		os=-chorusos
-		basic_machine=$1
-		;;
-	-chorusrdb)
-		os=-chorusrdb
-		basic_machine=$1
-		;;
-	-hiux*)
-		os=-hiuxwe2
-		;;
-	-sco6)
-		os=-sco5v6
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco5)
-		os=-sco3.2v5
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco4)
-		os=-sco3.2v4
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco3.2.[4-9]*)
-		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco3.2v[4-9]*)
-		# Don't forget version if it is 3.2v4 or newer.
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco5v6*)
-		# Don't forget version if it is 3.2v4 or newer.
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco*)
-		os=-sco3.2v2
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-udk*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-isc)
-		os=-isc2.2
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-clix*)
-		basic_machine=clipper-intergraph
-		;;
-	-isc*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-lynx*178)
-		os=-lynxos178
-		;;
-	-lynx*5)
-		os=-lynxos5
-		;;
-	-lynx*)
-		os=-lynxos
-		;;
-	-ptx*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
-		;;
-	-windowsnt*)
-		os=`echo $os | sed -e 's/windowsnt/winnt/'`
-		;;
-	-psos*)
-		os=-psos
-		;;
-	-mint | -mint[0-9]*)
-		basic_machine=m68k-atari
-		os=-mint
-		;;
-esac
-
-# Decode aliases for certain CPU-COMPANY combinations.
-case $basic_machine in
-	# Recognize the basic CPU types without company name.
-	# Some are omitted here because they have special meanings below.
-	1750a | 580 \
-	| a29k \
-	| aarch64 | aarch64_be \
-	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
-	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
-	| am33_2.0 \
-	| arc \
-	| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
-	| avr | avr32 \
-	| be32 | be64 \
-	| bfin \
-	| c4x | clipper \
-	| d10v | d30v | dlx | dsp16xx \
-	| epiphany \
-	| fido | fr30 | frv \
-	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
-	| hexagon \
-	| i370 | i860 | i960 | ia64 \
-	| ip2k | iq2000 \
-	| le32 | le64 \
-	| lm32 \
-	| m32c | m32r | m32rle | m68000 | m68k | m88k \
-	| maxq | mb | microblaze | microblazeel | mcore | mep | metag \
-	| mips | mipsbe | mipseb | mipsel | mipsle \
-	| mips16 \
-	| mips64 | mips64el \
-	| mips64octeon | mips64octeonel \
-	| mips64orion | mips64orionel \
-	| mips64r5900 | mips64r5900el \
-	| mips64vr | mips64vrel \
-	| mips64vr4100 | mips64vr4100el \
-	| mips64vr4300 | mips64vr4300el \
-	| mips64vr5000 | mips64vr5000el \
-	| mips64vr5900 | mips64vr5900el \
-	| mipsisa32 | mipsisa32el \
-	| mipsisa32r2 | mipsisa32r2el \
-	| mipsisa64 | mipsisa64el \
-	| mipsisa64r2 | mipsisa64r2el \
-	| mipsisa64sb1 | mipsisa64sb1el \
-	| mipsisa64sr71k | mipsisa64sr71kel \
-	| mipstx39 | mipstx39el \
-	| mn10200 | mn10300 \
-	| moxie \
-	| mt \
-	| msp430 \
-	| nds32 | nds32le | nds32be \
-	| nios | nios2 \
-	| ns16k | ns32k \
-	| open8 \
-	| or32 \
-	| pdp10 | pdp11 | pj | pjl \
-	| powerpc | powerpc64 | powerpc64le | powerpcle \
-	| pyramid \
-	| rl78 | rx \
-	| score \
-	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
-	| sh64 | sh64le \
-	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
-	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
-	| spu \
-	| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
-	| ubicom32 \
-	| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
-	| we32k \
-	| x86 | xc16x | xstormy16 | xtensa \
-	| z8k | z80)
-		basic_machine=$basic_machine-unknown
-		;;
-	c54x)
-		basic_machine=tic54x-unknown
-		;;
-	c55x)
-		basic_machine=tic55x-unknown
-		;;
-	c6x)
-		basic_machine=tic6x-unknown
-		;;
-	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
-		basic_machine=$basic_machine-unknown
-		os=-none
-		;;
-	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
-		;;
-	ms1)
-		basic_machine=mt-unknown
-		;;
-
-	strongarm | thumb | xscale)
-		basic_machine=arm-unknown
-		;;
-	xgate)
-		basic_machine=$basic_machine-unknown
-		os=-none
-		;;
-	xscaleeb)
-		basic_machine=armeb-unknown
-		;;
-
-	xscaleel)
-		basic_machine=armel-unknown
-		;;
-
-	# We use `pc' rather than `unknown'
-	# because (1) that's what they normally are, and
-	# (2) the word "unknown" tends to confuse beginning users.
-	i*86 | x86_64)
-	  basic_machine=$basic_machine-pc
-	  ;;
-	# Object if more than one company name word.
-	*-*-*)
-		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-		exit 1
-		;;
-	# Recognize the basic CPU types with company name.
-	580-* \
-	| a29k-* \
-	| aarch64-* | aarch64_be-* \
-	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
-	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
-	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
-	| avr-* | avr32-* \
-	| be32-* | be64-* \
-	| bfin-* | bs2000-* \
-	| c[123]* | c30-* | [cjt]90-* | c4x-* \
-	| clipper-* | craynv-* | cydra-* \
-	| d10v-* | d30v-* | dlx-* \
-	| elxsi-* \
-	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
-	| h8300-* | h8500-* \
-	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
-	| hexagon-* \
-	| i*86-* | i860-* | i960-* | ia64-* \
-	| ip2k-* | iq2000-* \
-	| le32-* | le64-* \
-	| lm32-* \
-	| m32c-* | m32r-* | m32rle-* \
-	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
-	| microblaze-* | microblazeel-* \
-	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
-	| mips16-* \
-	| mips64-* | mips64el-* \
-	| mips64octeon-* | mips64octeonel-* \
-	| mips64orion-* | mips64orionel-* \
-	| mips64r5900-* | mips64r5900el-* \
-	| mips64vr-* | mips64vrel-* \
-	| mips64vr4100-* | mips64vr4100el-* \
-	| mips64vr4300-* | mips64vr4300el-* \
-	| mips64vr5000-* | mips64vr5000el-* \
-	| mips64vr5900-* | mips64vr5900el-* \
-	| mipsisa32-* | mipsisa32el-* \
-	| mipsisa32r2-* | mipsisa32r2el-* \
-	| mipsisa64-* | mipsisa64el-* \
-	| mipsisa64r2-* | mipsisa64r2el-* \
-	| mipsisa64sb1-* | mipsisa64sb1el-* \
-	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
-	| mipstx39-* | mipstx39el-* \
-	| mmix-* \
-	| mt-* \
-	| msp430-* \
-	| nds32-* | nds32le-* | nds32be-* \
-	| nios-* | nios2-* \
-	| none-* | np1-* | ns16k-* | ns32k-* \
-	| open8-* \
-	| orion-* \
-	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
-	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
-	| pyramid-* \
-	| rl78-* | romp-* | rs6000-* | rx-* \
-	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
-	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
-	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
-	| sparclite-* \
-	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
-	| tahoe-* \
-	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
-	| tile*-* \
-	| tron-* \
-	| ubicom32-* \
-	| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
-	| vax-* \
-	| we32k-* \
-	| x86-* | x86_64-* | xc16x-* | xps100-* \
-	| xstormy16-* | xtensa*-* \
-	| ymp-* \
-	| z8k-* | z80-*)
-		;;
-	# Recognize the basic CPU types without company name, with glob match.
-	xtensa*)
-		basic_machine=$basic_machine-unknown
-		;;
-	# Recognize the various machine names and aliases which stand
-	# for a CPU type and a company and sometimes even an OS.
-	386bsd)
-		basic_machine=i386-unknown
-		os=-bsd
-		;;
-	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
-		basic_machine=m68000-att
-		;;
-	3b*)
-		basic_machine=we32k-att
-		;;
-	a29khif)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-	abacus)
-		basic_machine=abacus-unknown
-		;;
-	adobe68k)
-		basic_machine=m68010-adobe
-		os=-scout
-		;;
-	alliant | fx80)
-		basic_machine=fx80-alliant
-		;;
-	altos | altos3068)
-		basic_machine=m68k-altos
-		;;
-	am29k)
-		basic_machine=a29k-none
-		os=-bsd
-		;;
-	amd64)
-		basic_machine=x86_64-pc
-		;;
-	amd64-*)
-		basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	amdahl)
-		basic_machine=580-amdahl
-		os=-sysv
-		;;
-	amiga | amiga-*)
-		basic_machine=m68k-unknown
-		;;
-	amigaos | amigados)
-		basic_machine=m68k-unknown
-		os=-amigaos
-		;;
-	amigaunix | amix)
-		basic_machine=m68k-unknown
-		os=-sysv4
-		;;
-	apollo68)
-		basic_machine=m68k-apollo
-		os=-sysv
-		;;
-	apollo68bsd)
-		basic_machine=m68k-apollo
-		os=-bsd
-		;;
-	aros)
-		basic_machine=i386-pc
-		os=-aros
-		;;
-	aux)
-		basic_machine=m68k-apple
-		os=-aux
-		;;
-	balance)
-		basic_machine=ns32k-sequent
-		os=-dynix
-		;;
-	blackfin)
-		basic_machine=bfin-unknown
-		os=-linux
-		;;
-	blackfin-*)
-		basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
-		os=-linux
-		;;
-	bluegene*)
-		basic_machine=powerpc-ibm
-		os=-cnk
-		;;
-	c54x-*)
-		basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	c55x-*)
-		basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	c6x-*)
-		basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	c90)
-		basic_machine=c90-cray
-		os=-unicos
-		;;
-	cegcc)
-		basic_machine=arm-unknown
-		os=-cegcc
-		;;
-	convex-c1)
-		basic_machine=c1-convex
-		os=-bsd
-		;;
-	convex-c2)
-		basic_machine=c2-convex
-		os=-bsd
-		;;
-	convex-c32)
-		basic_machine=c32-convex
-		os=-bsd
-		;;
-	convex-c34)
-		basic_machine=c34-convex
-		os=-bsd
-		;;
-	convex-c38)
-		basic_machine=c38-convex
-		os=-bsd
-		;;
-	cray | j90)
-		basic_machine=j90-cray
-		os=-unicos
-		;;
-	craynv)
-		basic_machine=craynv-cray
-		os=-unicosmp
-		;;
-	cr16 | cr16-*)
-		basic_machine=cr16-unknown
-		os=-elf
-		;;
-	crds | unos)
-		basic_machine=m68k-crds
-		;;
-	crisv32 | crisv32-* | etraxfs*)
-		basic_machine=crisv32-axis
-		;;
-	cris | cris-* | etrax*)
-		basic_machine=cris-axis
-		;;
-	crx)
-		basic_machine=crx-unknown
-		os=-elf
-		;;
-	da30 | da30-*)
-		basic_machine=m68k-da30
-		;;
-	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
-		basic_machine=mips-dec
-		;;
-	decsystem10* | dec10*)
-		basic_machine=pdp10-dec
-		os=-tops10
-		;;
-	decsystem20* | dec20*)
-		basic_machine=pdp10-dec
-		os=-tops20
-		;;
-	delta | 3300 | motorola-3300 | motorola-delta \
-	      | 3300-motorola | delta-motorola)
-		basic_machine=m68k-motorola
-		;;
-	delta88)
-		basic_machine=m88k-motorola
-		os=-sysv3
-		;;
-	dicos)
-		basic_machine=i686-pc
-		os=-dicos
-		;;
-	djgpp)
-		basic_machine=i586-pc
-		os=-msdosdjgpp
-		;;
-	dpx20 | dpx20-*)
-		basic_machine=rs6000-bull
-		os=-bosx
-		;;
-	dpx2* | dpx2*-bull)
-		basic_machine=m68k-bull
-		os=-sysv3
-		;;
-	ebmon29k)
-		basic_machine=a29k-amd
-		os=-ebmon
-		;;
-	elxsi)
-		basic_machine=elxsi-elxsi
-		os=-bsd
-		;;
-	encore | umax | mmax)
-		basic_machine=ns32k-encore
-		;;
-	es1800 | OSE68k | ose68k | ose | OSE)
-		basic_machine=m68k-ericsson
-		os=-ose
-		;;
-	fx2800)
-		basic_machine=i860-alliant
-		;;
-	genix)
-		basic_machine=ns32k-ns
-		;;
-	gmicro)
-		basic_machine=tron-gmicro
-		os=-sysv
-		;;
-	go32)
-		basic_machine=i386-pc
-		os=-go32
-		;;
-	h3050r* | hiux*)
-		basic_machine=hppa1.1-hitachi
-		os=-hiuxwe2
-		;;
-	h8300hms)
-		basic_machine=h8300-hitachi
-		os=-hms
-		;;
-	h8300xray)
-		basic_machine=h8300-hitachi
-		os=-xray
-		;;
-	h8500hms)
-		basic_machine=h8500-hitachi
-		os=-hms
-		;;
-	harris)
-		basic_machine=m88k-harris
-		os=-sysv3
-		;;
-	hp300-*)
-		basic_machine=m68k-hp
-		;;
-	hp300bsd)
-		basic_machine=m68k-hp
-		os=-bsd
-		;;
-	hp300hpux)
-		basic_machine=m68k-hp
-		os=-hpux
-		;;
-	hp3k9[0-9][0-9] | hp9[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hp9k2[0-9][0-9] | hp9k31[0-9])
-		basic_machine=m68000-hp
-		;;
-	hp9k3[2-9][0-9])
-		basic_machine=m68k-hp
-		;;
-	hp9k6[0-9][0-9] | hp6[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hp9k7[0-79][0-9] | hp7[0-79][0-9])
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k78[0-9] | hp78[0-9])
-		# FIXME: really hppa2.0-hp
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
-		# FIXME: really hppa2.0-hp
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[0-9][13679] | hp8[0-9][13679])
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[0-9][0-9] | hp8[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hppa-next)
-		os=-nextstep3
-		;;
-	hppaosf)
-		basic_machine=hppa1.1-hp
-		os=-osf
-		;;
-	hppro)
-		basic_machine=hppa1.1-hp
-		os=-proelf
-		;;
-	i370-ibm* | ibm*)
-		basic_machine=i370-ibm
-		;;
-	i*86v32)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv32
-		;;
-	i*86v4*)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv4
-		;;
-	i*86v)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv
-		;;
-	i*86sol2)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-solaris2
-		;;
-	i386mach)
-		basic_machine=i386-mach
-		os=-mach
-		;;
-	i386-vsta | vsta)
-		basic_machine=i386-unknown
-		os=-vsta
-		;;
-	iris | iris4d)
-		basic_machine=mips-sgi
-		case $os in
-		    -irix*)
-			;;
-		    *)
-			os=-irix4
-			;;
-		esac
-		;;
-	isi68 | isi)
-		basic_machine=m68k-isi
-		os=-sysv
-		;;
-	m68knommu)
-		basic_machine=m68k-unknown
-		os=-linux
-		;;
-	m68knommu-*)
-		basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
-		os=-linux
-		;;
-	m88k-omron*)
-		basic_machine=m88k-omron
-		;;
-	magnum | m3230)
-		basic_machine=mips-mips
-		os=-sysv
-		;;
-	merlin)
-		basic_machine=ns32k-utek
-		os=-sysv
-		;;
-	microblaze*)
-		basic_machine=microblaze-xilinx
-		;;
-	mingw64)
-		basic_machine=x86_64-pc
-		os=-mingw64
-		;;
-	mingw32)
-		basic_machine=i386-pc
-		os=-mingw32
-		;;
-	mingw32ce)
-		basic_machine=arm-unknown
-		os=-mingw32ce
-		;;
-	miniframe)
-		basic_machine=m68000-convergent
-		;;
-	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
-		basic_machine=m68k-atari
-		os=-mint
-		;;
-	mips3*-*)
-		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
-		;;
-	mips3*)
-		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
-		;;
-	monitor)
-		basic_machine=m68k-rom68k
-		os=-coff
-		;;
-	morphos)
-		basic_machine=powerpc-unknown
-		os=-morphos
-		;;
-	msdos)
-		basic_machine=i386-pc
-		os=-msdos
-		;;
-	ms1-*)
-		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
-		;;
-	msys)
-		basic_machine=i386-pc
-		os=-msys
-		;;
-	mvs)
-		basic_machine=i370-ibm
-		os=-mvs
-		;;
-	nacl)
-		basic_machine=le32-unknown
-		os=-nacl
-		;;
-	ncr3000)
-		basic_machine=i486-ncr
-		os=-sysv4
-		;;
-	netbsd386)
-		basic_machine=i386-unknown
-		os=-netbsd
-		;;
-	netwinder)
-		basic_machine=armv4l-rebel
-		os=-linux
-		;;
-	news | news700 | news800 | news900)
-		basic_machine=m68k-sony
-		os=-newsos
-		;;
-	news1000)
-		basic_machine=m68030-sony
-		os=-newsos
-		;;
-	news-3600 | risc-news)
-		basic_machine=mips-sony
-		os=-newsos
-		;;
-	necv70)
-		basic_machine=v70-nec
-		os=-sysv
-		;;
-	next | m*-next )
-		basic_machine=m68k-next
-		case $os in
-		    -nextstep* )
-			;;
-		    -ns2*)
-		      os=-nextstep2
-			;;
-		    *)
-		      os=-nextstep3
-			;;
-		esac
-		;;
-	nh3000)
-		basic_machine=m68k-harris
-		os=-cxux
-		;;
-	nh[45]000)
-		basic_machine=m88k-harris
-		os=-cxux
-		;;
-	nindy960)
-		basic_machine=i960-intel
-		os=-nindy
-		;;
-	mon960)
-		basic_machine=i960-intel
-		os=-mon960
-		;;
-	nonstopux)
-		basic_machine=mips-compaq
-		os=-nonstopux
-		;;
-	np1)
-		basic_machine=np1-gould
-		;;
-	neo-tandem)
-		basic_machine=neo-tandem
-		;;
-	nse-tandem)
-		basic_machine=nse-tandem
-		;;
-	nsr-tandem)
-		basic_machine=nsr-tandem
-		;;
-	op50n-* | op60c-*)
-		basic_machine=hppa1.1-oki
-		os=-proelf
-		;;
-	openrisc | openrisc-*)
-		basic_machine=or32-unknown
-		;;
-	os400)
-		basic_machine=powerpc-ibm
-		os=-os400
-		;;
-	OSE68000 | ose68000)
-		basic_machine=m68000-ericsson
-		os=-ose
-		;;
-	os68k)
-		basic_machine=m68k-none
-		os=-os68k
-		;;
-	pa-hitachi)
-		basic_machine=hppa1.1-hitachi
-		os=-hiuxwe2
-		;;
-	paragon)
-		basic_machine=i860-intel
-		os=-osf
-		;;
-	parisc)
-		basic_machine=hppa-unknown
-		os=-linux
-		;;
-	parisc-*)
-		basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
-		os=-linux
-		;;
-	pbd)
-		basic_machine=sparc-tti
-		;;
-	pbb)
-		basic_machine=m68k-tti
-		;;
-	pc532 | pc532-*)
-		basic_machine=ns32k-pc532
-		;;
-	pc98)
-		basic_machine=i386-pc
-		;;
-	pc98-*)
-		basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentium | p5 | k5 | k6 | nexgen | viac3)
-		basic_machine=i586-pc
-		;;
-	pentiumpro | p6 | 6x86 | athlon | athlon_*)
-		basic_machine=i686-pc
-		;;
-	pentiumii | pentium2 | pentiumiii | pentium3)
-		basic_machine=i686-pc
-		;;
-	pentium4)
-		basic_machine=i786-pc
-		;;
-	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
-		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentiumpro-* | p6-* | 6x86-* | athlon-*)
-		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
-		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentium4-*)
-		basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pn)
-		basic_machine=pn-gould
-		;;
-	power)	basic_machine=power-ibm
-		;;
-	ppc | ppcbe)	basic_machine=powerpc-unknown
-		;;
-	ppc-* | ppcbe-*)
-		basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppcle | powerpclittle | ppc-le | powerpc-little)
-		basic_machine=powerpcle-unknown
-		;;
-	ppcle-* | powerpclittle-*)
-		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppc64)	basic_machine=powerpc64-unknown
-		;;
-	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
-		basic_machine=powerpc64le-unknown
-		;;
-	ppc64le-* | powerpc64little-*)
-		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ps2)
-		basic_machine=i386-ibm
-		;;
-	pw32)
-		basic_machine=i586-unknown
-		os=-pw32
-		;;
-	rdos | rdos64)
-		basic_machine=x86_64-pc
-		os=-rdos
-		;;
-	rdos32)
-		basic_machine=i386-pc
-		os=-rdos
-		;;
-	rom68k)
-		basic_machine=m68k-rom68k
-		os=-coff
-		;;
-	rm[46]00)
-		basic_machine=mips-siemens
-		;;
-	rtpc | rtpc-*)
-		basic_machine=romp-ibm
-		;;
-	s390 | s390-*)
-		basic_machine=s390-ibm
-		;;
-	s390x | s390x-*)
-		basic_machine=s390x-ibm
-		;;
-	sa29200)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-	sb1)
-		basic_machine=mipsisa64sb1-unknown
-		;;
-	sb1el)
-		basic_machine=mipsisa64sb1el-unknown
-		;;
-	sde)
-		basic_machine=mipsisa32-sde
-		os=-elf
-		;;
-	sei)
-		basic_machine=mips-sei
-		os=-seiux
-		;;
-	sequent)
-		basic_machine=i386-sequent
-		;;
-	sh)
-		basic_machine=sh-hitachi
-		os=-hms
-		;;
-	sh5el)
-		basic_machine=sh5le-unknown
-		;;
-	sh64)
-		basic_machine=sh64-unknown
-		;;
-	sparclite-wrs | simso-wrs)
-		basic_machine=sparclite-wrs
-		os=-vxworks
-		;;
-	sps7)
-		basic_machine=m68k-bull
-		os=-sysv2
-		;;
-	spur)
-		basic_machine=spur-unknown
-		;;
-	st2000)
-		basic_machine=m68k-tandem
-		;;
-	stratus)
-		basic_machine=i860-stratus
-		os=-sysv4
-		;;
-	strongarm-* | thumb-*)
-		basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	sun2)
-		basic_machine=m68000-sun
-		;;
-	sun2os3)
-		basic_machine=m68000-sun
-		os=-sunos3
-		;;
-	sun2os4)
-		basic_machine=m68000-sun
-		os=-sunos4
-		;;
-	sun3os3)
-		basic_machine=m68k-sun
-		os=-sunos3
-		;;
-	sun3os4)
-		basic_machine=m68k-sun
-		os=-sunos4
-		;;
-	sun4os3)
-		basic_machine=sparc-sun
-		os=-sunos3
-		;;
-	sun4os4)
-		basic_machine=sparc-sun
-		os=-sunos4
-		;;
-	sun4sol2)
-		basic_machine=sparc-sun
-		os=-solaris2
-		;;
-	sun3 | sun3-*)
-		basic_machine=m68k-sun
-		;;
-	sun4)
-		basic_machine=sparc-sun
-		;;
-	sun386 | sun386i | roadrunner)
-		basic_machine=i386-sun
-		;;
-	sv1)
-		basic_machine=sv1-cray
-		os=-unicos
-		;;
-	symmetry)
-		basic_machine=i386-sequent
-		os=-dynix
-		;;
-	t3e)
-		basic_machine=alphaev5-cray
-		os=-unicos
-		;;
-	t90)
-		basic_machine=t90-cray
-		os=-unicos
-		;;
-	tile*)
-		basic_machine=$basic_machine-unknown
-		os=-linux-gnu
-		;;
-	tx39)
-		basic_machine=mipstx39-unknown
-		;;
-	tx39el)
-		basic_machine=mipstx39el-unknown
-		;;
-	toad1)
-		basic_machine=pdp10-xkl
-		os=-tops20
-		;;
-	tower | tower-32)
-		basic_machine=m68k-ncr
-		;;
-	tpf)
-		basic_machine=s390x-ibm
-		os=-tpf
-		;;
-	udi29k)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-	ultra3)
-		basic_machine=a29k-nyu
-		os=-sym1
-		;;
-	v810 | necv810)
-		basic_machine=v810-nec
-		os=-none
-		;;
-	vaxv)
-		basic_machine=vax-dec
-		os=-sysv
-		;;
-	vms)
-		basic_machine=vax-dec
-		os=-vms
-		;;
-	vpp*|vx|vx-*)
-		basic_machine=f301-fujitsu
-		;;
-	vxworks960)
-		basic_machine=i960-wrs
-		os=-vxworks
-		;;
-	vxworks68)
-		basic_machine=m68k-wrs
-		os=-vxworks
-		;;
-	vxworks29k)
-		basic_machine=a29k-wrs
-		os=-vxworks
-		;;
-	w65*)
-		basic_machine=w65-wdc
-		os=-none
-		;;
-	w89k-*)
-		basic_machine=hppa1.1-winbond
-		os=-proelf
-		;;
-	xbox)
-		basic_machine=i686-pc
-		os=-mingw32
-		;;
-	xps | xps100)
-		basic_machine=xps100-honeywell
-		;;
-	xscale-* | xscalee[bl]-*)
-		basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'`
-		;;
-	ymp)
-		basic_machine=ymp-cray
-		os=-unicos
-		;;
-	z8k-*-coff)
-		basic_machine=z8k-unknown
-		os=-sim
-		;;
-	z80-*-coff)
-		basic_machine=z80-unknown
-		os=-sim
-		;;
-	none)
-		basic_machine=none-none
-		os=-none
-		;;
-
-# Here we handle the default manufacturer of certain CPU types.  It is in
-# some cases the only manufacturer, in others, it is the most popular.
-	w89k)
-		basic_machine=hppa1.1-winbond
-		;;
-	op50n)
-		basic_machine=hppa1.1-oki
-		;;
-	op60c)
-		basic_machine=hppa1.1-oki
-		;;
-	romp)
-		basic_machine=romp-ibm
-		;;
-	mmix)
-		basic_machine=mmix-knuth
-		;;
-	rs6000)
-		basic_machine=rs6000-ibm
-		;;
-	vax)
-		basic_machine=vax-dec
-		;;
-	pdp10)
-		# there are many clones, so DEC is not a safe bet
-		basic_machine=pdp10-unknown
-		;;
-	pdp11)
-		basic_machine=pdp11-dec
-		;;
-	we32k)
-		basic_machine=we32k-att
-		;;
-	sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
-		basic_machine=sh-unknown
-		;;
-	sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
-		basic_machine=sparc-sun
-		;;
-	cydra)
-		basic_machine=cydra-cydrome
-		;;
-	orion)
-		basic_machine=orion-highlevel
-		;;
-	orion105)
-		basic_machine=clipper-highlevel
-		;;
-	mac | mpw | mac-mpw)
-		basic_machine=m68k-apple
-		;;
-	pmac | pmac-mpw)
-		basic_machine=powerpc-apple
-		;;
-	*-unknown)
-		# Make sure to match an already-canonicalized machine name.
-		;;
-	*)
-		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-		exit 1
-		;;
-esac
-
-# Here we canonicalize certain aliases for manufacturers.
-case $basic_machine in
-	*-digital*)
-		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
-		;;
-	*-commodore*)
-		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
-		;;
-	*)
-		;;
-esac
-
-# Decode manufacturer-specific aliases for certain operating systems.
-
-if [ x"$os" != x"" ]
-then
-case $os in
-	# First match some system type aliases
-	# that might get confused with valid system types.
-	# -solaris* is a basic system type, with this one exception.
-	-auroraux)
-		os=-auroraux
-		;;
-	-solaris1 | -solaris1.*)
-		os=`echo $os | sed -e 's|solaris1|sunos4|'`
-		;;
-	-solaris)
-		os=-solaris2
-		;;
-	-svr4*)
-		os=-sysv4
-		;;
-	-unixware*)
-		os=-sysv4.2uw
-		;;
-	-gnu/linux*)
-		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
-		;;
-	# First accept the basic system types.
-	# The portable systems comes first.
-	# Each alternative MUST END IN A *, to match a version number.
-	# -sysv* is not here because it comes later, after sysvr4.
-	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
-	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
-	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-	      | -sym* | -kopensolaris* \
-	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-	      | -aos* | -aros* \
-	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
-	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
-	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
-	      | -bitrig* | -openbsd* | -solidbsd* \
-	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
-	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
-	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
-	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
-	      | -chorusos* | -chorusrdb* | -cegcc* \
-	      | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
-	      | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
-	      | -linux-newlib* | -linux-musl* | -linux-uclibc* \
-	      | -uxpv* | -beos* | -mpeix* | -udk* \
-	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
-	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
-	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
-	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
-	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
-	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
-	# Remember, each alternative MUST END IN *, to match a version number.
-		;;
-	-qnx*)
-		case $basic_machine in
-		    x86-* | i*86-*)
-			;;
-		    *)
-			os=-nto$os
-			;;
-		esac
-		;;
-	-nto-qnx*)
-		;;
-	-nto*)
-		os=`echo $os | sed -e 's|nto|nto-qnx|'`
-		;;
-	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
-	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
-	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
-		;;
-	-mac*)
-		os=`echo $os | sed -e 's|mac|macos|'`
-		;;
-	-linux-dietlibc)
-		os=-linux-dietlibc
-		;;
-	-linux*)
-		os=`echo $os | sed -e 's|linux|linux-gnu|'`
-		;;
-	-sunos5*)
-		os=`echo $os | sed -e 's|sunos5|solaris2|'`
-		;;
-	-sunos6*)
-		os=`echo $os | sed -e 's|sunos6|solaris3|'`
-		;;
-	-opened*)
-		os=-openedition
-		;;
-	-os400*)
-		os=-os400
-		;;
-	-wince*)
-		os=-wince
-		;;
-	-osfrose*)
-		os=-osfrose
-		;;
-	-osf*)
-		os=-osf
-		;;
-	-utek*)
-		os=-bsd
-		;;
-	-dynix*)
-		os=-bsd
-		;;
-	-acis*)
-		os=-aos
-		;;
-	-atheos*)
-		os=-atheos
-		;;
-	-syllable*)
-		os=-syllable
-		;;
-	-386bsd)
-		os=-bsd
-		;;
-	-ctix* | -uts*)
-		os=-sysv
-		;;
-	-nova*)
-		os=-rtmk-nova
-		;;
-	-ns2 )
-		os=-nextstep2
-		;;
-	-nsk*)
-		os=-nsk
-		;;
-	# Preserve the version number of sinix5.
-	-sinix5.*)
-		os=`echo $os | sed -e 's|sinix|sysv|'`
-		;;
-	-sinix*)
-		os=-sysv4
-		;;
-	-tpf*)
-		os=-tpf
-		;;
-	-triton*)
-		os=-sysv3
-		;;
-	-oss*)
-		os=-sysv3
-		;;
-	-svr4)
-		os=-sysv4
-		;;
-	-svr3)
-		os=-sysv3
-		;;
-	-sysvr4)
-		os=-sysv4
-		;;
-	# This must come after -sysvr4.
-	-sysv*)
-		;;
-	-ose*)
-		os=-ose
-		;;
-	-es1800*)
-		os=-ose
-		;;
-	-xenix)
-		os=-xenix
-		;;
-	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
-		os=-mint
-		;;
-	-aros*)
-		os=-aros
-		;;
-	-kaos*)
-		os=-kaos
-		;;
-	-zvmoe)
-		os=-zvmoe
-		;;
-	-dicos*)
-		os=-dicos
-		;;
-	-nacl*)
-		;;
-	-none)
-		;;
-	*)
-		# Get rid of the `-' at the beginning of $os.
-		os=`echo $os | sed 's/[^-]*-//'`
-		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
-		exit 1
-		;;
-esac
-else
-
-# Here we handle the default operating systems that come with various machines.
-# The value should be what the vendor currently ships out the door with their
-# machine or put another way, the most popular os provided with the machine.
-
-# Note that if you're going to try to match "-MANUFACTURER" here (say,
-# "-sun"), then you have to tell the case statement up towards the top
-# that MANUFACTURER isn't an operating system.  Otherwise, code above
-# will signal an error saying that MANUFACTURER isn't an operating
-# system, and we'll never get to this point.
-
-case $basic_machine in
-	score-*)
-		os=-elf
-		;;
-	spu-*)
-		os=-elf
-		;;
-	*-acorn)
-		os=-riscix1.2
-		;;
-	arm*-rebel)
-		os=-linux
-		;;
-	arm*-semi)
-		os=-aout
-		;;
-	c4x-* | tic4x-*)
-		os=-coff
-		;;
-	hexagon-*)
-		os=-elf
-		;;
-	tic54x-*)
-		os=-coff
-		;;
-	tic55x-*)
-		os=-coff
-		;;
-	tic6x-*)
-		os=-coff
-		;;
-	# This must come before the *-dec entry.
-	pdp10-*)
-		os=-tops20
-		;;
-	pdp11-*)
-		os=-none
-		;;
-	*-dec | vax-*)
-		os=-ultrix4.2
-		;;
-	m68*-apollo)
-		os=-domain
-		;;
-	i386-sun)
-		os=-sunos4.0.2
-		;;
-	m68000-sun)
-		os=-sunos3
-		;;
-	m68*-cisco)
-		os=-aout
-		;;
-	mep-*)
-		os=-elf
-		;;
-	mips*-cisco)
-		os=-elf
-		;;
-	mips*-*)
-		os=-elf
-		;;
-	or32-*)
-		os=-coff
-		;;
-	*-tti)	# must be before sparc entry or we get the wrong os.
-		os=-sysv3
-		;;
-	sparc-* | *-sun)
-		os=-sunos4.1.1
-		;;
-	*-be)
-		os=-beos
-		;;
-	*-haiku)
-		os=-haiku
-		;;
-	*-ibm)
-		os=-aix
-		;;
-	*-knuth)
-		os=-mmixware
-		;;
-	*-wec)
-		os=-proelf
-		;;
-	*-winbond)
-		os=-proelf
-		;;
-	*-oki)
-		os=-proelf
-		;;
-	*-hp)
-		os=-hpux
-		;;
-	*-hitachi)
-		os=-hiux
-		;;
-	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
-		os=-sysv
-		;;
-	*-cbm)
-		os=-amigaos
-		;;
-	*-dg)
-		os=-dgux
-		;;
-	*-dolphin)
-		os=-sysv3
-		;;
-	m68k-ccur)
-		os=-rtu
-		;;
-	m88k-omron*)
-		os=-luna
-		;;
-	*-next )
-		os=-nextstep
-		;;
-	*-sequent)
-		os=-ptx
-		;;
-	*-crds)
-		os=-unos
-		;;
-	*-ns)
-		os=-genix
-		;;
-	i370-*)
-		os=-mvs
-		;;
-	*-next)
-		os=-nextstep3
-		;;
-	*-gould)
-		os=-sysv
-		;;
-	*-highlevel)
-		os=-bsd
-		;;
-	*-encore)
-		os=-bsd
-		;;
-	*-sgi)
-		os=-irix
-		;;
-	*-siemens)
-		os=-sysv4
-		;;
-	*-masscomp)
-		os=-rtu
-		;;
-	f30[01]-fujitsu | f700-fujitsu)
-		os=-uxpv
-		;;
-	*-rom68k)
-		os=-coff
-		;;
-	*-*bug)
-		os=-coff
-		;;
-	*-apple)
-		os=-macos
-		;;
-	*-atari*)
-		os=-mint
-		;;
-	*)
-		os=-none
-		;;
-esac
-fi
-
-# Here we handle the case where we know the os, and the CPU type, but not the
-# manufacturer.  We pick the logical manufacturer.
-vendor=unknown
-case $basic_machine in
-	*-unknown)
-		case $os in
-			-riscix*)
-				vendor=acorn
-				;;
-			-sunos*)
-				vendor=sun
-				;;
-			-cnk*|-aix*)
-				vendor=ibm
-				;;
-			-beos*)
-				vendor=be
-				;;
-			-hpux*)
-				vendor=hp
-				;;
-			-mpeix*)
-				vendor=hp
-				;;
-			-hiux*)
-				vendor=hitachi
-				;;
-			-unos*)
-				vendor=crds
-				;;
-			-dgux*)
-				vendor=dg
-				;;
-			-luna*)
-				vendor=omron
-				;;
-			-genix*)
-				vendor=ns
-				;;
-			-mvs* | -opened*)
-				vendor=ibm
-				;;
-			-os400*)
-				vendor=ibm
-				;;
-			-ptx*)
-				vendor=sequent
-				;;
-			-tpf*)
-				vendor=ibm
-				;;
-			-vxsim* | -vxworks* | -windiss*)
-				vendor=wrs
-				;;
-			-aux*)
-				vendor=apple
-				;;
-			-hms*)
-				vendor=hitachi
-				;;
-			-mpw* | -macos*)
-				vendor=apple
-				;;
-			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
-				vendor=atari
-				;;
-			-vos*)
-				vendor=stratus
-				;;
-		esac
-		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
-		;;
-esac
-
-echo $basic_machine$os
-exit
-
-# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/android/src/main/libenc/jni/libx264/configure b/android/src/main/libenc/jni/libx264/configure
deleted file mode 100755
index 78f87dc..0000000
--- a/android/src/main/libenc/jni/libx264/configure
+++ /dev/null
@@ -1,1505 +0,0 @@
-#!/bin/bash
-
-if test x"$1" = x"-h" -o x"$1" = x"--help" ; then
-cat <<EOF
-Usage: ./configure [options]
-
-Help:
-  -h, --help               print this message
-
-Standard options:
-  --prefix=PREFIX          install architecture-independent files in PREFIX
-                           [/usr/local]
-  --exec-prefix=EPREFIX    install architecture-dependent files in EPREFIX
-                           [PREFIX]
-  --bindir=DIR             install binaries in DIR [EPREFIX/bin]
-  --libdir=DIR             install libs in DIR [EPREFIX/lib]
-  --includedir=DIR         install includes in DIR [PREFIX/include]
-  --extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS
-  --extra-cflags=ECFLAGS   add ECFLAGS to CFLAGS
-  --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS
-  --extra-rcflags=ERCFLAGS add ERCFLAGS to RCFLAGS
-
-Configuration options:
-  --disable-cli            disable cli
-  --system-libx264         use system libx264 instead of internal
-  --enable-shared          build shared library
-  --enable-static          build static library
-  --disable-opencl         disable OpenCL features
-  --disable-gpl            disable GPL-only features
-  --disable-thread         disable multithreaded encoding
-  --disable-win32thread    disable win32threads (windows only)
-  --disable-interlaced     disable interlaced encoding support
-  --bit-depth=BIT_DEPTH    set output bit depth (8-10) [8]
-  --chroma-format=FORMAT   output chroma format (420, 422, 444, all) [all]
-
-Advanced options:
-  --disable-asm            disable platform-specific assembly optimizations
-  --enable-lto             enable link-time optimization
-  --enable-debug           add -g
-  --enable-gprof           add -pg
-  --enable-strip           add -s
-  --enable-pic             build position-independent code
-
-Cross-compilation:
-  --host=HOST              build programs to run on HOST
-  --cross-prefix=PREFIX    use PREFIX for compilation tools
-  --sysroot=SYSROOT        root of cross-build tree
-
-External library support:
-  --disable-avs            disable avisynth support
-  --disable-swscale        disable swscale support
-  --disable-lavf           disable libavformat support
-  --disable-ffms           disable ffmpegsource support
-  --disable-gpac           disable gpac support
-  --disable-lsmash         disable lsmash support
-
-EOF
-exit 1
-fi
-
-log_check() {
-    echo -n "checking $1... " >> config.log
-}
-
-log_ok() {
-    echo "yes" >> config.log
-}
-
-log_fail() {
-    echo "no" >> config.log
-}
-
-log_msg() {
-    echo "$1" >> config.log
-}
-
-cc_cflags() {
-    # several non gcc compilers issue an incredibly large number of warnings on high warning levels,
-    # suppress them by reducing the warning level rather than having to use #pragmas
-    for arg in $*; do
-        [[ "$arg" = -falign-loops* ]] && arg=
-        [ "$arg" = -fno-tree-vectorize ] && arg=
-        [ "$arg" = -Wshadow ] && arg=
-        [ "$arg" = -Wno-maybe-uninitialized ] && arg=
-        [[ "$arg" = -mpreferred-stack-boundary* ]] && arg=
-        [[ "$arg" = -l* ]] && arg=
-        [[ "$arg" = -L* ]] && arg=
-        if [ $compiler_style = MS ]; then
-            [ "$arg" = -ffast-math ] && arg="-fp:fast"
-            [ "$arg" = -Wall ] && arg=
-            [ "$arg" = -Werror ] && arg="-W3 -WX"
-            [ "$arg" = -g ] && arg=-Z7
-            [ "$arg" = -fomit-frame-pointer ] && arg=
-            [ "$arg" = -s ] && arg=
-            [ "$arg" = -fPIC ] && arg=
-        else
-            [ "$arg" = -ffast-math ] && arg=
-            [ "$arg" = -Wall ] && arg=
-            [ "$arg" = -Werror ] && arg="-w3 -Werror"
-        fi
-        [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2
-
-        [ -n "$arg" ] && echo -n "$arg "
-    done
-}
-
-cl_ldflags() {
-    for arg in $*; do
-        arg=${arg/LIBPATH/libpath}
-        [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib
-        [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L}
-        [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware
-        [ "$arg" = -s ] && arg=
-        [ "$arg" = -Wl,-Bsymbolic ] && arg=
-        [ "$arg" = -fno-tree-vectorize ] && arg=
-        [ "$arg" = -Werror ] && arg=
-        [ "$arg" = -Wshadow ] && arg=
-        [ "$arg" = -Wmaybe-uninitialized ] && arg=
-        [[ "$arg" = -Qdiag-error* ]] && arg=
-
-        arg=${arg/pthreadGC/pthreadVC}
-        [ "$arg" = avifil32.lib ] && arg=vfw32.lib
-        [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib
-        [ "$arg" = x264.lib ] && arg=libx264.lib
-
-        [ -n "$arg" ] && echo -n "$arg "
-    done
-}
-
-cc_check() {
-    if [ -z "$3" ]; then
-        if [ -z "$1$2" ]; then
-            log_check "whether $CC works"
-        elif [ -z "$1" ]; then
-            log_check "for $2"
-        else
-            log_check "for $1"
-        fi
-    elif [ -z "$1" ]; then
-        if [ -z "$2" ]; then
-            log_check "whether $CC supports $3"
-        else
-            log_check "whether $CC supports $3 with $2"
-        fi
-    else
-        log_check "for $3 in $1";
-    fi
-    rm -f conftest.c
-    for arg in $1; do
-        echo "#include <$arg>" >> conftest.c
-    done
-    echo "int main (void) { $3 return 0; }" >> conftest.c
-    if [ $compiler_style = MS ]; then
-        cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
-    else
-        cc_cmd="$CC conftest.c $CFLAGS $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
-    fi
-    if $cc_cmd >conftest.log 2>&1; then
-        res=$?
-        log_ok
-    else
-        res=$?
-        log_fail
-        log_msg "Failed commandline was:"
-        log_msg "--------------------------------------------------"
-        log_msg "$cc_cmd"
-        cat conftest.log >> config.log
-        log_msg "--------------------------------------------------"
-        log_msg "Failed program was:"
-        log_msg "--------------------------------------------------"
-        cat conftest.c >> config.log
-        log_msg "--------------------------------------------------"
-    fi
-    return $res
-}
-
-cpp_check() {
-    log_check "whether $3 is true"
-    rm -f conftest.c
-    for arg in $1; do
-        echo "#include <$arg>" >> conftest.c
-    done
-    echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c
-    if [ $compiler_style = MS ]; then
-        cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P"
-    else
-        cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest"
-    fi
-    if $cpp_cmd >conftest.log 2>&1; then
-        res=$?
-        log_ok
-    else
-        res=$?
-        log_fail
-        log_msg "--------------------------------------------------"
-        cat conftest.log >> config.log
-        log_msg "--------------------------------------------------"
-        log_msg "Failed program was:"
-        log_msg "--------------------------------------------------"
-        cat conftest.c >> config.log
-        log_msg "--------------------------------------------------"
-    fi
-    return $res
-}
-
-as_check() {
-    log_check "whether $AS supports $1"
-    echo "$1" > conftest$AS_EXT
-    as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o"
-    if $as_cmd >conftest.log 2>&1; then
-        res=$?
-        log_ok
-    else
-        res=$?
-        log_fail
-        log_msg "Failed commandline was:"
-        log_msg "--------------------------------------------------"
-        log_msg "$as_cmd"
-        cat conftest.log >> config.log
-        log_msg "--------------------------------------------------"
-        log_msg "Failed program was:"
-        log_msg "--------------------------------------------------"
-        cat conftest$AS_EXT >> config.log
-        log_msg "--------------------------------------------------"
-    fi
-    return $res
-}
-
-rc_check() {
-    log_check "whether $RC works"
-    echo "$1" > conftest.rc
-    if [ $compiler = GNU ]; then
-        rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc"
-    else
-        rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc"
-    fi
-    if $rc_cmd >conftest.log 2>&1; then
-        res=$?
-        log_ok
-    else
-        res=$?
-        log_fail
-        log_msg "Failed commandline was:"
-        log_msg "--------------------------------------------------"
-        log_msg "$rc_cmd"
-        cat conftest.log >> config.log
-        log_msg "--------------------------------------------------"
-        log_msg "Failed program was:"
-        log_msg "--------------------------------------------------"
-        cat conftest.rc >> config.log
-        log_msg "--------------------------------------------------"
-    fi
-    return $res
-}
-
-define() {
-    echo "#define $1$([ -n "$2" ] && echo " $2" || echo " 1")" >> config.h
-}
-
-die() {
-    log_msg "DIED: $@"
-    echo "$@"
-    exit 1
-}
-
-configure_system_override() {
-    log_check "system libx264 configuration"
-    x264_config_path="$1/x264_config.h"
-    if [ -e "$x264_config_path" ]; then
-        res=$?
-        log_ok
-        arg="$(grep '#define X264_GPL ' $x264_config_path | sed -e 's/#define X264_GPL *//; s/ *$//')"
-        if [ -n "$arg" ]; then
-            [ "$arg" = 0 ] && arg="no" || arg="yes"
-            [ "$arg" != "$gpl" ] && die "Incompatible license with system libx264"
-        fi
-        arg="$(grep '#define X264_BIT_DEPTH ' $x264_config_path | sed -e 's/#define X264_BIT_DEPTH *//; s/ *$//')"
-        if [ -n "$arg" ]; then
-            if [ "$arg" != "$bit_depth" ]; then
-                echo "Override output bit depth with system libx264 configuration"
-                bit_depth="$arg"
-            fi
-        fi
-        arg="$(grep '#define X264_CHROMA_FORMAT ' $x264_config_path | sed -e 's/#define X264_CHROMA_FORMAT *//; s/ *$//')"
-        if [ -n "$arg" ]; then
-            [ "$arg" = 0 ] && arg="all" || arg="${arg#X264_CSP_I}"
-            if [ "$arg" != "$chroma_format" ]; then
-                echo "Override output chroma format with system libx264 configuration"
-                chroma_format="$arg"
-            fi
-        fi
-        arg="$(grep '#define X264_INTERLACED ' $x264_config_path | sed -e 's/#define X264_INTERLACED *//; s/ *$//')"
-        if [ -n "$arg" ]; then
-            [ "$arg" = 0 ] && arg="no" || arg="yes"
-            if [ "$arg" != "$interlaced" ]; then
-                echo "Override interlaced encoding support with system libx264 configuration"
-                interlaced="$arg"
-            fi
-        fi
-    else
-        res=$?
-        log_fail
-        log_msg "Failed search path was: $x264_config_path"
-    fi
-    return $res
-}
-
-rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest*
-
-# Construct a path to the specified directory relative to the working directory
-relative_path() {
-    local base="${PWD%/}"
-    local path="$(cd "$1" >/dev/null; printf '%s/.' "${PWD%/}")"
-    local up=''
-
-    while [[ $path != "$base/"* ]]; do
-        base="${base%/*}"
-        up="../$up"
-    done
-
-    dirname "$up${path#"$base/"}"
-}
-
-SRCPATH="$(relative_path "$(dirname "$0")")"
-echo "$SRCPATH" | grep -q ' ' && die "Out of tree builds are impossible with whitespace in source path."
-[ -e "$SRCPATH/config.h" -o -e "$SRCPATH/x264_config.h" ] && die "Out of tree builds are impossible with config.h/x264_config.h in source dir."
-
-prefix='/usr/local'
-exec_prefix='${prefix}'
-bindir='${exec_prefix}/bin'
-libdir='${exec_prefix}/lib'
-includedir='${prefix}/include'
-DEVNULL='/dev/null'
-
-cli="yes"
-cli_libx264="internal"
-shared="no"
-static="no"
-avs="auto"
-lavf="auto"
-ffms="auto"
-gpac="auto"
-lsmash="auto"
-mp4="no"
-gpl="yes"
-thread="auto"
-swscale="auto"
-asm="auto"
-interlaced="yes"
-lto="no"
-debug="no"
-gprof="no"
-strip="no"
-pic="no"
-bit_depth="8"
-chroma_format="all"
-compiler="GNU"
-compiler_style="GNU"
-opencl="yes"
-
-CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
-LDFLAGS="$LDFLAGS"
-LDFLAGSCLI="$LDFLAGSCLI"
-ASFLAGS="$ASFLAGS -I. -I\$(SRCPATH)"
-RCFLAGS="$RCFLAGS"
-CHECK_CFLAGS=""
-HAVE_GETOPT_LONG=1
-cross_prefix=""
-
-EXE=""
-AS_EXT=".S"
-NL="
-"
-
-# list of all preprocessor HAVE values we can define
-CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
-             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \
-             MSA MMAP WINRT"
-
-# parse options
-
-for opt do
-    optarg="${opt#*=}"
-    case "$opt" in
-        --prefix=*)
-            prefix="$optarg"
-            ;;
-        --exec-prefix=*)
-            exec_prefix="$optarg"
-            ;;
-        --bindir=*)
-            bindir="$optarg"
-            ;;
-        --libdir=*)
-            libdir="$optarg"
-            ;;
-        --includedir=*)
-            includedir="$optarg"
-            ;;
-        --disable-cli)
-            cli="no"
-            ;;
-        --system-libx264)
-            cli_libx264="system"
-            ;;
-        --enable-shared)
-            shared="yes"
-            ;;
-        --enable-static)
-            static="yes"
-            ;;
-        --disable-asm)
-            asm="no"
-            ;;
-        --disable-interlaced)
-            interlaced="no"
-            ;;
-        --disable-avs)
-            avs="no"
-            ;;
-        --disable-lavf)
-            lavf="no"
-            ;;
-        --disable-ffms)
-            ffms="no"
-            ;;
-        --disable-gpac)
-            gpac="no"
-            ;;
-        --disable-lsmash)
-            lsmash="no"
-            ;;
-        --disable-gpl)
-            gpl="no"
-            ;;
-        --extra-asflags=*)
-            ASFLAGS="$ASFLAGS $optarg"
-            ;;
-        --extra-cflags=*)
-            CFLAGS="$CFLAGS $optarg"
-            ;;
-        --extra-ldflags=*)
-            LDFLAGS="$LDFLAGS $optarg"
-            ;;
-        --extra-rcflags=*)
-            RCFLAGS="$RCFLAGS $optarg"
-            ;;
-        --disable-thread)
-            thread="no"
-            ;;
-        --disable-win32thread)
-            [ "$thread" != "no" ] && thread="posix"
-            ;;
-        --disable-swscale)
-            swscale="no"
-            ;;
-        --enable-lto)
-            lto="auto"
-            ;;
-        --enable-debug)
-            debug="yes"
-            ;;
-        --enable-gprof)
-            CFLAGS="$CFLAGS -pg"
-            LDFLAGS="$LDFLAGS -pg"
-            gprof="yes"
-            ;;
-        --enable-strip)
-            strip="yes"
-            ;;
-        --enable-pic)
-            pic="yes"
-            ;;
-        --host=*)
-            host="$optarg"
-            ;;
-        --disable-opencl)
-            opencl="no"
-            ;;
-        --cross-prefix=*)
-            cross_prefix="$optarg"
-            ;;
-        --sysroot=*)
-            CFLAGS="$CFLAGS --sysroot=$optarg"
-            LDFLAGS="$LDFLAGS --sysroot=$optarg"
-            ;;
-        --bit-depth=*)
-            bit_depth="$optarg"
-            if [ "$bit_depth" -lt "8" -o "$bit_depth" -gt "10" ]; then
-                echo "Supplied bit depth must be in range [8,10]."
-                exit 1
-            fi
-            bit_depth=`expr $bit_depth + 0`
-            ;;
-        --chroma-format=*)
-            chroma_format="$optarg"
-            if [ $chroma_format != "420" -a $chroma_format != "422" -a $chroma_format != "444" -a $chroma_format != "all" ]; then
-                echo "Supplied chroma format must be 420, 422, 444 or all."
-                exit 1
-            fi
-            ;;
-        *)
-            echo "Unknown option $opt, ignored"
-            ;;
-    esac
-done
-
-[ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static."
-
-CC="${CC-${cross_prefix}gcc}"
-STRIP="${STRIP-${cross_prefix}strip}"
-INSTALL="${INSTALL-install}"
-PKGCONFIG="${PKGCONFIG-${cross_prefix}pkg-config}"
-
-# ar and ranlib doesn't load the LTO plugin by default, prefer the gcc-prefixed wrappers which does.
-if ${cross_prefix}gcc-ar --version >/dev/null 2>&1; then
-    AR="${AR-${cross_prefix}gcc-ar}"
-else
-    AR="${AR-${cross_prefix}ar}"
-fi
-if ${cross_prefix}gcc-ranlib --version >/dev/null 2>&1; then
-    RANLIB="${RANLIB-${cross_prefix}gcc-ranlib}"
-else
-    RANLIB="${RANLIB-${cross_prefix}ranlib}"
-fi
-
-if [ "x$host" = x ]; then
-    host=`${SRCPATH}/config.guess`
-fi
-# normalize a triplet into a quadruplet
-host=`${SRCPATH}/config.sub $host`
-
-# split $host
-host_cpu="${host%%-*}"
-host="${host#*-}"
-host_vendor="${host%%-*}"
-host_os="${host#*-}"
-
-trap 'rm -f conftest*' EXIT
-
-# test for use of compilers that require specific handling
-cc_base=`basename "$CC"`
-QPRE="-"
-if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
-    if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then
-        # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths.
-        [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS"
-        compiler=ICL
-        compiler_style=MS
-        CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras"
-        QPRE="-Q"
-        `$CC 2>&1 | grep -q IA-32` && host_cpu=i486
-        `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64
-        cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
-        if cc_check '' -Qdiag-error:10006,10157 ; then
-            CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157"
-        fi
-    elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then
-        # Standard Microsoft Visual Studio
-        compiler=CL
-        compiler_style=MS
-        CFLAGS="$CFLAGS -nologo -GS- -DHAVE_STRING_H -I\$(SRCPATH)/extras"
-        `$CC 2>&1 | grep -q 'x86'` && host_cpu=i486
-        `$CC 2>&1 | grep -q 'x64'` && host_cpu=x86_64
-        cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer"
-    else
-        # MinGW uses broken pre-VS2015 Microsoft printf functions unless it's told to use the POSIX ones.
-        CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L"
-    fi
-else
-    if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then
-        AR="xiar"
-        compiler=ICC
-    fi
-fi
-
-if [[ "$cc_base" = clang* ]]; then
-    if cc_check '' -Werror=unknown-warning-option ; then
-        CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option"
-    fi
-fi
-
-libm=""
-case $host_os in
-    beos*)
-        SYS="BEOS"
-        define HAVE_MALLOC_H
-        ;;
-    darwin*)
-        SYS="MACOSX"
-        libm="-lm"
-        if [ "$pic" = "no" ]; then
-            cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic"
-        fi
-        ;;
-    freebsd*)
-        SYS="FREEBSD"
-        libm="-lm"
-        ;;
-    kfreebsd*-gnu)
-        SYS="FREEBSD"
-        define HAVE_MALLOC_H
-        libm="-lm"
-        ;;
-    netbsd*)
-        SYS="NETBSD"
-        libm="-lm"
-        ;;
-    openbsd*)
-        SYS="OPENBSD"
-        libm="-lm"
-        ;;
-    *linux*)
-        SYS="LINUX"
-        define HAVE_MALLOC_H
-        libm="-lm"
-        ;;
-    gnu*)
-        SYS="HURD"
-        define HAVE_MALLOC_H
-        libm="-lm"
-        ;;
-    cygwin*|mingw*|msys*)
-        EXE=".exe"
-        if [[ $host_os = cygwin* ]] && cpp_check "" "" "defined(__CYGWIN__)" ; then
-            SYS="CYGWIN"
-            define HAVE_MALLOC_H
-        else
-            SYS="WINDOWS"
-            DEVNULL="NUL"
-            LDFLAGSCLI="$LDFLAGSCLI -lshell32"
-            [ $compiler = GNU ] && RC="${RC-${cross_prefix}windres}" || RC="${RC-rc}"
-        fi
-        ;;
-    sunos*|solaris*)
-        SYS="SunOS"
-        define HAVE_MALLOC_H
-        libm="-lm"
-        if cc_check "" /usr/lib/64/values-xpg6.o; then
-            LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o"
-        else
-            LDFLAGS="$LDFLAGS /usr/lib/values-xpg6.o"
-        fi
-        if test -x /usr/ucb/install ; then
-            INSTALL=/usr/ucb/install
-        elif test -x /usr/bin/ginstall ; then
-            # OpenSolaris
-            INSTALL=/usr/bin/ginstall
-        elif test -x /usr/gnu/bin/install ; then
-            # OpenSolaris
-            INSTALL=/usr/gnu/bin/install
-        fi
-        HAVE_GETOPT_LONG=0
-        ;;
-    *qnx*)
-        SYS="QNX"
-        define HAVE_MALLOC_H
-        libm="-lm"
-        HAVE_GETOPT_LONG=0
-        CFLAGS="$CFLAGS -I\$(SRCPATH)/extras"
-        ;;
-    *haiku*)
-        SYS="HAIKU"
-        ;;
-    *)
-        die "Unknown system $host, edit the configure"
-        ;;
-esac
-
-LDFLAGS="$LDFLAGS $libm"
-
-stack_alignment=4
-case $host_cpu in
-    i*86)
-        ARCH="X86"
-        AS="${AS-yasm}"
-        AS_EXT=".asm"
-        ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/"
-        if [ $compiler = GNU ]; then
-            if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then
-                CFLAGS="$CFLAGS -march=i686"
-            fi
-            if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then
-                CFLAGS="$CFLAGS -mfpmath=sse -msse -msse2"
-            fi
-            CFLAGS="-m32 $CFLAGS"
-            LDFLAGS="-m32 $LDFLAGS"
-        fi
-        if [ "$SYS" = MACOSX ]; then
-            ASFLAGS="$ASFLAGS -f macho32 -DPREFIX"
-        elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then
-            ASFLAGS="$ASFLAGS -f win32 -DPREFIX"
-            LDFLAGS="$LDFLAGS -Wl,--large-address-aware"
-            [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware"
-            [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS"
-        else
-            ASFLAGS="$ASFLAGS -f elf32"
-        fi
-        ;;
-    x86_64)
-        ARCH="X86_64"
-        AS="${AS-yasm}"
-        AS_EXT=".asm"
-        ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/"
-        stack_alignment=16
-        [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS"
-        if [ "$SYS" = MACOSX ]; then
-            ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX"
-            if cc_check '' "-arch x86_64"; then
-                CFLAGS="$CFLAGS -arch x86_64"
-                LDFLAGS="$LDFLAGS -arch x86_64"
-            fi
-        elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then
-            ASFLAGS="$ASFLAGS -f win64"
-            if [ $compiler = GNU ]; then
-                # only the GNU toolchain is inconsistent in prefixing function names with _
-                cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX"
-                cc_check "" "-Wl,--high-entropy-va" && LDFLAGS="$LDFLAGS -Wl,--high-entropy-va"
-                LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware"
-                LDFLAGSCLI="$LDFLAGSCLI -Wl,--image-base,0x140000000"
-                SOFLAGS="$SOFLAGS -Wl,--image-base,0x180000000"
-                RCFLAGS="--target=pe-x86-64 $RCFLAGS"
-            fi
-        else
-            ASFLAGS="$ASFLAGS -f elf64"
-        fi
-        ;;
-    powerpc*)
-        ARCH="PPC"
-        if [ $asm = auto ] ; then
-            define HAVE_ALTIVEC
-            AS="${AS-${CC}}"
-            AS_EXT=".c"
-            if [ $SYS = MACOSX ] ; then
-                CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4"
-            else
-                CFLAGS="$CFLAGS -maltivec -mabi=altivec"
-                define HAVE_ALTIVEC_H
-            fi
-        fi
-        ;;
-    sparc)
-        ARCH="SPARC"
-        ;;
-    mips*)
-        ARCH="MIPS"
-        AS="${AS-${CC}}"
-        AS_EXT=".c"
-        ;;
-    arm*)
-        ARCH="ARM"
-        if [ "$SYS" = MACOSX ] ; then
-            AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -- ${CC}}"
-            ASFLAGS="$ASFLAGS -DPREFIX -DPIC"  # apple's ld doesn't support movw/movt relocations at all
-            # build for armv7 by default
-            if ! echo $CFLAGS | grep -Eq '\-arch' ; then
-                CFLAGS="$CFLAGS -arch armv7"
-                LDFLAGS="$LDFLAGS -arch armv7"
-            fi
-        else
-            AS="${AS-${CC}}"
-        fi
-        ;;
-    aarch64)
-        ARCH="AARCH64"
-        stack_alignment=16
-        if [ "$SYS" = MACOSX ] ; then
-            AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch aarch64 -- ${CC}}"
-            ASFLAGS="$ASFLAGS -DPREFIX"
-        else
-            AS="${AS-${CC}}"
-        fi
-        ;;
-    s390|s390x)
-        ARCH="S390"
-        ;;
-    hppa*|parisc*)
-        ARCH="PARISC"
-        ;;
-    ia64)
-        ARCH="IA64"
-        ;;
-    alpha*)
-        ARCH="ALPHA"
-        ;;
-    *)
-        ARCH="$(echo $host_cpu | tr a-z A-Z)"
-        ;;
-esac
-
-if [ $SYS = WINDOWS ]; then
-    if ! rc_check "0 RCDATA {0}" ; then
-        RC=""
-    fi
-
-    if cpp_check "winapifamily.h" "" "!WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)" ; then
-        [ $compiler = CL ] || die "WinRT requires MSVC"
-        define HAVE_WINRT
-        CFLAGS="$CFLAGS -MD"
-        LDFLAGS="$LDFLAGS -appcontainer"
-        if ! cpp_check "" "" "defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0603" ; then
-            die "_WIN32_WINNT must be defined to at least 0x0603 (Windows 8.1) for WinRT"
-        elif cpp_check "" "" "_WIN32_WINNT >= 0x0A00" ; then
-            # Universal Windows Platform (Windows 10)
-            LDFLAGS="$LDFLAGS -lWindowsApp"
-        fi
-        cli="no"
-        opencl="no"
-    fi
-fi
-
-log_msg "x264 configure script"
-if [ -n "$*" ]; then
-    msg="Command line options:"
-    for i in $@; do
-        msg="$msg \"$i\""
-    done
-    log_msg "$msg"
-fi
-log_msg ""
-
-# check requirements
-
-cc_check || die "No working C compiler found."
-
-if [ $compiler_style = GNU ]; then
-    if cc_check '' -std=gnu99 'for( int i = 0; i < 9; i++ );' ; then
-        CFLAGS="$CFLAGS -std=gnu99 -D_GNU_SOURCE"
-    elif cc_check '' -std=c99 'for( int i = 0; i < 9; i++ );' ; then
-        CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE"
-    elif ! cc_check '' '' 'for( int i = 0; i < 9; i++ );' ; then
-        die "C99 compiler is needed for compilation."
-    fi
-fi
-
-if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then
-    pic="yes"
-fi
-
-if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
-    if cc_check '' -mpreferred-stack-boundary=5 ; then
-        CFLAGS="$CFLAGS -mpreferred-stack-boundary=5"
-        stack_alignment=32
-    elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then
-        CFLAGS="$CFLAGS -mpreferred-stack-boundary=4"
-        stack_alignment=16
-    fi
-elif [ $compiler = ICC -a $ARCH = X86 ]; then
-    # icc on linux has various degrees of mod16 stack support
-    if [ $SYS = LINUX ]; then
-        # >= 12 defaults to a mod16 stack
-        if cpp_check "" "" "__INTEL_COMPILER >= 1200" ; then
-            stack_alignment=16
-        # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so.
-        elif cpp_check "" "" "__INTEL_COMPILER >= 1100" ; then
-            CFLAGS="$CFLAGS -falign-stack=assume-16-byte"
-            stack_alignment=16
-        fi
-        # < 11 is completely incapable of keeping a mod16 stack
-    fi
-fi
-
-if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
-    if ! as_check "vpmovzxwd ymm0, xmm0" ; then
-        VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
-        echo "Found $VER"
-        echo "Minimum version is yasm-1.2.0"
-        echo "If you really want to compile without asm, configure with --disable-asm."
-        exit 1
-    fi
-    cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM
-    ASFLAGS="$ASFLAGS -Worphan-labels"
-    define HAVE_MMX
-fi
-
-if [ $asm = auto -a $ARCH = ARM ] ; then
-    # set flags so neon is built by default
-    echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon"
-
-    if  cc_check '' '' '__asm__("rev ip, ip");' ; then      define HAVE_ARMV6
-        cc_check '' '' '__asm__("movt r0, #0");'         && define HAVE_ARMV6T2
-        cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON
-        ASFLAGS="$ASFLAGS -c"
-    else
-        echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS."
-        echo "If you really want to run on such a CPU, configure with --disable-asm."
-        exit 1
-    fi
-fi
-
-if [ $asm = auto -a $ARCH = AARCH64 ] ; then
-    if  cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define HAVE_NEON
-        ASFLAGS="$ASFLAGS -c"
-    else
-        echo "no NEON support, try adding -mfpu=neon to CFLAGS"
-        echo "If you really want to run on such a CPU, configure with --disable-asm."
-        exit 1
-    fi
-fi
-
-if [ $asm = auto -a \( $ARCH = ARM -o $ARCH = AARCH64 \) ] ; then
-    # check if the assembler supports '.func' (clang 3.5 does not)
-    as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1
-fi
-
-if [ $asm = auto -a $ARCH = MIPS ] ; then
-    if ! echo $CFLAGS | grep -Eq '(-march|-mmsa|-mno-msa)' ; then
-        cc_check '' '-mmsa -mfp64 -mhard-float' && CFLAGS="-mmsa -mfp64 -mhard-float $CFLAGS"
-    fi
-
-    if cc_check '' '' '__asm__("addvi.b $w0, $w1, 1");' ; then
-        define HAVE_MSA
-    else
-        echo "You specified a pre-MSA CPU in your CFLAGS."
-        echo "If you really want to run on such a CPU, configure with --disable-asm."
-        exit 1
-    fi
-fi
-
-[ $asm = no ] && AS=""
-[ "x$AS" = x ] && asm="no" || asm="yes"
-
-define ARCH_$ARCH
-define SYS_$SYS
-
-define STACK_ALIGNMENT $stack_alignment
-ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment"
-
-# skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. each have flags that will cause the check to fail as well
-CPU_ENDIAN="little-endian"
-if [ $compiler = GNU ]; then
-    echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
-    $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed"
-    if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then
-        define WORDS_BIGENDIAN
-        CPU_ENDIAN="big-endian"
-    elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then
-        die "endian test failed"
-    fi
-fi
-
-if [ "$cli_libx264" = "system" -a "$shared" != "yes" ] ; then
-    [ "$static" = "yes" ] && die "Option --system-libx264 can not be used together with --enable-static"
-    if $PKGCONFIG --exists x264 2>/dev/null; then
-        X264_LIBS="$($PKGCONFIG --libs x264)"
-        X264_INCLUDE_DIR="${X264_INCLUDE_DIR-$($PKGCONFIG --variable=includedir x264)}"
-        configure_system_override "$X264_INCLUDE_DIR" || die "Detection of system libx264 configuration failed"
-    else
-        die "Can not find system libx264"
-    fi
-fi
-
-# autodetect options that weren't forced nor disabled
-
-libpthread=""
-if [ "$SYS" = "WINDOWS" -a "$thread" = "posix" ] ; then
-    if [ "$gpl" = "no" ] ; then
-        echo "Warning: pthread-win32 is LGPL and is therefore not supported with --disable-gpl"
-        thread="no"
-    elif cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then
-        libpthread="-lpthread"
-    elif cc_check pthread.h -lpthreadGC2 "pthread_create(0,0,0,0);" ; then
-        libpthread="-lpthreadGC2"
-    elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
-        libpthread="-lpthreadGC2 -lwsock32"
-        define PTW32_STATIC_LIB
-    elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
-        libpthread="-lpthreadGC2 -lws2_32"
-        define PTW32_STATIC_LIB
-    else
-        thread="no"
-    fi
-elif [ "$thread" != "no" ] ; then
-    thread="no"
-    case $SYS in
-        BEOS)
-            thread="beos"
-            define HAVE_BEOSTHREAD
-            ;;
-        WINDOWS)
-            thread="win32"
-            define HAVE_WIN32THREAD
-            ;;
-        QNX)
-            cc_check pthread.h -lc "pthread_create(0,0,0,0);" && thread="posix" && libpthread="-lc"
-            ;;
-        *)
-            if cc_check pthread.h -lc "pthread_create(0,0,0,0);" ; then
-               thread="posix"
-               libpthread="-lpthread"
-            else
-                cc_check pthread.h "" "pthread_create(0,0,0,0);" && thread="posix" && libpthread=""
-            fi
-            ;;
-    esac
-fi
-if [ "$thread" = "posix" ]; then
-    LDFLAGS="$LDFLAGS $libpthread"
-    define HAVE_POSIXTHREAD
-    if [ "$SYS" = "LINUX" ] && cc_check sched.h "-D_GNU_SOURCE -Werror" "cpu_set_t p_aff; return CPU_COUNT(&p_aff);" ; then
-        define HAVE_CPU_COUNT
-    fi
-fi
-[ "$thread" != "no" ] && define HAVE_THREAD
-
-if cc_check "math.h" "-Werror" "return log2f(2);" ; then
-    define HAVE_LOG2F
-fi
-
-if [ "$SYS" != "WINDOWS" ] && cpp_check "sys/mman.h unistd.h" "" "defined(MAP_PRIVATE)"; then
-    define HAVE_MMAP
-fi
-
-if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then
-    define HAVE_THP
-fi
-
-if [ "$cli" = "no" ] ; then
-    avs="no"
-    lavf="no"
-    ffms="no"
-    gpac="no"
-    lsmash="no"
-    mp4="no"
-    swscale="no"
-fi
-
-if [ "$swscale" = "auto" ] ; then
-    swscale="no"
-    if $PKGCONFIG --exists libswscale 2>/dev/null; then
-        SWSCALE_LIBS="$SWSCALE_LIBS $($PKGCONFIG --libs libswscale libavutil)"
-        SWSCALE_CFLAGS="$SWSCALE_CFLAGS $($PKGCONFIG --cflags libswscale libavutil)"
-    fi
-    [ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil"
-
-    if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then
-        if cpp_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "defined(AV_PIX_FMT_FLAG_RGB)" ; then
-            swscale="yes"
-        else
-            echo "Warning: AV_PIX_FMT_FLAG_RGB is missing from libavutil, update for swscale support"
-        fi
-    fi
-fi
-
-if [ "$lavf" = "auto" ] ; then
-    lavf="no"
-    if $PKGCONFIG --exists libavformat libavcodec libswscale 2>/dev/null; then
-        LAVF_LIBS="$LAVF_LIBS $($PKGCONFIG --libs libavformat libavcodec libavutil libswscale)"
-        LAVF_CFLAGS="$LAVF_CFLAGS $($PKGCONFIG --cflags libavformat libavcodec libavutil libswscale)"
-    fi
-    if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then
-        LAVF_LIBS="-lavformat"
-        for lib in -lpostproc -lavcodec -lswscale -lavutil -lm -lz -lbz2 $libpthread -lavifil32 -lws2_32; do
-            cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib"
-        done
-    fi
-    LAVF_LIBS="-L. $LAVF_LIBS"
-    if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "av_frame_free(0);" ; then
-        if [ "$swscale" = "yes" ]; then
-            lavf="yes"
-        else
-            echo "Warning: libavformat is not supported without swscale support"
-        fi
-    fi
-fi
-
-if [ "$ffms" = "auto" ] ; then
-    ffms_major="2"; ffms_minor="21"; ffms_micro="0"; ffms_bump="0"
-    ffms="no"
-
-    if $PKGCONFIG --exists ffms2 2>/dev/null; then
-        FFMS2_LIBS="$FFMS2_LIBS $($PKGCONFIG --libs ffms2)"
-        FFMS2_CFLAGS="$FFMS2_CFLAGS $($PKGCONFIG --cflags ffms2)"
-    fi
-    [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2"
-
-    if cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS" "FFMS_DestroyVideoSource(0);" ; then
-        ffms="yes"
-    elif cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS -lstdc++ $LAVF_LIBS" "FFMS_DestroyVideoSource(0);" ; then
-        ffms="yes"
-        FFMS2_LIBS="$FFMS2_LIBS -lstdc++ $LAVF_LIBS"
-    fi
-
-    error="ffms must be at least version $ffms_major.$ffms_minor.$ffms_micro.$ffms_bump"
-    if [ $ffms = "yes" ] && ! cpp_check "ffms.h" "$FFMS2_CFLAGS" "FFMS_VERSION >= (($ffms_major << 24) | ($ffms_minor << 16) | ($ffms_micro << 8) | $ffms_bump)" "$error"; then
-       ffms="no"
-       echo "Warning: $error"
-    fi
-    if [ "$ffms" = "yes" -a "$swscale" = "no" ]; then
-        echo "Warning: ffms is not supported without swscale support"
-        ffms="no"
-    fi
-fi
-
-if [ "$swscale" = "yes" ]; then
-    LDFLAGSCLI="$SWSCALE_LIBS $LDFLAGSCLI"
-    CFLAGS="$CFLAGS $SWSCALE_CFLAGS"
-    define HAVE_SWSCALE
-    if [ "$lavf" = "yes" ]; then
-        LDFLAGSCLI="$LAVF_LIBS $LDFLAGSCLI"
-        CFLAGS="$CFLAGS $LAVF_CFLAGS"
-        define HAVE_LAVF
-    fi
-    if [ "$ffms" = "yes" ]; then
-        LDFLAGSCLI="$FFMS2_LIBS $LDFLAGSCLI"
-        CFLAGS="$CFLAGS $FFMS2_CFLAGS"
-        define HAVE_FFMS
-    fi
-fi
-
-if [ "$lsmash" = "auto" ] ; then
-    lsmash="no"
-    if $PKGCONFIG --exists liblsmash 2>/dev/null; then
-        LSMASH_LIBS="$LSMASH_LIBS $($PKGCONFIG --libs liblsmash)"
-        LSMASH_CFLAGS="$LSMASH_CFLAGS $($PKGCONFIG --cflags liblsmash)"
-    fi
-    [ -z "$LSMASH_LIBS" ] && LSMASH_LIBS="-llsmash"
-
-    if cc_check lsmash.h "$LSMASH_CFLAGS $LSMASH_LIBS" ; then
-        if cpp_check lsmash.h "$LSMASH_CFLAGS" "LSMASH_VERSION_MAJOR > 1 || (LSMASH_VERSION_MAJOR == 1 && LSMASH_VERSION_MINOR >= 5)" ; then
-            lsmash="yes"
-        else
-            echo "Warning: lsmash is too old, update to rev.895 or later"
-        fi
-    fi
-fi
-
-if [ "$gpac" = "auto" -a "$lsmash" != "yes" ] ; then
-    gpac="no"
-    GPAC_LIBS="-lgpac_static"
-    cc_check "" -lz && GPAC_LIBS="$GPAC_LIBS -lz"
-    if [ "$SYS" = "WINDOWS" ] ; then
-        cc_check "" -lws2_32 && GPAC_LIBS="$GPAC_LIBS -lws2_32"
-        cc_check "" -lwinmm && GPAC_LIBS="$GPAC_LIBS -lwinmm"
-    fi
-    if cc_check gpac/isomedia.h "$GPAC_LIBS" ; then
-        if cc_check gpac/isomedia.h "$GPAC_LIBS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0);" ; then
-            gpac="yes"
-        else
-            echo "Warning: gpac is too old, update to 2007-06-21 UTC or later"
-        fi
-    fi
-fi
-
-if [ "$lsmash" = "yes" ] ; then
-    mp4="lsmash"
-    LDFLAGSCLI="$LSMASH_LIBS $LDFLAGSCLI"
-    CFLAGS="$CFLAGS $LSMASH_CFLAGS"
-    define HAVE_LSMASH
-elif [ "$gpac" = "yes" ] ; then
-    mp4="gpac"
-    define HAVE_GPAC
-    LDFLAGSCLI="$GPAC_LIBS $LDFLAGSCLI"
-fi
-
-if [ "$avs" = "auto" ] ; then
-    avs="no"
-    # cygwin can use avisynth if it can use LoadLibrary
-    if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then
-        avs="avisynth"
-        define HAVE_AVS
-        define USE_AVXSYNTH 0
-    elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then
-    # AvxSynth currently only supports Linux and OSX
-        avs="avxsynth"
-        define HAVE_AVS
-        define USE_AVXSYNTH 1
-        AVS_LIBS="-ldl"
-        LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI"
-    fi
-fi
-
-cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT
-
-if [ "$pic" = "yes" ] ; then
-    [ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC"
-    ASFLAGS="$ASFLAGS -DPIC"
-    # resolve textrels in the x86 asm
-    cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic"
-    [ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text"
-fi
-
-if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then
-    CFLAGS="$CFLAGS -fomit-frame-pointer"
-fi
-
-if [ "$strip" = "yes" ]; then
-    LDFLAGS="$LDFLAGS -s"
-fi
-
-if [ "$debug" = "yes" ]; then
-    CFLAGS="-O1 -g $CFLAGS"
-    RCFLAGS="$RCFLAGS -DDEBUG"
-else
-    CFLAGS="-O3 -ffast-math $CFLAGS"
-    if [ "$lto" = "auto" ] && [ $compiler = GNU ] && cc_check "" "-flto" ; then
-        lto="yes"
-        CFLAGS="$CFLAGS -flto"
-        LDFLAGS="$LDFLAGS -O3 -flto"
-    fi
-fi
-[ "$lto" = "auto" ] && lto="no"
-
-if cc_check '' -fno-tree-vectorize ; then
-    CFLAGS="$CFLAGS -fno-tree-vectorize"
-fi
-
-if [ $SYS = WINDOWS -a $ARCH = X86 -a $compiler = GNU ] ; then
-    # workaround gcc/ld bug with alignment of static variables/arrays that are initialized to zero
-    cc_check '' -fno-zero-initialized-in-bss && CFLAGS="$CFLAGS -fno-zero-initialized-in-bss"
-fi
-
-if cc_check "stdio.h" "" "fseeko(stdin,0,0);" ; then
-    define fseek fseeko
-    define ftell ftello
-elif cc_check "stdio.h" "" "fseeko64(stdin,0,0);" ; then
-    define fseek fseeko64
-    define ftell ftello64
-elif cc_check "stdio.h" "" "_fseeki64(stdin,0,0);" ; then
-    define fseek _fseeki64
-    define ftell _ftelli64
-fi
-
-if cc_check '' -Wshadow ; then
-    CFLAGS="-Wshadow $CFLAGS"
-fi
-
-if cc_check '' -Wmaybe-uninitialized ; then
-    CFLAGS="-Wno-maybe-uninitialized $CFLAGS"
-fi
-
-if [ $compiler = ICC -o $compiler = ICL ] ; then
-    if cc_check 'extras/intel_dispatcher.h' '' 'x264_intel_dispatcher_override();' ; then
-        define HAVE_INTEL_DISPATCHER
-    fi
-fi
-
-if [ "$bit_depth" -gt "8" ]; then
-    define HIGH_BIT_DEPTH
-    ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1"
-    opencl="no"
-else
-    ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0"
-fi
-
-if [ "$chroma_format" != "all" ]; then
-    define CHROMA_FORMAT CHROMA_$chroma_format
-fi
-
-ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth"
-
-[ $gpl = yes ] && define HAVE_GPL && x264_gpl=1 || x264_gpl=0
-
-[ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0
-
-libdl=""
-if [ "$opencl" = "yes" ]; then
-    opencl="no"
-    # cygwin can use opencl if it can use LoadLibrary
-    if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then
-        opencl="yes"
-        define HAVE_OPENCL
-    elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then
-        opencl="yes"
-        define HAVE_OPENCL
-        libdl="-ldl"
-    fi
-    LDFLAGS="$LDFLAGS $libdl"
-fi
-
-#define undefined vars as 0
-for var in $CONFIG_HAVE; do
-    grep -q "HAVE_$var 1" config.h || define HAVE_$var 0
-done
-
-# generate exported config file
-
-config_chroma_format="X264_CSP_I$chroma_format"
-[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0"
-cat > x264_config.h << EOF
-#define X264_BIT_DEPTH     $bit_depth
-#define X264_GPL           $x264_gpl
-#define X264_INTERLACED    $x264_interlaced
-#define X264_CHROMA_FORMAT $config_chroma_format
-EOF
-
-${SRCPATH}/version.sh >> x264_config.h
-
-if [ "$cli_libx264" = "system" ] ; then
-    if [ "$shared" = "yes" ]; then
-        CLI_LIBX264='$(SONAME)'
-    else
-        CLI_LIBX264=
-        LDFLAGSCLI="$X264_LIBS $LDFLAGSCLI"
-        cc_check 'stdint.h x264.h' '' 'x264_encoder_open(0);' || die "System libx264 can't be used for compilation of this version"
-    fi
-else
-    CLI_LIBX264='$(LIBX264)'
-fi
-
-DEPMM="${QPRE}MM"
-DEPMT="${QPRE}MT"
-if [ $compiler_style = MS ]; then
-    AR="lib -nologo -out:"
-    LD="link -out:"
-    if [ $compiler = ICL ]; then
-        AR="xi$AR"
-        LD="xi$LD"
-    else
-        mslink="$(dirname "$(command -v cl 2>/dev/null)")/link"
-        [ -x "$mslink" ] && LD="\"$mslink\" -out:"
-    fi
-    HAVE_GETOPT_LONG=0
-    LDFLAGS="-nologo -incremental:no $(cl_ldflags $LDFLAGS)"
-    LDFLAGSCLI="$(cl_ldflags $LDFLAGSCLI)"
-    LIBX264=libx264.lib
-    RANLIB=
-    [ -n "$RC" ] && RCFLAGS="$RCFLAGS -nologo -I. -I\$(SRCPATH)/extras -fo"
-    STRIP=
-    if [ $debug = yes ]; then
-        LDFLAGS="-debug $LDFLAGS"
-        CFLAGS="-D_DEBUG $CFLAGS"
-    else
-        CFLAGS="-DNDEBUG $CFLAGS"
-    fi
-else # gcc/icc
-    DEPMM="$DEPMM -g0"
-    AR="$AR rc "
-    LD="$CC -o "
-    LIBX264=libx264.a
-    [ -n "$RC" ] && RCFLAGS="$RCFLAGS -I. -o "
-fi
-[ $compiler != GNU ] && CFLAGS="$(cc_cflags $CFLAGS)"
-if [ $compiler = ICC -o $compiler = ICL ]; then
-    # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP
-    [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__
-    PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir."
-    PROF_GEN_LD=
-    PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir."
-    PROF_USE_LD=
-elif [ $compiler = CL ]; then
-    # Visual Studio
-    # _M_IX86_FP is only defined on x86
-    [ $ARCH = X86 ] && cpp_check '' '' '_M_IX86_FP >= 1' && define __SSE__
-    [ $ARCH = X86_64 ] && define __SSE__
-    # As long as the cli application can't link against the dll, the dll can not be pgo'd.
-    # pgds are link flag specific and the -dll flag for creating the dll makes it unshareable with the cli
-    PROF_GEN_CC="-GL"
-    PROF_GEN_LD="-LTCG:PGINSTRUMENT"
-    PROF_USE_CC="-GL"
-    PROF_USE_LD="-LTCG:PGOPTIMIZE"
-else
-    PROF_GEN_CC="-fprofile-generate"
-    PROF_GEN_LD="-fprofile-generate"
-    PROF_USE_CC="-fprofile-use"
-    PROF_USE_LD="-fprofile-use"
-fi
-
-# generate config files
-
-cat > config.mak << EOF
-SRCPATH=$SRCPATH
-prefix=$prefix
-exec_prefix=$exec_prefix
-bindir=$bindir
-libdir=$libdir
-includedir=$includedir
-SYS_ARCH=$ARCH
-SYS=$SYS
-CC=$CC
-CFLAGS=$CFLAGS
-COMPILER=$compiler
-COMPILER_STYLE=$compiler_style
-DEPMM=$DEPMM
-DEPMT=$DEPMT
-LD=$LD
-LDFLAGS=$LDFLAGS
-LIBX264=$LIBX264
-AR=$AR
-RANLIB=$RANLIB
-STRIP=$STRIP
-INSTALL=$INSTALL
-AS=$AS
-ASFLAGS=$ASFLAGS
-RC=$RC
-RCFLAGS=$RCFLAGS
-EXE=$EXE
-HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG
-DEVNULL=$DEVNULL
-PROF_GEN_CC=$PROF_GEN_CC
-PROF_GEN_LD=$PROF_GEN_LD
-PROF_USE_CC=$PROF_USE_CC
-PROF_USE_LD=$PROF_USE_LD
-HAVE_OPENCL=$opencl
-EOF
-
-if [ $compiler_style = MS ]; then
-    echo '%.o: %.c' >> config.mak
-    echo '	$(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
-fi
-
-if [ "$cli" = "yes" ]; then
-    echo 'default: cli' >> config.mak
-    echo 'install: install-cli' >> config.mak
-fi
-
-if [ "$shared" = "yes" ]; then
-    API=$(grep '#define X264_BUILD' < ${SRCPATH}/x264.h | cut -f 3 -d ' ')
-    if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then
-        echo "SONAME=libx264-$API.dll" >> config.mak
-        if [ $compiler_style = MS ]; then
-            echo 'IMPLIBNAME=libx264.dll.lib' >> config.mak
-            # GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations
-            # MSVC link does not act similarly, so it is required to make an export definition out of x264.h and use it at link time
-            echo "SOFLAGS=-dll -def:x264.def -implib:\$(IMPLIBNAME) $SOFLAGS" >> config.mak
-            echo "EXPORTS" > x264.def
-            # export API functions
-            grep "^\(int\|void\|x264_t\).*x264" ${SRCPATH}/x264.h | sed -e "s/.*\(x264.*\)(.*/\1/;s/open/open_$API/g" >> x264.def
-            # export API variables/data. must be flagged with the DATA keyword
-            grep "extern.*x264" ${SRCPATH}/x264.h | sed -e "s/.*\(x264\w*\)\W.*/\1 DATA/;" >> x264.def
-        else
-            echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
-            echo "SOFLAGS=-shared -Wl,--out-implib,\$(IMPLIBNAME) $SOFLAGS" >> config.mak
-        fi
-    elif [ "$SYS" = "MACOSX" ]; then
-        echo "SOSUFFIX=dylib" >> config.mak
-        echo "SONAME=libx264.$API.dylib" >> config.mak
-        echo "SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name \$(DESTDIR)\$(libdir)/\$(SONAME) $SOFLAGS" >> config.mak
-    elif [ "$SYS" = "SunOS" ]; then
-        echo "SOSUFFIX=so" >> config.mak
-        echo "SONAME=libx264.so.$API" >> config.mak
-        echo "SOFLAGS=-shared -Wl,-h,\$(SONAME) $SOFLAGS" >> config.mak
-    else
-        echo "SOSUFFIX=so" >> config.mak
-        echo "SONAME=libx264.so.$API" >> config.mak
-        echo "SOFLAGS=-shared -Wl,-soname,\$(SONAME) $SOFLAGS" >> config.mak
-    fi
-    echo 'default: lib-shared' >> config.mak
-    echo 'install: install-lib-shared' >> config.mak
-fi
-
-if [ "$static" = "yes" ]; then
-    echo 'default: lib-static' >> config.mak
-    echo 'install: install-lib-static' >> config.mak
-fi
-
-echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak
-echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak
-
-cat > x264.pc << EOF
-prefix=$prefix
-exec_prefix=$exec_prefix
-libdir=$libdir
-includedir=$includedir
-
-Name: x264
-Description: H.264 (MPEG4 AVC) encoder library
-Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//')
-Libs: -L$libdir -lx264 $([ "$shared" = "yes" ] || echo $libpthread $libm $libdl)
-Libs.private: $([ "$shared" = "yes" ] && echo $libpthread $libm $libdl)
-Cflags: -I$includedir
-EOF
-
-filters="crop select_every"
-gpl_filters=""
-[ $swscale = yes ] && filters="resize $filters"
-[ $gpl = yes ] && filters="$filters $gpl_filters"
-
-cat > conftest.log <<EOF
-platform:      $ARCH
-byte order:    $CPU_ENDIAN
-system:        $SYS
-cli:           $cli
-libx264:       $cli_libx264
-shared:        $shared
-static:        $static
-asm:           $asm
-interlaced:    $interlaced
-avs:           $avs
-lavf:          $lavf
-ffms:          $ffms
-mp4:           $mp4
-gpl:           $gpl
-thread:        $thread
-opencl:        $opencl
-filters:       $filters
-lto:           $lto
-debug:         $debug
-gprof:         $gprof
-strip:         $strip
-PIC:           $pic
-bit depth:     $bit_depth
-chroma format: $chroma_format
-EOF
-
-echo >> config.log
-cat conftest.log >> config.log
-cat conftest.log
-
-[ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile
-mkdir -p common/{aarch64,arm,ppc,x86} encoder extras filters/video input output tools
-
-echo
-echo "You can run 'make' or 'make fprofiled' now."
-
diff --git a/android/src/main/libenc/jni/libx264/doc/ratecontrol.txt b/android/src/main/libenc/jni/libx264/doc/ratecontrol.txt
deleted file mode 100755
index e93ced2..0000000
--- a/android/src/main/libenc/jni/libx264/doc/ratecontrol.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-A qualitative overview of x264's ratecontrol methods
-By Loren Merritt
-
-Historical note:
-This document is outdated, but a significant part of it is still accurate.  Here are some important ways ratecontrol has changed since the authoring of this document:
-- By default, MB-tree is used instead of qcomp for weighting frame quality based on complexity.  MB-tree is effectively a generalization of qcomp to the macroblock level.  MB-tree also replaces the constant offsets for B-frame quantizers.  The legacy algorithm is still available for low-latency applications.
-- Adaptive quantization is now used to distribute quality among each frame; frames are no longer constant quantizer, even if MB-tree is off.
-- VBV runs per-row rather than per-frame to improve accuracy.
-
-x264's ratecontrol is based on libavcodec's, and is mostly empirical. But I can retroactively propose the following theoretical points which underlie most of the algorithms:
-
-- You want the movie to be somewhere approaching constant quality. However, constant quality does not mean constant PSNR nor constant QP. Details are less noticeable in high-complexity or high-motion scenes, so you can get away with somewhat higher QP for the same perceived quality.
-- On the other hand, you get more quality per bit if you spend those bits in scenes where motion compensation works well: A given artifact may stick around several seconds in a low-motion scene, and you only have to fix it in one frame to improve the quality of the whole scene.
-- Both of the above are correlated with the number of bits it takes to encode a frame at a given QP.
-- Given one encoding of a frame, we can predict the number of bits needed to encode it at a different QP. This prediction gets less accurate if the QPs are far apart.
-- The importance of a frame depends on the number of other frames that are predicted from it. Hence I-frames get reduced QP depending on the number and complexity of following inter-frames, disposable B-frames get higher QP than P-frames, and referenced B-frames are between P-frames and disposable B-frames.
-
-
-The modes:
-
-    2pass:
-Given some data about each frame of a 1st pass (e.g. generated by 1pass ABR, below), we try to choose QPs to maximize quality while matching a specified total size. This is separated into 3 parts:
-(1) Before starting the 2nd pass, select the relative number of bits to allocate between frames. This pays no attention to the total size of the encode. The default formula, empirically selected to balance between the 1st 2 theoretical points, is "complexity ** 0.6", where complexity is defined to be the bit size of the frame at a constant QP (estimated from the 1st pass).
-(2) Scale the results of (1) to fill the requested total size. Optional: Impose VBV limitations. Due to nonlinearities in the frame size predictor and in VBV, this is an iterative process.
-(3) Now start encoding. After each frame, update future QPs to compensate for mispredictions in size. If the 2nd pass is consistently off from the predicted size (usually because we use slower compression options than the 1st pass), then we multiply all future frames' qscales by the reciprocal of the error. Additionally, there is a short-term compensation to prevent us from deviating too far from the desired size near the beginning (when we don't have much data for the global compensation) and near the end (when global doesn't have time to react).
-
-    1pass, average bitrate:
-The goal is the same as in 2pass, but here we don't have the benefit of a previous encode, so all ratecontrol must be done during the encode.
-(1) This is the same as in 2pass, except that instead of estimating complexity from a previous encode, we run a fast motion estimation algo over a half-resolution version of the frame, and use the SATD residuals (these are also used in the decision between P- and B-frames). Also, we don't know the size or complexity of the following GOP, so I-frame bonus is based on the past.
-(2) We don't know the complexities of future frames, so we can only scale based on the past. The scaling factor is chosen to be the one that would have resulted in the desired bitrate if it had been applied to all frames so far.
-(3) Overflow compensation is the same as in 2pass. By tuning the strength of compensation, you can get anywhere from near the quality of 2pass (but unpredictable size, like +- 10%) to reasonably strict filesize but lower quality.
-
-    1pass, constant bitrate (VBV compliant):
-(1) Same as ABR.
-(2) Scaling factor is based on a local average (dependent on VBV buffer size) instead of all past frames.
-(3) Overflow compensation is stricter, and has an additional term to hard limit the QPs if the VBV is near empty. Note that no hard limit is done for a full VBV, so CBR may use somewhat less than the requested bitrate. Note also that if a frame violates VBV constraints despite the best efforts of prediction, it is not re-encoded.
-
-    1pass, constant ratefactor:
-(1) Same as ABR.
-(2) The scaling factor is a constant based on the --crf argument.
-(3) No overflow compensation is done.
-
-    constant quantizer:
-QPs are simply based on frame type.
diff --git a/android/src/main/libenc/jni/libx264/doc/regression_test.txt b/android/src/main/libenc/jni/libx264/doc/regression_test.txt
deleted file mode 100755
index 3e38a9e..0000000
--- a/android/src/main/libenc/jni/libx264/doc/regression_test.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Here is one test method which checks that the encoder's
-view of decoded pictures is the same as the decoder's view.
-This ensures that there is no distortion besides what is
-inherently caused by compression.
-
-# Install and compile x264 :
-git clone git://git.videolan.org/x264.git x264
-cd x264
-./configure
-make
-cd ..
-
-# Install and compile JM reference decoder :
-wget http://iphome.hhi.de/suehring/tml/download/jm17.2.zip
-unzip jm17.2.zip
-cd JM
-sh unixprep.sh
-cd ldecod
-make
-cd ../..
-
-./x264/x264 input.yuv --dump-yuv fdec.yuv -o output.h264
-./JM/bin/ldecod.exe -i output.h264 -o ref.yuv
-diff ref.yuv fdec.yuv
diff --git a/android/src/main/libenc/jni/libx264/doc/standards.txt b/android/src/main/libenc/jni/libx264/doc/standards.txt
deleted file mode 100755
index 7474d8f..0000000
--- a/android/src/main/libenc/jni/libx264/doc/standards.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-x264 is written in C. The particular variant of C is: intersection of C99 and gcc>=3.4.
-checkasm is written in gcc, with no attempt at compatibility with anything else.
-
-We make the following additional assumptions which are true of real systems but not guaranteed by C99:
-* Two's complement.
-* Signed right-shifts are sign-extended.
-* int is 32-bit or larger.
-
-x86-specific assumptions:
-* The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
-* We call emms before any float operation and before returning from libx264, not after each mmx operation. So bad things could happen if the compiler inserts float operations where they aren't expected.
diff --git a/android/src/main/libenc/jni/libx264/doc/threads.txt b/android/src/main/libenc/jni/libx264/doc/threads.txt
deleted file mode 100755
index cea1f65..0000000
--- a/android/src/main/libenc/jni/libx264/doc/threads.txt
+++ /dev/null
@@ -1,95 +0,0 @@
-Historical notes:
-Slice-based threading was the original threading model of x264.  It was replaced with frame-based threads in r607.  This document was originally written at that time.  Slice-based threading was brought back (as an optional mode) in r1364 for low-latency encoding.  Furthermore, frame-based threading was modified significantly in r1246, with the addition of threaded lookahead.
-
-Old threading method: slice-based
-application calls x264
-x264 runs B-adapt and ratecontrol (serial)
-split frame into several slices, and spawn a thread for each slice
-wait until all threads are done
-deblock and hpel filter (serial)
-return to application
-In x264cli, there is one additional thread to decode the input.
-
-New threading method: frame-based
-application calls x264
-x264 requests a frame from lookahead, which runs B-adapt and ratecontrol parallel to the current thread, separated by a buffer of size sync-lookahead
-spawn a thread for this frame
-thread runs encode, deblock, hpel filter
-meanwhile x264 waits for the oldest thread to finish
-return to application, but the rest of the threads continue running in the background
-No additional threads are needed to decode the input, unless decoding is slower than slice+deblock+hpel, in which case an additional input thread would allow decoding in parallel.
-
-Penalties for slice-based threading:
-Each slice adds some bitrate (or equivalently reduces quality), for a variety of reasons: the slice header costs some bits, cabac contexts are reset, mvs and intra samples can't be predicted across the slice boundary.
-In CBR mode, multiple slices encode simultaneously, thus increasing the maximum misprediction possible with VBV.
-Some parts of the encoder are serial, so it doesn't scale well with lots of cpus.
-
-Some numbers on penalties for slicing:
-Tested at 720p with 45 slices (one per mb row) to maximize the total cost for easy measurement. Averaged over 4 movies at crf20 and crf30. Total cost: +30% bitrate at constant psnr.
-I enabled the various components of slicing one at a time, and measured the portion of that cost they contribute:
-    * 34% intra prediction
-    * 25% redundant slice headers, nal headers, and rounding to whole bytes
-    * 16% mv prediction
-    * 16% reset cabac contexts
-    * 6% deblocking between slices (you don't strictly have to turn this off just for standard compliance, but you do if you want to use slices for decoder multithreading)
-    * 2% cabac neighbors (cbp, skip, etc)
-The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varying deblock strength).
-But none of the proportions should depend strongly on the number of slices: some are triggered per slice while some are triggered per macroblock-that's-on-the-edge-of-a-slice, but as long as there's no more than 1 slice per row, the relative frequency of those two conditions is determined solely by the image width.
-
-
-Penalties for frame-based threading:
-To allow encoding of multiple frames in parallel, we have to ensure that any given macroblock uses motion vectors only from pieces of the reference frames that have been encoded already. This is usually not noticeable, but can matter for very fast upward motion.
-We have to commit to one frame type before starting on the frame. Thus scenecut detection must run during the lowres pre-motion-estimation along with B-adapt, which makes it faster but less accurate than re-encoding the whole frame.
-Ratecontrol gets delayed feedback, since it has to plan frame N before frame N-1 finishes.
-
-Benchmarks:
-cpu: 8core Nehalem (2x E5520) 2.27GHz, hyperthreading disabled
-kernel: linux 2.6.34.7, 64-bit
-x264: r1732 b20059aa
-input: http://media.xiph.org/video/derf/y4m/1080p/park_joy_1080p.y4m
-
-NOTE: the "thread count" listed below does not count the lookahead thread, only encoding threads.  This is why for "veryfast", the speedup for 2 and 3 threads exceeds the logical limit.
-
-threads  speedup       psnr
-      slice frame   slice  frame
-x264 --preset veryfast --tune psnr --crf 30
- 1:   1.00x 1.00x  +0.000 +0.000
- 2:   1.41x 2.29x  -0.005 -0.002
- 3:   1.70x 3.65x  -0.035 +0.000
- 4:   1.96x 3.97x  -0.029 -0.001
- 5:   2.10x 3.98x  -0.047 -0.002
- 6:   2.29x 3.97x  -0.060 +0.001
- 7:   2.36x 3.98x  -0.057 -0.001
- 8:   2.43x 3.98x  -0.067 -0.001
- 9:         3.96x         +0.000
-10:         3.99x         +0.000
-11:         4.00x         +0.001
-12:         4.00x         +0.001
-
-x264 --preset medium --tune psnr --crf 30
- 1:   1.00x 1.00x  +0.000 +0.000
- 2:   1.54x 1.59x  -0.002 -0.003
- 3:   2.01x 2.81x  -0.005 +0.000
- 4:   2.51x 3.11x  -0.009 +0.000
- 5:   2.89x 4.20x  -0.012 -0.000
- 6:   3.27x 4.50x  -0.016 -0.000
- 7:   3.58x 5.45x  -0.019 -0.002
- 8:   3.79x 5.76x  -0.015 -0.002
- 9:         6.49x         -0.000
-10:         6.64x         -0.000
-11:         6.94x         +0.000
-12:         6.96x         +0.000
-
-x264 --preset slower --tune psnr --crf 30
- 1:   1.00x 1.00x  +0.000 +0.000
- 2:   1.54x 1.83x  +0.000 +0.002
- 3:   1.98x 2.21x  -0.006 +0.002
- 4:   2.50x 2.61x  -0.011 +0.002
- 5:   2.93x 3.94x  -0.018 +0.003
- 6:   3.45x 4.19x  -0.024 +0.001
- 7:   3.84x 4.52x  -0.028 -0.001
- 8:   4.13x 5.04x  -0.026 -0.001
- 9:         6.15x         +0.001
-10:         6.24x         +0.001
-11:         6.55x         -0.001
-12:         6.89x         -0.001
diff --git a/android/src/main/libenc/jni/libx264/doc/vui.txt b/android/src/main/libenc/jni/libx264/doc/vui.txt
deleted file mode 100755
index d6e6c94..0000000
--- a/android/src/main/libenc/jni/libx264/doc/vui.txt
+++ /dev/null
@@ -1,177 +0,0 @@
-Video Usability Information (VUI) Guide
-by Christian Heine ( sennindemokrit at gmx dot net )
-
-1. Sample Aspect Ratio
------------------------
-
-* What is it?
-    The Sample Aspect Ratio (SAR) (sometimes called Pixel Aspect Ratio or just
-    Pel Aspect Ratio) is defined as the ratio of the width of the sample to the
-    height of the sample. While pixels on a computer monitor generally are
-    "square" meaning that their SAR is 1:1, digitized video usually has rather
-    odd SARs. Playback of material with a particular SAR on a system with
-    a different SAR will result in a stretched/squashed image. A correction is
-    necessary that relies on the knowledge of both SARs.
-
-* How do I use it?
-    You can derive the SAR of an image from the width, height and the
-    display aspect ratio (DAR) of the image as follows:
-
-    SAR_x   DAR_x * height
-    ----- = --------------
-    SAR_y   DAR_y * width
-
-    for example:
-    width x height = 704x576, DAR = 4:3 ==> SAR = 2304:2112 or 12:11
-
-    Please note that if your material is a digitized analog signal, you should
-    not use this equation to calculate the SAR. Refer to the manual of your
-    digitizing equipment or this link instead.
-
-    A Quick Guide to Digital Video Resolution and Aspect Ratio Conversions
-    http://www.iki.fi/znark/video/conversion/
-
-* Should I use this option?
-    In one word: yes. Most decoders/ media players nowadays support automatic
-    correction of aspect ratios, and there are just few exceptions. You should
-    even use it, if the SAR of your material is 1:1, as the default of x264 is
-    "SAR not defined".
-
-2. Overscan
-------------
-
-* What is it?
-    The term overscan generally refers to all regions of an image that do
-    not contain information but are added to achieve a certain resolution or
-    aspect ratio. A "letterboxed" image therefore has overscan at the top and
-    the bottom. This is not the overscan this option refers to. Nor does it
-    refer to the overscan that is added as part of the process of digitizing an
-    analog signal. Instead it refers to the "overscan" process on a display
-    that shows only a part of the image. What that part is depends on the
-    display.
-
-* How do I use this option?
-    As I'm not sure about what part of the image is shown when the display uses
-    an overscan process, I can't provide you with rules or examples. The safe
-    assumption would be "overscan=show" as this always shows the whole image.
-    Use "overscan=crop" only if you are sure about the consequences. You may
-    also use the default value ("undefined").
-
-* Should I use this option?
-    Only if you know exactly what you are doing. Don't use it on video streams
-    that have general overscan. Instead try to crop the borders before
-    encoding and benefit from the higher bitrate/ image quality.
-
-    Furthermore the H264 specification says that the setting "overscan=show"
-    must be respected, but "overscan=crop" may be ignored. In fact most
-    playback equipment ignores this setting and shows the whole image.
-
-3. Video Format
-----------------
-
-* What is it?
-    A purely informative setting, that explains what the type of your analog
-    video was, before you digitized it.
-
-* How do I use this option?
-    Just set it to the desired value. ( e.g. NTSC, PAL )
-    If you transcode from MPEG2, you may find the value for this option in the
-    m2v bitstream. (see ITU-T Rec. H262 / ISO/IEC 13818-2 for details)
-
-* Should I use this option?
-    That is entirely up to you. I have no idea how this information would ever
-    be relevant. I consider it to be informative only.
-
-4. Full Range
---------------
-
-* What is it?
-    Another relic from digitizing analog video. When digitizing analog video
-    the digital representation of the luma and chroma levels is limited to lie
-    within 16..235 and 16..240 respectively. Playback equipment usually assumes
-    all digitized samples to be within this range. However most DVDs use the
-    full range of 0..255 for luma and chroma samples, possibly resulting in an
-    oversaturation when played back on that equipment. To avoid this a range
-    correction is needed.
-
-* How do I use this option?
-    If your source material is a digitized analog video/TV broadcast it is
-    quite possible that it is range limited. If you can make sure that it is
-    range limited you can safely set full range to off. If you are not sure
-    or want to make sure that your material is played back without
-    oversaturation, set it to on. Please note that the default for this option
-    in x264 is off, which is not a safe assumption.
-
-* Should I use this option?
-    Yes, but there are few decoders/ media players that distinguish
-    between the two options.
-
-5. Color Primaries, Transfer Characteristics, Matrix Coefficients
--------------------------------------------------------------------
-
-* What is it?
-    A videophile setting. The average user won't ever need it.
-    Not all monitor models show all colors the same way. When comparing the
-    same image on two different monitor models you might find that one of them
-    "looks more blue", while the other "looks more green". Bottom line is, each
-    monitor model has a different color profile, which can be used to correct
-    colors in a way, that images look almost the same on all monitors. The same
-    goes for printers and film/ video digitizing equipment. If the color
-    profile of the digitizing equipment is known, it is possible to correct the
-    colors and gamma of the decoded h264 stream in a way that the video stream
-    looks the same, regardless of the digitizing equipment used.
-
-* How do I use these options?
-    If you are able to find out which characteristics your digitizing equipment
-    uses, (see the equipment documentation or make reference measurements)
-    then find the most suitable characteristics in the list of available
-    characteristics (see H264 Annex E) and pass it to x264. Otherwise leave it
-    to the default (unspecified).
-    If you transcode from MPEG2, you may find the values for these options in
-    the m2v bitstream. (see ITU-T Rec. H262 / ISO/IEC 13818-2 for details)
-
-* Should I use these options?
-    Only if you know exactly what you are doing. The default setting is better
-    than a wrong one. Use of this option is not a bad idea though.
-    Unfortunately I don't know any decoder/ media player that ever even
-    attempted color/gamma/color matrix correction.
-
-6. Chroma Sample Location
---------------------------
-
-* What is it?
-    A videophile setting. The average user won't ever notice a difference.
-    Due to a weakness of the eye, it is often economic to reduce the number of
-    chroma samples in a process called subsampling. In particular x264 uses
-    only one chroma sample of each chroma channel every block of 2x2 luma
-    samples. There are a number of possibilities on how this subsampling is
-    done, each resulting in another relative location of the chroma sample
-    towards the luma samples. The Chroma Sample Location matters when the
-    subsampling process is reversed, e.g. the number of chroma samples is
-    increased. This is most likely to happen at color space conversions. If it
-    is not done correctly the chroma values may appear shifted compared to the
-    luma samples by at most 1 pixel, or strangely blurred.
-
-* How do I use this option?
-    Because x264 does no subsampling, since it only accepts already subsampled
-    input frames, you have to determine the method yourself.
-
-    If you transcode from MPEG1 with proper subsampled 4:2:0, and don't do any
-    color space conversion, you should set this option to 1.
-    If you transcode from MPEG2 with proper subsampled 4:2:0, and don't do any
-    color space conversion, you should set this option to 0.
-    If you transcode from MPEG4 with proper subsampled 4:2:0, and don't do any
-    color space conversion, you should set this option to 0.
-
-    If you do the color space conversion yourself this isn't that easy. If the
-    filter kernel of the subsampling is ( 0.5, 0.5 ) in one direction then the
-    chroma sample location in that direction is between the two luma samples.
-    If your filter kernel is ( 0.25, 0.5, 0.25 ) in one direction then the
-    chroma sample location in that direction is equal to one of the luma
-    samples. H264 Annex E contains images that tell you how to "transform" your
-    Chroma Sample Location into a value of 0 to 5 that you can pass to x264.
-
-* Should I use this option?
-    Unless you are a perfectionist, don't bother. Media players ignore this
-    setting, and favor their own (fixed) assumed Chroma Sample Location.
-
diff --git a/android/src/main/libenc/jni/libx264/encoder/analyse.c b/android/src/main/libenc/jni/libx264/encoder/analyse.c
deleted file mode 100755
index e158ad1..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/analyse.c
+++ /dev/null
@@ -1,4019 +0,0 @@
-/*****************************************************************************
- * analyse.c: macroblock analysis
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-#include "me.h"
-#include "ratecontrol.h"
-#include "analyse.h"
-#include "rdo.c"
-
-/* Inter-prediction search state for a single reference list.
- * For each partition shape it keeps a cost field plus one x264_me_t
- * motion-estimation context per partition.  x264_mb_analysis_t embeds
- * two of these, named l0 and l1. */
-typedef struct
-{
-    /* 16x16 */
-    int       i_rd16x16;
-    x264_me_t me16x16;
-    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
-
-    /* 8x8 */
-    int       i_cost8x8;
-    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
-    ALIGNED_4( int16_t mvc[32][5][2] );
-    x264_me_t me8x8[4];
-
-    /* Sub 4x4 */
-    int       i_cost4x4[4]; /* cost per 8x8 partition */
-    x264_me_t me4x4[4][4];
-
-    /* Sub 8x4 */
-    int       i_cost8x4[4]; /* cost per 8x8 partition */
-    x264_me_t me8x4[4][2];
-
-    /* Sub 4x8 */
-    int       i_cost4x8[4]; /* cost per 8x8 partition */
-    x264_me_t me4x8[4][2];
-
-    /* 16x8 */
-    int       i_cost16x8;
-    x264_me_t me16x8[2];
-
-    /* 8x16 */
-    int       i_cost8x16;
-    x264_me_t me8x16[2];
-
-} x264_mb_analysis_list_t;
-
-/* Per-macroblock analysis context: the lambda/QP in effect, pointers to the
- * matching cost tables, intra results (luma 16x16 / 8x8 / 4x4, PCM, chroma)
- * and inter results (one x264_mb_analysis_list_t per reference list plus the
- * bidirectional / direct costs). */
-typedef struct
-{
-    /* conduct the analysis using this lambda and QP */
-    int i_lambda;
-    int i_lambda2;
-    int i_qp;
-    uint16_t *p_cost_mv;        /* set from h->cost_mv[i_qp] by x264_mb_analyse_load_costs */
-    uint16_t *p_cost_ref[2];    /* one row of x264_cost_ref per reference list */
-    int i_mbrd;
-
-
-    /* I: Intra part */
-    /* Take some shortcuts in intra search if intra is deemed unlikely */
-    int b_fast_intra;
-    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
-    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
-    int b_try_skip;
-
-    /* Luma part */
-    int i_satd_i16x16;
-    int i_satd_i16x16_dir[7];
-    int i_predict16x16;
-
-    int i_satd_i8x8;
-    int i_cbp_i8x8_luma;
-    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
-    int i_predict8x8[4];
-
-    int i_satd_i4x4;
-    int i_predict4x4[16];
-
-    int i_satd_pcm;
-
-    /* Chroma part */
-    int i_satd_chroma;
-    int i_satd_chroma_dir[7];
-    int i_predict8x8chroma;
-
-    /* II: Inter part P/B frame */
-    x264_mb_analysis_list_t l0;
-    x264_mb_analysis_list_t l1;
-
-    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
-    int i_cost16x16direct;
-    int i_cost8x8bi;
-    int i_cost8x8direct[4];
-    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
-    int i_cost_est16x8[2]; /* Per-partition estimated cost */
-    int i_cost_est8x16[2];
-    int i_cost16x8bi;
-    int i_cost8x16bi;
-    int i_rd16x16bi;
-    int i_rd16x16direct;
-    int i_rd16x8bi;
-    int i_rd8x16bi;
-    int i_rd8x8bi;
-
-    int i_mb_partition16x8[2]; /* mb_partition_e */
-    int i_mb_partition8x16[2];
-    int i_mb_type16x8; /* mb_class_e */
-    int i_mb_type8x16;
-
-    int b_direct_available;
-    int b_early_terminate;
-
-} x264_mb_analysis_t;
-
-/* lambda = pow(2,qp/6-2), rounded to the nearest integer with a floor of 1.
- * Indexed directly by QP.  The last row continues the same progression
- * (doubling every 6 steps): 2^(80/6-2) ~= 2580 and 2^(81/6-2) ~= 2896. */
-const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
-{
-   1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
-   1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
-   2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
-   4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
-  10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
-  25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
-  64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
- 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
- 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
-1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
-2580,2896,                               /* 80-81 */
-};
-
-/* lambda2 = pow(lambda,2) * .9 * 256 */
-/* Capped to avoid overflow */
-/* Indexed by QP, same length as x264_lambda_tab.  From QP 70 upward every
- * entry sits at the cap of 134217727 (2^27 - 1). */
-const int x264_lambda2_tab[QP_MAX_MAX+1] =
-{
-       14,       18,       22,       28,       36,       45,      57,      72, /*  0- 7 */
-       91,      115,      145,      182,      230,      290,     365,     460, /*  8-15 */
-      580,      731,      921,     1161,     1462,     1843,    2322,    2925, /* 16-23 */
-     3686,     4644,     5851,     7372,     9289,    11703,   14745,   18578, /* 24-31 */
-    23407,    29491,    37156,    46814,    58982,    74313,   93628,  117964, /* 32-39 */
-   148626,   187257,   235929,   297252,   374514,   471859,  594505,  749029, /* 40-47 */
-   943718,  1189010,  1498059,  1887436,  2378021,  2996119, 3774873, 4756042, /* 48-55 */
-  5992238,  7549747,  9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
- 38048341, 47937906, 60397977, 76096683, 95875813,120795955,                   /* 64-69 */
-134217727,134217727,134217727,134217727,134217727,134217727,                   /* 70-75 */
-134217727,134217727,134217727,134217727,134217727,134217727,                   /* 76-81 */
-};
-
-/* 64-entry, 8-bit lookup table.  The entries agree with
- * round((2^(i/64) - 1) * 256), i.e. the fractional growth of 2^x in 1/64
- * steps scaled to 8 bits.  NOTE(review): spot-checked at a few indices only;
- * confirm against the code that consumes this table. */
-const uint8_t x264_exp2_lut[64] =
-{
-      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
-     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
-    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
-    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
-};
-
-/* 128-entry lookup table.  Entry i agrees with log2(1 + i/128) to five
- * decimals.  NOTE(review): spot-checked at a few indices only; confirm
- * against the code that consumes this table. */
-const float x264_log2_lut[128] =
-{
-    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
-    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
-    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
-    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
-    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
-    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
-    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
-    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
-    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
-    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
-    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
-    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
-    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
-    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
-    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
-    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
-};
-
-/* Avoid an int/float conversion. */
-/* Entry i holds the value 31-i, already stored as a float. */
-const float x264_log2_lz_lut[32] =
-{
-    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-};
-
-// should the intra and inter lambdas be different?
-// I'm just matching the behaviour of deadzone quant.
-// Row 0 is the inter table, row 1 the intra table; both are indexed by QP and
-// saturate at 134217727 (2^27 - 1).  Each row must carry QP_MAX_MAX+1
-// initializers like the other per-QP tables in this file: any entry left out
-// would be zero-initialized, i.e. a zero lambda for the highest QPs.
-static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
-{
-    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {
-               46,       58,       73,       92,      117,      147,
-              185,      233,      294,      370,      466,      587,
-              740,      932,     1174,     1480,     1864,     2349,
-             2959,     3728,     4697,     5918,     7457,     9395,
-            11837,    14914,    18790,    23674,    29828,    37581,
-            47349,    59656,    75163,    94699,   119313,   150326,
-           189399,   238627,   300652,   378798,   477255,   601304,
-           757596,   954511,  1202608,  1515192,  1909022,  2405217,
-          3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
-         12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
-         48486154, 61088726, 76966972, 96972308,
-        122177453,134217727,134217727,134217727,134217727,134217727,
-        134217727,134217727,134217727,134217727,134217727,134217727,
-        134217727,134217727,134217727,134217727,134217727,134217727,
-    },
-    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {
-               27,       34,       43,       54,       68,       86,
-              108,      136,      172,      216,      273,      343,
-              433,      545,      687,      865,     1090,     1374,
-             1731,     2180,     2747,     3461,     4361,     5494,
-             6922,     8721,    10988,    13844,    17442,    21976,
-            27688,    34885,    43953,    55377,    69771,    87906,
-           110755,   139543,   175813,   221511,   279087,   351627,
-           443023,   558174,   703255,   886046,  1116348,  1406511,
-          1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
-          7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
-         28353495, 35723165, 45008368, 56706990,
-         71446330, 90016736,113413980,134217727,134217727,134217727,
-        134217727,134217727,134217727,134217727,134217727,134217727,
-        134217727,134217727,134217727,134217727,134217727,134217727,
-    }
-};
-
-/* Largest valid index into x264_chroma_lambda2_offset_tab. */
-#define MAX_CHROMA_LAMBDA_OFFSET 36
-/* Chroma lambda2 scaling factors.  Entry 12 is 256, the same value
- * x264_mb_analyse_init_qp stores when psy is off, and the entries roughly
- * double every three steps.  The final entry is 65535, the largest value a
- * uint16_t can hold. */
-static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
-{
-       16,    20,    25,    32,    40,    50,
-       64,    80,   101,   128,   161,   203,
-      256,   322,   406,   512,   645,   812,
-     1024,  1290,  1625,  2048,  2580,  3250,
-     4096,  5160,  6501,  8192, 10321, 13003,
-    16384, 20642, 26007, 32768, 41285, 52015,
-    65535
-};
-
-/* TODO: calculate CABAC costs */
-/* Fixed cost tables for macroblock and sub-macroblock types.
- * NOTE(review): going by the names these are costs for B-frame macroblock
- * types, B 16x8-style partitions, and B / P sub-partitions; the index
- * meaning is defined by the code that reads them -- confirm there. */
-static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
-{
-    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
-};
-static const uint8_t i_mb_b16x8_cost_table[17] =
-{
-    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
-};
-static const uint8_t i_sub_mb_b_cost_table[13] =
-{
-    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
-};
-static const uint8_t i_sub_mb_p_cost_table[4] =
-{
-    5, 3, 3, 1
-};
-
-static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
-
-/* Reference-index costs, filled by init_costs() as [qp][i][j]; row i == 0 is all zero. */
-static uint16_t x264_cost_ref[QP_MAX+1][3][33];
-/* Held by init_costs() while it writes x264_cost_ref. */
-static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
-/* Intra 4x4 mode costs.  init_costs() rounds the base address up to a 64-byte
- * boundary and then uses 32 entries per QP, of which it writes the first 17. */
-static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
-
-/* Build the cost tables for one QP.
- *   - h->cost_mv[qp][+-d] = min( lambda*logs[d] + .5, 65535 )
- *   - x264_cost_ref[qp][i][j] = min( lambda*bs_size_te(i,j), 65535 ), row 0 zero
- *   - h->cost_mv_fpel[qp][0..3], only when the ME method is ESA or above
- *   - the intra 4x4 mode costs for this QP
- * Does nothing if h->cost_mv[qp] is already allocated.
- * Returns 0 on success and -1 when an allocation fails. */
-static int init_costs( x264_t *h, float *logs, int qp )
-{
-    const int mv_span   = 2*4*2048; /* largest |index| stored in cost_mv */
-    const int fpel_span = 2*2048;   /* largest |index| stored in cost_mv_fpel */
-    int lambda = x264_lambda_tab[qp];
-
-    if( h->cost_mv[qp] )
-        return 0;
-
-    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
-    CHECKED_MALLOC( h->cost_mv[qp], (2*mv_span + 1) * sizeof(uint16_t) );
-    /* Re-base the pointer to the middle so negative indices are valid. */
-    h->cost_mv[qp] += mv_span;
-    for( int d = 0; d <= mv_span; d++ )
-    {
-        uint16_t cost = X264_MIN( lambda * logs[d] + .5f, (1<<16)-1 );
-        h->cost_mv[qp][d]  = cost;
-        h->cost_mv[qp][-d] = cost;
-    }
-
-    x264_pthread_mutex_lock( &cost_ref_mutex );
-    for( int j = 0; j < 33; j++ )
-        x264_cost_ref[qp][0][j] = 0;
-    for( int i = 1; i < 3; i++ )
-        for( int j = 0; j < 33; j++ )
-            x264_cost_ref[qp][i][j] = X264_MIN( lambda * bs_size_te( i, j ), (1<<16)-1 );
-    x264_pthread_mutex_unlock( &cost_ref_mutex );
-
-    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
-        for( int phase = 0; phase < 4; phase++ )
-        {
-            CHECKED_MALLOC( h->cost_mv_fpel[qp][phase], (2*fpel_span + 1) * sizeof(uint16_t) );
-            h->cost_mv_fpel[qp][phase] += fpel_span;
-            /* Every fourth cost_mv entry, starting at offset `phase`. */
-            for( int x = -fpel_span; x < fpel_span; x++ )
-                h->cost_mv_fpel[qp][phase][x] = h->cost_mv[qp][x*4+phase];
-        }
-
-    uint16_t *i4x4_costs = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
-    for( int mode = 0; mode < 17; mode++ )
-        i4x4_costs[mode] = mode == 8 ? 0 : 3*lambda;
-    return 0;
-fail:
-    return -1;
-}
-
-/* Build the cost tables for every QP from min(i_qp_min, QP_MAX_SPEC) through
- * i_qp_max, and additionally for X264_LOOKAHEAD_QP.
- * Returns 0 on success, -1 if any allocation fails.  The temporary table of
- * logarithms is released on every path past its allocation. */
-int x264_analyse_init_costs( x264_t *h )
-{
-    const int n_logs = 2*4*2048 + 1;
-    int ret = -1;
-    float *logs = x264_malloc( n_logs * sizeof(float) );
-    if( !logs )
-        return -1;
-
-    logs[0] = 0.718f;
-    for( int i = 1; i < n_logs; i++ )
-        logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
-
-    int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC );
-    while( qp <= h->param.rc.i_qp_max )
-    {
-        if( init_costs( h, logs, qp ) )
-            goto done;
-        qp++;
-    }
-
-    if( !init_costs( h, logs, X264_LOOKAHEAD_QP ) )
-        ret = 0;
-
-done:
-    x264_free( logs );
-    return ret;
-}
-
-/* Release every motion-vector cost table owned by h.
- * init_costs() advances each stored pointer into the middle of its
- * allocation, so the same offset is subtracted again before freeing. */
-void x264_analyse_free_costs( x264_t *h )
-{
-    for( int i = 0; i < QP_MAX+1; i++ )
-    {
-        if( h->cost_mv[i] )
-            x264_free( h->cost_mv[i] - 2*4*2048 );
-        /* Test each slot on its own: if init_costs() failed part-way through
-         * its four allocations, slot 0 is set while a later slot is still
-         * NULL, and NULL minus the offset must never reach x264_free().
-         * (Like the cost_mv check above, this relies on never-allocated
-         * slots being NULL.) */
-        for( int j = 0; j < 4; j++ )
-            if( h->cost_mv_fpel[i][j] )
-                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
-    }
-}
-
-/* Extend the weighted copies of the list-0 references down to row `end`.
- * The first reference that has a weight function supplies the source plane
- * and geometry; every weighted reference from that one onward gets the newly
- * covered rows scaled into h->fenc->weighted[k].  h->fenc->i_lines_weighted
- * records how many rows have been produced so far. */
-void x264_analyse_weight_frame( x264_t *h, int end )
-{
-    /* Locate the first list-0 reference with a weight function. */
-    int first = 0;
-    while( first < h->i_ref[0] && !h->sh.weight[first][0].weightfn )
-        first++;
-    if( first >= h->i_ref[0] )
-        return;
-
-    x264_frame_t *ref = h->fref[0][first];
-    const int stride = ref->i_stride[0];
-    const int padv = PADV << PARAM_INTERLACED;
-    const int width = ref->i_width[0] + 2*PADH;
-    const int lines_done = h->fenc->i_lines_weighted;
-    const int height = X264_MIN( 16 + end + padv, ref->i_lines[0] + padv*2 ) - lines_done;
-    const int offset = lines_done * stride;
-    pixel *src = ref->filtered[0][0] - stride*padv - PADH;
-
-    h->fenc->i_lines_weighted += height;
-    if( !height )
-        return;
-
-    for( int k = first; k < h->i_ref[0]; k++ )
-    {
-        if( !h->sh.weight[k][0].weightfn )
-            continue;
-        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*padv - PADH;
-        x264_weight_scale_plane( h, dst + offset, stride,
-                                 src + offset, stride,
-                                 width, height, &h->sh.weight[k][0] );
-    }
-}
-
-/* Point the analysis context at the cost tables matching its QP: the mv cost
- * table, and for each reference list the x264_cost_ref row selected by
- * (active reference count - 1) clipped to [0,2]. */
-static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
-{
-    const int qp = a->i_qp;
-    const int row_l0 = x264_clip3( h->sh.i_num_ref_idx_l0_active-1, 0, 2 );
-    const int row_l1 = x264_clip3( h->sh.i_num_ref_idx_l1_active-1, 0, 2 );
-
-    a->p_cost_mv     = h->cost_mv[qp];
-    a->p_cost_ref[0] = x264_cost_ref[qp][row_l0];
-    a->p_cost_ref[1] = x264_cost_ref[qp][row_l1];
-}
-
-/* Derive everything that depends on the macroblock QP: the lambdas, the
- * trellis lambdas, the chroma lambda offset, the noise-reduction buffers, and
- * finally the luma/chroma QP stored in a and h->mb.
- * qp may exceed QP_MAX_SPEC.  In that case it still selects the lambdas and
- * the emergency noise-reduction tables, but the QP that gets stored is
- * clamped to QP_MAX_SPEC. */
-static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
-{
-    const int b_emergency = qp > QP_MAX_SPEC;
-    const int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
-
-    a->i_lambda  = x264_lambda_tab[qp];
-    a->i_lambda2 = x264_lambda2_tab[qp];
-    h->mb.i_psy_rd_lambda = a->i_lambda;
-
-    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
-    if( h->param.analyse.i_trellis )
-    {
-        /* First index: 0 uses qp, 1 uses the effective chroma qp.
-         * Second index: 0 reads the inter row, 1 the intra row. */
-        const int trellis_qp[2] = { qp, effective_chroma_qp };
-        for( int plane = 0; plane < 2; plane++ )
-            for( int b_intra = 0; b_intra < 2; b_intra++ )
-                h->mb.i_trellis_lambda2[plane][b_intra] = x264_trellis_lambda2_tab[b_intra][trellis_qp[plane]];
-    }
-
-    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
-    if( h->param.analyse.b_psy )
-    {
-        int idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
-        h->mb.i_chroma_lambda2_offset = x264_chroma_lambda2_offset_tab[idx];
-    }
-    else
-        h->mb.i_chroma_lambda2_offset = 256;
-
-    /* Buffer set 1 together with the emergency offsets serves out-of-spec
-     * QPs; buffer set 0 with the denoise offsets serves everything else. */
-    if( b_emergency )
-        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
-    else
-        h->nr_offset = h->nr_offset_denoise;
-    h->nr_residual_sum = h->nr_residual_sum_buf[b_emergency];
-    h->nr_count = h->nr_count_buf[b_emergency];
-    h->mb.b_noise_reduction = b_emergency;
-    if( b_emergency )
-        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
-
-    a->i_qp = h->mb.i_qp = qp;
-    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
-}
-
-static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
-{
-    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
-
-    /* mbrd == 1 -> RD mode decision */
-    /* mbrd == 2 -> RD refinement */
-    /* mbrd == 3 -> QPRD */
-    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
-    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
-    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;
-
-    x264_mb_analyse_init_qp( h, a, qp );
-
-    h->mb.b_transform_8x8 = 0;
-
-    /* I: Intra part */
-    a->i_satd_i16x16 =
-    a->i_satd_i8x8   =
-    a->i_satd_i4x4   =
-    a->i_satd_chroma = COST_MAX;
-
-    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
-     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
-    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
-    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;
-
-    a->b_fast_intra = 0;
-    a->b_avoid_topright = 0;
-    h->mb.i_skip_intra =
-        h->mb.b_lossless ? 0 :
-        a->i_mbrd ? 2 :
-        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
-
-    /* II: Inter part P/B frame */
-    if( h->sh.i_type != SLICE_TYPE_I )
-    {
-        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
-        // limit motion search to a slightly smaller range than the theoretical limit,
-        // since the search may go a few iterations past its given range
-        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
-
-        /* Calculate max allowed MV range */
-#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
-        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
-        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
-        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
-        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
-        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
-        {
-            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
-            int max_mv = max_x - 4*16*h->mb.i_mb_x;
-            /* If we're left of the refresh bar, don't reference right of it. */
-            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
-                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
-        }
-        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
-        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
-        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
-        {
-            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
-            int thread_mvy_range = i_fmv_range;
-
-            if( h->i_thread_frames > 1 )
-            {
-                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
-                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
-                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
-                    for( int j = 0; j < h->i_ref[i]; j++ )
-                    {
-                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
-                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
-                    }
-
-                if( h->param.b_deterministic )
-                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
-                if( PARAM_INTERLACED )
-                    thread_mvy_range >>= 1;
-
-                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
-            }
-
-            if( PARAM_INTERLACED )
-            {
-                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
-                for( int i = 0; i < 3; i++ )
-                {
-                    int j = i == 2;
-                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
-                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
-                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
-                    h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
-                    h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
-                    h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
-                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
-                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
-                }
-            }
-            else
-            {
-                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
-                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
-                h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
-                h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
-                h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
-                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
-                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
-            }
-        }
-        if( PARAM_INTERLACED )
-        {
-            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
-            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
-            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
-            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
-            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
-            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
-            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
-        }
-#undef CLIP_FMV
-
-        a->l0.me16x16.cost =
-        a->l0.i_rd16x16    =
-        a->l0.i_cost8x8    =
-        a->l0.i_cost16x8   =
-        a->l0.i_cost8x16   = COST_MAX;
-        if( h->sh.i_type == SLICE_TYPE_B )
-        {
-            a->l1.me16x16.cost =
-            a->l1.i_rd16x16    =
-            a->l1.i_cost8x8    =
-            a->i_cost8x8direct[0] =
-            a->i_cost8x8direct[1] =
-            a->i_cost8x8direct[2] =
-            a->i_cost8x8direct[3] =
-            a->l1.i_cost16x8   =
-            a->l1.i_cost8x16   =
-            a->i_rd16x16bi     =
-            a->i_rd16x16direct =
-            a->i_rd8x8bi       =
-            a->i_rd16x8bi      =
-            a->i_rd8x16bi      =
-            a->i_cost16x16bi   =
-            a->i_cost16x16direct =
-            a->i_cost8x8bi     =
-            a->i_cost16x8bi    =
-            a->i_cost8x16bi    = COST_MAX;
-        }
-        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
-            for( int i = 0; i < 4; i++ )
-            {
-                a->l0.i_cost4x4[i] =
-                a->l0.i_cost8x4[i] =
-                a->l0.i_cost4x8[i] = COST_MAX;
-            }
-
-        /* Fast intra decision */
-        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
-        {
-            /* Always run in fast-intra mode for subme < 3 */
-            if( h->mb.i_subpel_refine > 2 &&
-              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
-                IS_INTRA( h->mb.i_mb_type_top ) ||
-                IS_INTRA( h->mb.i_mb_type_topleft ) ||
-                IS_INTRA( h->mb.i_mb_type_topright ) ||
-                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
-                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
-            { /* intra is likely */ }
-            else
-            {
-                a->b_fast_intra = 1;
-            }
-        }
-        h->mb.b_skip_mc = 0;
-        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
-            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
-        {
-            a->b_force_intra = 1;
-            a->b_fast_intra = 0;
-            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
-        }
-        else
-            a->b_force_intra = 0;
-    }
-}
-
-/* Prediction modes allowed for various combinations of neighbors. */
-/* Terminated by a -1. */
-/* In order, no neighbors, left, top, top/left, top/left/topleft */
-static const int8_t i16x16_mode_available[5][5] =
-{
-    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
-    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
-    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
-    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
-    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
-};
-
-static const int8_t chroma_mode_available[5][5] =
-{
-    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
-    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
-    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
-    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
-    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
-};
-
-static const int8_t i8x8_mode_available[2][5][10] =
-{
-    {
-        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
-        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
-    },
-    {
-        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
-        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
-    }
-};
-
-static const int8_t i4x4_mode_available[2][5][10] =
-{
-    {
-        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
-        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
-    },
-    {
-        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
-        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
-    }
-};
-
-static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
-{
-    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
-    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
-    return i16x16_mode_available[idx];
-}
-
-static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
-{
-    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
-    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
-    return chroma_mode_available[idx];
-}
-
-static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
-{
-    int avoid_topright = force_intra && (i&1);
-    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
-    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
-    return i8x8_mode_available[avoid_topright][idx];
-}
-
-static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
-{
-    int avoid_topright = force_intra && ((i&5) == 5);
-    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
-    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
-    return i4x4_mode_available[avoid_topright][idx];
-}
-
-/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
-static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
-{
-    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
-
-    if( do_both_dct || h->mb.b_transform_8x8 )
-        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
-    if( do_both_dct || !h->mb.b_transform_8x8 )
-        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
-}
-
-/* Reset fenc satd scores cache for psy RD */
-static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
-{
-    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
-        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
-    if( !h->mb.i_psy_rd )
-        return;
-    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
-    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
-    if( b_satd )
-        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
-}
-
-static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
-{
-    if( a->i_satd_chroma < COST_MAX )
-        return;
-
-    if( CHROMA444 )
-    {
-        if( !h->mb.b_chroma_me )
-        {
-            a->i_satd_chroma = 0;
-            return;
-        }
-
-        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
-        if( h->mb.b_lossless )
-        {
-            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
-            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
-        }
-        else
-        {
-            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
-            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
-        }
-        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
-                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
-        return;
-    }
-
-    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
-    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
-
-    /* Prediction selection for chroma */
-    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
-    {
-        int satdu[4], satdv[4];
-        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
-        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
-        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
-        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
-        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
-        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
-
-        for( ; *predict_mode >= 0; predict_mode++ )
-        {
-            int i_mode = *predict_mode;
-            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
-
-            a->i_satd_chroma_dir[i_mode] = i_satd;
-            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
-        }
-    }
-    else
-    {
-        for( ; *predict_mode >= 0; predict_mode++ )
-        {
-            int i_satd;
-            int i_mode = *predict_mode;
-
-            /* we do the prediction */
-            if( h->mb.b_lossless )
-                x264_predict_lossless_chroma( h, i_mode );
-            else
-            {
-                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
-                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
-            }
-
-            /* we calculate the cost */
-            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
-                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
-                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
-
-            a->i_satd_chroma_dir[i_mode] = i_satd;
-            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
-        }
-    }
-
-    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
-}
-
-/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
-static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
-{
-    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
-    pixel *p_src = h->mb.pic.p_fenc[0];
-    pixel *p_dst = h->mb.pic.p_fdec[0];
-    static const int8_t intra_analysis_shortcut[2][2][2][5] =
-    {
-        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
-          {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
-         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
-          {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
-        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
-          {-1, -1, -1, -1, -1}},
-         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
-          {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
-    };
-
-    int idx;
-    int lambda = a->i_lambda;
-
-    /*---------------- Try all mode and calculate their score ---------------*/
-    /* Disabled i16x16 for AVC-Intra compat */
-    if( !h->param.i_avcintra_class )
-    {
-        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
-
-        /* Not heavily tuned */
-        static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
-        int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
-
-        if( !h->mb.b_lossless && predict_mode[3] >= 0 )
-        {
-            h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
-            a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
-            a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
-            a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
-            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
-            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
-            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
-
-            /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
-            if( a->i_satd_i16x16 <= i16x16_thresh )
-            {
-                h->predict_16x16[I_PRED_16x16_P]( p_dst );
-                a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
-                a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
-                COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
-            }
-        }
-        else
-        {
-            for( ; *predict_mode >= 0; predict_mode++ )
-            {
-                int i_satd;
-                int i_mode = *predict_mode;
-
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_16x16( h, 0, i_mode );
-                else
-                    h->predict_16x16[i_mode]( p_dst );
-
-                i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
-                         lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
-                COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
-                a->i_satd_i16x16_dir[i_mode] = i_satd;
-            }
-        }
-
-        if( h->sh.i_type == SLICE_TYPE_B )
-            /* cavlc mb type prefix */
-            a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
-
-        if( a->i_satd_i16x16 > i16x16_thresh )
-            return;
-    }
-
-    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
-    /* 8x8 prediction selection */
-    if( flags & X264_ANALYSE_I8x8 )
-    {
-        ALIGNED_ARRAY_32( pixel, edge,[36] );
-        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
-        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
-
-        // FIXME some bias like in i4x4?
-        int i_cost = lambda * 4; /* base predmode costs */
-        h->mb.i_cbp_luma = 0;
-
-        if( h->sh.i_type == SLICE_TYPE_B )
-            i_cost += lambda * i_mb_b_cost_table[I_8x8];
-
-        for( idx = 0;; idx++ )
-        {
-            int x = idx&1;
-            int y = idx>>1;
-            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
-            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
-            int i_best = COST_MAX;
-            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
-
-            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
-            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
-
-            if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
-            {
-                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
-                i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
-                i_cost += i_best & 0xffff;
-                i_best >>= 16;
-                a->i_predict8x8[idx] = i_best;
-                if( idx == 3 || i_cost > i_satd_thresh )
-                    break;
-                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
-            }
-            else
-            {
-                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
-                {
-                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
-                    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
-                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
-                    satd[i_pred_mode] -= 3 * lambda;
-                    for( int i = 2; i >= 0; i-- )
-                    {
-                        int cost = satd[i];
-                        a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
-                        COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
-                    }
-
-                    /* Take analysis shortcuts: don't analyse modes that are too
-                     * far away direction-wise from the favored mode. */
-                    if( a->i_mbrd < 1 + a->b_fast_intra )
-                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
-                    else
-                        predict_mode += 3;
-                }
-
-                for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
-                {
-                    int i_satd;
-                    int i_mode = *predict_mode;
-
-                    if( h->mb.b_lossless )
-                        x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
-                    else
-                        h->predict_8x8[i_mode]( p_dst_by, edge );
-
-                    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
-                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
-                        i_satd -= 3 * lambda;
-
-                    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
-                    a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
-                }
-                i_cost += i_best + 3*lambda;
-
-                if( idx == 3 || i_cost > i_satd_thresh )
-                    break;
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
-                else
-                    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
-                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
-            }
-            /* we need to encode this block now (for next ones) */
-            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
-        }
-
-        if( idx == 3 )
-        {
-            a->i_satd_i8x8 = i_cost;
-            if( h->mb.i_skip_intra )
-            {
-                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
-                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
-                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
-                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
-                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
-                if( h->mb.i_skip_intra == 2 )
-                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
-            }
-        }
-        else
-        {
-            static const uint16_t cost_div_fix8[3] = {1024,512,341};
-            a->i_satd_i8x8 = COST_MAX;
-            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
-        }
-        /* Not heavily tuned */
-        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
-        if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
-            return;
-    }
-
-    /* 4x4 prediction selection */
-    if( flags & X264_ANALYSE_I4x4 )
-    {
-        int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
-        int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
-        h->mb.i_cbp_luma = 0;
-
-        if( a->b_early_terminate && a->i_mbrd )
-            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
-
-        if( h->sh.i_type == SLICE_TYPE_B )
-            i_cost += lambda * i_mb_b_cost_table[I_4x4];
-
-        for( idx = 0;; idx++ )
-        {
-            pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
-            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
-            int i_best = COST_MAX;
-            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
-
-            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
-
-            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
-                /* emulate missing topright samples */
-                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
-
-            if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
-            {
-                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
-                i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
-                i_cost += i_best & 0xffff;
-                i_best >>= 16;
-                a->i_predict4x4[idx] = i_best;
-                if( i_cost > i_satd_thresh || idx == 15 )
-                    break;
-                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
-            }
-            else
-            {
-                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
-                {
-                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
-                    h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
-                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
-                    satd[i_pred_mode] -= 3 * lambda;
-                    i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
-                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
-                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
-
-                    /* Take analysis shortcuts: don't analyse modes that are too
-                     * far away direction-wise from the favored mode. */
-                    if( a->i_mbrd < 1 + a->b_fast_intra )
-                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
-                    else
-                        predict_mode += 3;
-                }
-
-                if( i_best > 0 )
-                {
-                    for( ; *predict_mode >= 0; predict_mode++ )
-                    {
-                        int i_satd;
-                        int i_mode = *predict_mode;
-
-                        if( h->mb.b_lossless )
-                            x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
-                        else
-                            h->predict_4x4[i_mode]( p_dst_by );
-
-                        i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
-                        if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
-                        {
-                            i_satd -= lambda * 3;
-                            if( i_satd <= 0 )
-                            {
-                                i_best = i_satd;
-                                a->i_predict4x4[idx] = i_mode;
-                                break;
-                            }
-                        }
-
-                        COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
-                    }
-                }
-
-                i_cost += i_best + 3 * lambda;
-                if( i_cost > i_satd_thresh || idx == 15 )
-                    break;
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
-                else
-                    h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
-                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
-            }
-            /* we need to encode this block now (for next ones) */
-            x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
-        }
-        if( idx == 15 )
-        {
-            a->i_satd_i4x4 = i_cost;
-            if( h->mb.i_skip_intra )
-            {
-                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
-                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
-                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
-                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
-                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
-                if( h->mb.i_skip_intra == 2 )
-                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
-            }
-        }
-        else
-            a->i_satd_i4x4 = COST_MAX;
-    }
-}
-
-static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
-{
-    if( !a->b_early_terminate )
-        i_satd_thresh = COST_MAX;
-
-    if( a->i_satd_i16x16 < i_satd_thresh )
-    {
-        h->mb.i_type = I_16x16;
-        x264_analyse_update_cache( h, a );
-        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-    else
-        a->i_satd_i16x16 = COST_MAX;
-
-    if( a->i_satd_i4x4 < i_satd_thresh )
-    {
-        h->mb.i_type = I_4x4;
-        x264_analyse_update_cache( h, a );
-        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-    else
-        a->i_satd_i4x4 = COST_MAX;
-
-    if( a->i_satd_i8x8 < i_satd_thresh )
-    {
-        h->mb.i_type = I_8x8;
-        x264_analyse_update_cache( h, a );
-        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
-    }
-    else
-        a->i_satd_i8x8 = COST_MAX;
-}
-
-static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
-{
-    uint64_t i_satd, i_best;
-    int plane_count = CHROMA444 ? 3 : 1;
-    h->mb.i_skip_intra = 0;
-
-    if( h->mb.i_type == I_16x16 )
-    {
-        int old_pred_mode = a->i_predict16x16;
-        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
-        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
-        i_best = a->i_satd_i16x16;
-        for( ; *predict_mode >= 0; predict_mode++ )
-        {
-            int i_mode = *predict_mode;
-            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
-                continue;
-            h->mb.i_intra16x16_pred_mode = i_mode;
-            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
-            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
-        }
-    }
-
-    /* RD selection for chroma prediction */
-    if( !CHROMA444 )
-    {
-        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
-        if( predict_mode[1] >= 0 )
-        {
-            int8_t predict_mode_sorted[4];
-            int i_max;
-            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
-
-            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
-            {
-                int i_mode = *predict_mode;
-                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
-                    predict_mode_sorted[i_max++] = i_mode;
-            }
-
-            if( i_max > 0 )
-            {
-                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
-                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
-                /* the previous thing encoded was x264_intra_rd(), so the pixels and
-                 * coefs for the current chroma mode are still around, so we only
-                 * have to recount the bits. */
-                i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
-                for( int i = 0; i < i_max; i++ )
-                {
-                    int i_mode = predict_mode_sorted[i];
-                    if( h->mb.b_lossless )
-                        x264_predict_lossless_chroma( h, i_mode );
-                    else
-                    {
-                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
-                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
-                    }
-                    /* if we've already found a mode that needs no residual, then
-                     * probably any mode with a residual will be worse.
-                     * so avoid dct on the remaining modes to improve speed. */
-                    i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
-                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
-                }
-                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
-                h->mb.i_cbp_chroma = i_cbp_chroma_best;
-            }
-        }
-    }
-
-    if( h->mb.i_type == I_4x4 )
-    {
-        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
-        int nnz[3] = {0};
-        for( int idx = 0; idx < 16; idx++ )
-        {
-            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
-                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
-                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
-            i_best = COST_MAX64;
-
-            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
-
-            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
-                for( int p = 0; p < plane_count; p++ )
-                    /* emulate missing topright samples */
-                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
-
-            for( ; *predict_mode >= 0; predict_mode++ )
-            {
-                int i_mode = *predict_mode;
-                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
-
-                if( i_best > i_satd )
-                {
-                    a->i_predict4x4[idx] = i_mode;
-                    i_best = i_satd;
-                    for( int p = 0; p < plane_count; p++ )
-                    {
-                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
-                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
-                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
-                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
-                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
-                    }
-                }
-            }
-
-            for( int p = 0; p < plane_count; p++ )
-            {
-                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
-                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
-                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
-                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
-                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
-            }
-
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
-        }
-    }
-    else if( h->mb.i_type == I_8x8 )
-    {
-        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
-        pixel4 pels_h[3][2] = {{0}};
-        pixel pels_v[3][7] = {{0}};
-        uint16_t nnz[3][2] = {{0}}; //shut up gcc
-        for( int idx = 0; idx < 4; idx++ )
-        {
-            int x = idx&1;
-            int y = idx>>1;
-            int s8 = X264_SCAN8_0 + 2*x + 16*y;
-            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
-                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
-                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
-            int cbp_luma_new = 0;
-            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
-
-            i_best = COST_MAX64;
-
-            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
-            for( int p = 0; p < plane_count; p++ )
-                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
-
-            for( ; *predict_mode >= 0; predict_mode++ )
-            {
-                int i_mode = *predict_mode;
-                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
-                    continue;
-
-                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
-                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
-
-                if( i_best > i_satd )
-                {
-                    a->i_predict8x8[idx] = i_mode;
-                    cbp_luma_new = h->mb.i_cbp_luma;
-                    i_best = i_satd;
-
-                    for( int p = 0; p < plane_count; p++ )
-                    {
-                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
-                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
-                        if( !(idx&1) )
-                            for( int j = 0; j < 7; j++ )
-                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
-                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
-                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
-                    }
-                }
-            }
-            a->i_cbp_i8x8_luma = cbp_luma_new;
-            for( int p = 0; p < plane_count; p++ )
-            {
-                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
-                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
-                if( !(idx&1) )
-                    for( int j = 0; j < 7; j++ )
-                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
-                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
-                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
-            }
-
-            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
-        }
-    }
-}
-
-#define LOAD_FENC(m, src, xoff, yoff) \
-{ \
-    (m)->p_cost_mv = a->p_cost_mv; \
-    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
-    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
-    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
-    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
-    (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
-    (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
-}
-
-#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
-{ \
-    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
-    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
-    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
-    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
-    if( CHROMA444 ) \
-    { \
-        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
-        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
-        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
-        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
-        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
-        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
-        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
-        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
-    } \
-    else \
-        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
-    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
-    (m)->weight = x264_weight_none; \
-    (m)->i_ref = ref; \
-}
-
-#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
-    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
-    (m)->weight = h->sh.weight[i_ref];
-
-#define REF_COST(list, ref) \
-    (a->p_cost_ref[list][ref])
-
-static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
-{
-    x264_me_t m;
-    int i_mvc;
-    ALIGNED_4( int16_t mvc[8][2] );
-    int i_halfpel_thresh = INT_MAX;
-    int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
-
-    /* 16x16 Search on all ref frame */
-    m.i_pixel = PIXEL_16x16;
-    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
-
-    a->l0.me16x16.cost = INT_MAX;
-    for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
-    {
-        m.i_ref_cost = REF_COST( 0, i_ref );
-        i_halfpel_thresh -= m.i_ref_cost;
-
-        /* search with ref */
-        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
-        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
-
-        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
-
-        if( h->mb.ref_blind_dupe == i_ref )
-        {
-            CP32( m.mv, a->l0.mvc[0][0] );
-            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
-        }
-        else
-        {
-            x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
-            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
-        }
-
-        /* save mv for predicting neighbors */
-        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
-        CP32( a->l0.mvc[i_ref][0], m.mv );
-
-        /* early termination
-         * SSD threshold would probably be better than SATD */
-        if( i_ref == 0
-            && a->b_try_skip
-            && m.cost-m.cost_mv < 300*a->i_lambda
-            &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
-              + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
-            && x264_macroblock_probe_pskip( h ) )
-        {
-            h->mb.i_type = P_SKIP;
-            x264_analyse_update_cache( h, a );
-            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
-            return;
-        }
-
-        m.cost += m.i_ref_cost;
-        i_halfpel_thresh += m.i_ref_cost;
-
-        if( m.cost < a->l0.me16x16.cost )
-            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
-    }
-
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
-
-    h->mb.i_type = P_L0;
-    if( a->i_mbrd )
-    {
-        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
-        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
-        {
-            h->mb.i_partition = D_16x16;
-            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
-            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
-                h->mb.i_type = P_SKIP;
-        }
-    }
-}
-
-static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
-{
-    x264_me_t m;
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    int i_maxref = h->mb.pic.i_fref[0]-1;
-
-    h->mb.i_partition = D_8x8;
-
-    #define CHECK_NEIGHBOUR(i)\
-    {\
-        int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
-        if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
-            i_maxref = ref;\
-    }
-
-    /* early termination: if 16x16 chose ref 0, then evalute no refs older
-     * than those used by the neighbors */
-    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
-        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
-    {
-        i_maxref = 0;
-        CHECK_NEIGHBOUR(  -8 - 1 );
-        CHECK_NEIGHBOUR(  -8 + 0 );
-        CHECK_NEIGHBOUR(  -8 + 2 );
-        CHECK_NEIGHBOUR(  -8 + 4 );
-        CHECK_NEIGHBOUR(   0 - 1 );
-        CHECK_NEIGHBOUR( 2*8 - 1 );
-    }
-    #undef CHECK_NEIGHBOUR
-
-    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
-        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
-
-    for( int i = 0; i < 4; i++ )
-    {
-        x264_me_t *l0m = &a->l0.me8x8[i];
-        int x8 = i&1;
-        int y8 = i>>1;
-
-        m.i_pixel = PIXEL_8x8;
-
-        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
-        l0m->cost = INT_MAX;
-        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
-        {
-            m.i_ref_cost = REF_COST( 0, i_ref );
-
-            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
-            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
-
-            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
-            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-            if( h->mb.ref_blind_dupe == i_ref )
-            {
-                CP32( m.mv, a->l0.mvc[0][i+1] );
-                x264_me_refine_qpel_refdupe( h, &m, NULL );
-            }
-            else
-                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
-
-            m.cost += m.i_ref_cost;
-
-            CP32( a->l0.mvc[i_ref][i+1], m.mv );
-
-            if( m.cost < l0m->cost )
-                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
-            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
-                i_ref = h->mb.ref_blind_dupe;
-            else
-                i_ref++;
-        }
-        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
-        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
-
-        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
-
-        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
-           are effectively zero. */
-        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
-            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
-    }
-
-    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
-                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
-    /* P_8x8 ref0 has no ref cost */
-    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
-                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
-        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
-    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
-}
-
-static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
-{
-    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
-     * reference frame flags.  Thus, if we're not doing mixedrefs, just
-     * don't bother analysing the dupes. */
-    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
-    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    int i_mvc;
-    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x8;
-
-    i_mvc = 1;
-    CP32( mvc[0], a->l0.me16x16.mv );
-
-    for( int i = 0; i < 4; i++ )
-    {
-        x264_me_t *m = &a->l0.me8x8[i];
-        int x8 = i&1;
-        int y8 = i>>1;
-
-        m->i_pixel = PIXEL_8x8;
-        m->i_ref_cost = i_ref_cost;
-
-        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
-        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
-        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
-
-        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
-        x264_me_search( h, m, mvc, i_mvc );
-
-        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
-
-        CP32( mvc[i_mvc], m->mv );
-        i_mvc++;
-
-        a->i_satd8x8[0][i] = m->cost - m->cost_mv;
-
-        /* mb type cost */
-        m->cost += i_ref_cost;
-        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
-            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
-    }
-
-    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
-                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
-    /* theoretically this should include 4*ref_cost,
-     * but 3 seems a better approximation of cabac. */
-    if( h->param.b_cabac )
-        a->l0.i_cost8x8 -= i_ref_cost;
-    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
-}
-
-static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
-{
-    x264_me_t m;
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    ALIGNED_4( int16_t mvc[3][2] );
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_16x8;
-
-    for( int i = 0; i < 2; i++ )
-    {
-        x264_me_t *l0m = &a->l0.me16x8[i];
-        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
-        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
-        const int ref8[2] = { minref, maxref };
-        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
-
-        m.i_pixel = PIXEL_16x8;
-
-        LOAD_FENC( &m, p_fenc, 0, 8*i );
-        l0m->cost = INT_MAX;
-        for( int j = 0; j < i_ref8s; j++ )
-        {
-            const int i_ref = ref8[j];
-            m.i_ref_cost = REF_COST( 0, i_ref );
-
-            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-            CP32( mvc[0], a->l0.mvc[i_ref][0] );
-            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
-            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
-
-            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
-            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
-
-            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
-            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
-            /* We can only take this shortcut if the first search was performed on ref0. */
-            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
-            {
-                /* We can just leave the MV from the previous ref search. */
-                x264_me_refine_qpel_refdupe( h, &m, NULL );
-            }
-            else
-                x264_me_search( h, &m, mvc, 3 );
-
-            m.cost += m.i_ref_cost;
-
-            if( m.cost < l0m->cost )
-                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
-        }
-
-        /* Early termination based on the current SATD score of partition[0]
-           plus the estimated SATD score of partition[1] */
-        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
-        {
-            a->l0.i_cost16x8 = COST_MAX;
-            return;
-        }
-
-        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
-        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
-    }
-
-    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
-}
-
-static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
-{
-    x264_me_t m;
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    ALIGNED_4( int16_t mvc[3][2] );
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x16;
-
-    for( int i = 0; i < 2; i++ )
-    {
-        x264_me_t *l0m = &a->l0.me8x16[i];
-        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
-        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
-        const int ref8[2] = { minref, maxref };
-        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
-
-        m.i_pixel = PIXEL_8x16;
-
-        LOAD_FENC( &m, p_fenc, 8*i, 0 );
-        l0m->cost = INT_MAX;
-        for( int j = 0; j < i_ref8s; j++ )
-        {
-            const int i_ref = ref8[j];
-            m.i_ref_cost = REF_COST( 0, i_ref );
-
-            CP32( mvc[0], a->l0.mvc[i_ref][0] );
-            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
-            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
-
-            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
-            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
-
-            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
-            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-            /* We can only take this shortcut if the first search was performed on ref0. */
-            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
-            {
-                /* We can just leave the MV from the previous ref search. */
-                x264_me_refine_qpel_refdupe( h, &m, NULL );
-            }
-            else
-                x264_me_search( h, &m, mvc, 3 );
-
-            m.cost += m.i_ref_cost;
-
-            if( m.cost < l0m->cost )
-                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
-        }
-
-        /* Early termination based on the current SATD score of partition[0]
-           plus the estimated SATD score of partition[1] */
-        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
-        {
-            a->l0.i_cost8x16 = COST_MAX;
-            return;
-        }
-
-        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
-        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
-    }
-
-    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
-}
-
-static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
-                                                                     pixel **p_fref, int i8x8, int size, int chroma )
-{
-    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
-    pixel *pix2 = pix1+8;
-    int i_stride = h->mb.pic.i_stride[1];
-    int chroma_h_shift = chroma <= CHROMA_422;
-    int chroma_v_shift = chroma == CHROMA_420;
-    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
-    int i_ref = a->l0.me8x8[i8x8].i_ref;
-    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    x264_weight_t *weight = h->sh.weight[i_ref];
-
-    // FIXME weight can be done on 4x4 blocks even if mc is smaller
-#define CHROMA4x4MC( width, height, me, x, y ) \
-    if( chroma == CHROMA_444 ) \
-    { \
-        int mvx = (me).mv[0] + 4*2*x; \
-        int mvy = (me).mv[1] + 4*2*y; \
-        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
-                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
-        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
-                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
-    } \
-    else \
-    { \
-        int offset = x + (2>>chroma_v_shift)*16*y; \
-        int chroma_height = (2>>chroma_v_shift)*height; \
-        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
-                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
-        if( weight[1].weightfn ) \
-            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
-        if( weight[2].weightfn ) \
-            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
-    }
-
-    if( size == PIXEL_4x4 )
-    {
-        x264_me_t *m = a->l0.me4x4[i8x8];
-        CHROMA4x4MC( 2,2, m[0], 0,0 );
-        CHROMA4x4MC( 2,2, m[1], 2,0 );
-        CHROMA4x4MC( 2,2, m[2], 0,2 );
-        CHROMA4x4MC( 2,2, m[3], 2,2 );
-    }
-    else if( size == PIXEL_8x4 )
-    {
-        x264_me_t *m = a->l0.me8x4[i8x8];
-        CHROMA4x4MC( 4,2, m[0], 0,0 );
-        CHROMA4x4MC( 4,2, m[1], 0,2 );
-    }
-    else
-    {
-        x264_me_t *m = a->l0.me4x8[i8x8];
-        CHROMA4x4MC( 2,4, m[0], 0,0 );
-        CHROMA4x4MC( 2,4, m[1], 2,0 );
-    }
-#undef CHROMA4x4MC
-
-    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
-    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
-    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
-         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
-}
-
-static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
-{
-    if( CHROMA_FORMAT == CHROMA_444 )
-        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
-    else if( CHROMA_FORMAT == CHROMA_422 )
-        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
-    else
-        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
-}
-
-static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
-{
-    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    const int i_ref = a->l0.me8x8[i8x8].i_ref;
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x8;
-
-    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
-    {
-        const int idx = 4*i8x8 + i4x4;
-        const int x4 = block_idx_x[idx];
-        const int y4 = block_idx_y[idx];
-        const int i_mvc = (i4x4 == 0);
-
-        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
-
-        m->i_pixel = PIXEL_4x4;
-
-        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
-        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
-        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
-
-        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
-        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
-
-        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
-    }
-    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
-                            a->l0.me4x4[i8x8][1].cost +
-                            a->l0.me4x4[i8x8][2].cost +
-                            a->l0.me4x4[i8x8][3].cost +
-                            REF_COST( 0, i_ref ) +
-                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
-    if( h->mb.b_chroma_me && !CHROMA444 )
-        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
-}
-
-static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
-{
-    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    const int i_ref = a->l0.me8x8[i8x8].i_ref;
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x8;
-
-    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
-    {
-        const int idx = 4*i8x8 + 2*i8x4;
-        const int x4 = block_idx_x[idx];
-        const int y4 = block_idx_y[idx];
-        const int i_mvc = (i8x4 == 0);
-
-        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
-
-        m->i_pixel = PIXEL_8x4;
-
-        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
-        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
-        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
-
-        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
-        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
-
-        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
-    }
-    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
-                            REF_COST( 0, i_ref ) +
-                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
-    if( h->mb.b_chroma_me && !CHROMA444 )
-        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
-}
-
-static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
-{
-    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
-    pixel **p_fenc = h->mb.pic.p_fenc;
-    const int i_ref = a->l0.me8x8[i8x8].i_ref;
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x8;
-
-    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
-    {
-        const int idx = 4*i8x8 + i4x8;
-        const int x4 = block_idx_x[idx];
-        const int y4 = block_idx_y[idx];
-        const int i_mvc = (i4x8 == 0);
-
-        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
-
-        m->i_pixel = PIXEL_4x8;
-
-        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
-        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
-        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
-
-        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
-        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
-
-        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
-    }
-    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
-                            REF_COST( 0, i_ref ) +
-                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
-    if( h->mb.b_chroma_me && !CHROMA444 )
-        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
-}
-
-static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
-{
-    ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
-    ALIGNED_ARRAY_N( pixel,  bi, [2],[16*16] );
-    int i_chroma_cost = 0;
-    int chromapix = h->luma2chroma_pixel[i_pixel];
-
-#define COST_BI_CHROMA( m0, m1, width, height ) \
-{ \
-    if( CHROMA444 ) \
-    { \
-        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
-                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
-        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
-                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
-        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
-                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
-        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
-                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
-    } \
-    else \
-    { \
-        int v_shift = CHROMA_V_SHIFT; \
-        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
-        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
-        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
-                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
-        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
-                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
-    } \
-    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
-                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
-}
-
-    if( i_pixel == PIXEL_16x16 )
-        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
-    else if( i_pixel == PIXEL_16x8 )
-        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
-    else if( i_pixel == PIXEL_8x16 )
-        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
-    else
-        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
-
-    return i_chroma_cost;
-}
-
-static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
-{
-    /* Assumes that fdec still contains the results of
-     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
-
-    pixel *p_fenc = h->mb.pic.p_fenc[0];
-    pixel *p_fdec = h->mb.pic.p_fdec[0];
-
-    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
-    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
-    {
-        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
-
-        for( int i = 0; i < 4; i++ )
-        {
-            const int x = (i&1)*8;
-            const int y = (i>>1)*8;
-            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
-                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
-            if( h->mb.b_chroma_me )
-            {
-                int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
-                int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
-                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
-                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
-                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
-                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
-            }
-            a->i_cost16x16direct += a->i_cost8x8direct[i];
-
-            /* mb type cost */
-            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
-        }
-    }
-    else
-    {
-        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
-        if( h->mb.b_chroma_me )
-        {
-            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
-            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
-                                 +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
-        }
-    }
-}
-
-static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
-{
-    ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
-    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
-    pixel *src0, *src1;
-    intptr_t stride0 = 16, stride1 = 16;
-    int i_ref, i_mvc;
-    ALIGNED_4( int16_t mvc[9][2] );
-    int try_skip = a->b_try_skip;
-    int list1_skipped = 0;
-    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
-    int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
-                                (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
-
-    x264_me_t m;
-    m.i_pixel = PIXEL_16x16;
-
-    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
-
-    /* 16x16 Search on list 0 and list 1 */
-    a->l0.me16x16.cost = INT_MAX;
-    a->l1.me16x16.cost = INT_MAX;
-    for( int l = 1; l >= 0; )
-    {
-        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
-
-        /* This loop is extremely munged in order to facilitate the following order of operations,
-         * necessary for an efficient fast skip.
-         * 1.  Search list1 ref0.
-         * 2.  Search list0 ref0.
-         * 3.  Try skip.
-         * 4.  Search the rest of list0.
-         * 5.  Go back and finish list1.
-         */
-        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
-        {
-            if( try_skip && l == 1 && i_ref > 0 )
-            {
-                list1_skipped = 1;
-                break;
-            }
-
-            m.i_ref_cost = REF_COST( l, i_ref );
-
-            /* search with ref */
-            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
-            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
-            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
-            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
-
-            /* add ref cost */
-            m.cost += m.i_ref_cost;
-
-            if( m.cost < lX->me16x16.cost )
-                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
-
-            /* save mv for predicting neighbors */
-            CP32( lX->mvc[i_ref][0], m.mv );
-            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
-
-            /* Fast skip detection. */
-            if( i_ref == 0 && try_skip )
-            {
-                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
-                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
-                {
-                    try_skip = 0;
-                }
-                else if( !l )
-                {
-                    /* We already tested skip */
-                    h->mb.i_type = B_SKIP;
-                    x264_analyse_update_cache( h, a );
-                    return;
-                }
-            }
-        }
-        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
-            break;
-        if( list1_skipped && l == 0 )
-            l = 1;
-        else
-            l--;
-    }
-
-    /* get cost of BI mode */
-    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
-    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
-    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
-    src0 = h->mc.get_ref( pix0, &stride0,
-                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
-                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
-    src1 = h->mc.get_ref( pix1, &stride1,
-                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
-                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
-
-    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-
-    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
-                     + ref_costs
-                     + a->l0.bi16x16.cost_mv
-                     + a->l1.bi16x16.cost_mv;
-
-    if( h->mb.b_chroma_me )
-        a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
-
-    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
-    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
-    {
-        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
-                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
-        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
-                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
-        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
-                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
-                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
-                   + ref_costs + l0_mv_cost + l1_mv_cost;
-
-        if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
-        {
-            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
-
-            if( CHROMA444 )
-            {
-                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
-                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
-                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
-                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
-                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
-                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
-            }
-            else
-            {
-                ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
-                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
-                int v_shift = CHROMA_V_SHIFT;
-
-                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
-                {
-                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
-                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
-                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
-                }
-                else
-                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
-                                                         h->mb.pic.i_stride[1], 16>>v_shift );
-
-                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
-                {
-                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
-                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
-                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
-                }
-                else
-                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
-                                                         h->mb.pic.i_stride[1], 16>>v_shift );
-
-                h->mc.avg[chromapix]( bi,   FENC_STRIDE, pixuv[0],   FENC_STRIDE, pixuv[1],   FENC_STRIDE,
-                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
-                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-
-                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi,   FENC_STRIDE )
-                       +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
-            }
-        }
-
-        if( cost00 < a->i_cost16x16bi )
-        {
-            M32( a->l0.bi16x16.mv ) = 0;
-            M32( a->l1.bi16x16.mv ) = 0;
-            a->l0.bi16x16.cost_mv = l0_mv_cost;
-            a->l1.bi16x16.cost_mv = l1_mv_cost;
-            a->i_cost16x16bi = cost00;
-        }
-    }
-
-    /* mb type cost */
-    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
-    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
-    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
-}
-
-static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
-{
-    int x = 2*(i&1);
-    int y = i&2;
-
-    switch( h->mb.i_sub_partition[i] )
-    {
-        case D_L0_8x8:
-            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
-            break;
-        case D_L0_8x4:
-            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
-            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
-            break;
-        case D_L0_4x8:
-            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
-            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
-            break;
-        case D_L0_4x4:
-            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
-            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
-            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
-            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
-            break;
-        default:
-            x264_log( h, X264_LOG_ERROR, "internal error\n" );
-            break;
-    }
-}
-
-static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
-{
-    int x = 2*(idx&1);
-    int y = idx&2;
-    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
-    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
-    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
-    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
-}
-
-#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
-    if( x264_mb_partition_listX_table[0][part] ) \
-    { \
-        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
-        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
-    } \
-    else \
-    { \
-        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
-        x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
-        if( b_mvd ) \
-            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
-    } \
-    if( x264_mb_partition_listX_table[1][part] ) \
-    { \
-        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
-        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
-    } \
-    else \
-    { \
-        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
-        x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
-        if( b_mvd ) \
-            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
-    }
-
-static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
-{
-    int x = 2*(i&1);
-    int y = i&2;
-    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
-    {
-        x264_mb_load_mv_direct8x8( h, i );
-        if( b_mvd )
-        {
-            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
-            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
-            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
-        }
-    }
-    else
-    {
-        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
-    }
-}
-static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
-{
-    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
-}
-static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
-{
-    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
-}
-#undef CACHE_MV_BI
-
-static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
-{
-    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
-    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
-
-    /* early termination: if 16x16 chose ref 0, then evalute no refs older
-     * than those used by the neighbors */
-    #define CHECK_NEIGHBOUR(i)\
-    {\
-        int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
-        if( ref > i_maxref[l] )\
-            i_maxref[l] = ref;\
-    }
-
-    for( int l = 0; l < 2; l++ )
-    {
-        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
-        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
-            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
-        {
-            i_maxref[l] = 0;
-            CHECK_NEIGHBOUR(  -8 - 1 );
-            CHECK_NEIGHBOUR(  -8 + 0 );
-            CHECK_NEIGHBOUR(  -8 + 2 );
-            CHECK_NEIGHBOUR(  -8 + 4 );
-            CHECK_NEIGHBOUR(   0 - 1 );
-            CHECK_NEIGHBOUR( 2*8 - 1 );
-        }
-    }
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x8;
-
-    a->i_cost8x8bi = 0;
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int x8 = i&1;
-        int y8 = i>>1;
-        int i_part_cost;
-        int i_part_cost_bi;
-        intptr_t stride[2] = {8,8};
-        pixel *src[2];
-        x264_me_t m;
-        m.i_pixel = PIXEL_8x8;
-        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
-
-        for( int l = 0; l < 2; l++ )
-        {
-            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
-
-            lX->me8x8[i].cost = INT_MAX;
-            for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
-            {
-                m.i_ref_cost = REF_COST( l, i_ref );
-
-                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
-
-                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
-                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
-                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
-                m.cost += m.i_ref_cost;
-
-                if( m.cost < lX->me8x8[i].cost )
-                {
-                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
-                    a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
-                }
-
-                /* save mv for predicting other partitions within this MB */
-                CP32( lX->mvc[i_ref][i+1], m.mv );
-            }
-        }
-
-        /* BI mode */
-        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
-                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
-        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
-                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
-        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
-                                h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
-
-        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
-        i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
-                         + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
-                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
-
-        if( h->mb.b_chroma_me )
-        {
-            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
-            i_part_cost_bi += i_chroma_cost;
-            a->i_satd8x8[2][i] += i_chroma_cost;
-        }
-
-        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
-        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
-
-        i_part_cost = a->l0.me8x8[i].cost;
-        h->mb.i_sub_partition[i] = D_L0_8x8;
-        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
-        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
-        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
-        a->i_cost8x8bi += i_part_cost;
-
-        /* XXX Needed for x264_mb_predict_mv */
-        x264_mb_cache_mv_b8x8( h, a, i, 0 );
-    }
-
-    /* mb type cost */
-    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
-}
-
-static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
-{
-    pixel **p_fref[2] =
-        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
-          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
-    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
-
-    /* XXX Needed for x264_mb_predict_mv */
-    h->mb.i_partition = D_8x8;
-
-    a->i_cost8x8bi = 0;
-
-    for( int i = 0; i < 4; i++ )
-    {
-        int x8 = i&1;
-        int y8 = i>>1;
-        int i_part_cost;
-        int i_part_cost_bi = 0;
-        intptr_t stride[2] = {8,8};
-        pixel *src[2];
-
-        for( int l = 0; l < 2; l++ )
-        {
-            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
-            x264_me_t *m = &lX->me8x8[i];
-            m->i_pixel = PIXEL_8x8;
-            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
-
-            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
-            m->i_ref = lX->me16x16.i_ref;
-
-            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
-
-            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
-            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
-            x264_me_search( h, m, &lX->me16x16.mv, 1 );
-            a->i_satd8x8[l][i] = m->cost - m->cost_mv;
-            m->cost += m->i_ref_cost;
-
-            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
-
-            /* save mv for predicting other partitions within this MB */
-            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
-
-            /* BI mode */
-            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
-                                    m->mv[0], m->mv[1], 8, 8, x264_weight_none );
-            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
-        }
-        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
-        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
-        i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
-        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
-        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
-
-        if( h->mb.b_chroma_me )
-        {
-            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
-            i_part_cost_bi += i_chroma_cost;
-            a->i_satd8x8[2][i] += i_chroma_cost;
-        }
-
-        i_part_cost = a->l0.me8x8[i].cost;
-        h->mb.i_sub_partition[i] = D_L0_8x8;
-        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
-        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
-        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
-        a->i_cost8x8bi += i_part_cost;
-
-        /* XXX Needed for x264_mb_predict_mv */
-        x264_mb_cache_mv_b8x8( h, a, i, 0 );
-    }
-
-    /* mb type cost */
-    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
-}
-
-static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
-{
-    ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
-    ALIGNED_4( int16_t mvc[3][2] );
-
-    h->mb.i_partition = D_16x8;
-    a->i_cost16x8bi = 0;
-
-    for( int i = 0; i < 2; i++ )
-    {
-        int i_part_cost;
-        int i_part_cost_bi = 0;
-        intptr_t stride[2] = {16,16};
-        pixel *src[2];
-        x264_me_t m;
-        m.i_pixel = PIXEL_16x8;
-        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
-
-        for( int l = 0; l < 2; l++ )
-        {
-            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
-            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
-            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
-            lX->me16x8[i].cost = INT_MAX;
-            for( int j = 0; j < i_ref8s; j++ )
-            {
-                int i_ref = ref8[j];
-                m.i_ref_cost = REF_COST( l, i_ref );
-
-                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
-
-                CP32( mvc[0], lX->mvc[i_ref][0] );
-                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
-                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
-
-                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
-                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
-                x264_me_search( h, &m, mvc, 3 );
-                m.cost += m.i_ref_cost;
-
-                if( m.cost < lX->me16x8[i].cost )
-                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
-            }
-        }
-
-        /* BI mode */
-        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
-                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
-        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
-                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
-        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
-                                h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
-
-        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
-                        + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
-                        + a->l1.me16x8[i].i_ref_cost;
-
-        if( h->mb.b_chroma_me )
-            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
-
-        i_part_cost = a->l0.me16x8[i].cost;
-        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
-
-        if( a->l1.me16x8[i].cost < i_part_cost )
-        {
-            i_part_cost = a->l1.me16x8[i].cost;
-            a->i_mb_partition16x8[i] = D_L1_8x8;
-        }
-        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
-        {
-            i_part_cost = i_part_cost_bi;
-            a->i_mb_partition16x8[i] = D_BI_8x8;
-        }
-        a->i_cost16x8bi += i_part_cost;
-
-        /* Early termination based on the current SATD score of partition[0]
-           plus the estimated SATD score of partition[1] */
-        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
-            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
-        {
-            a->i_cost16x8bi = COST_MAX;
-            return;
-        }
-
-        x264_mb_cache_mv_b16x8( h, a, i, 0 );
-    }
-
-    /* mb type cost */
-    a->i_mb_type16x8 = B_L0_L0
-        + (a->i_mb_partition16x8[0]>>2) * 3
-        + (a->i_mb_partition16x8[1]>>2);
-    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
-}
-
-static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
-{
-    ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
-    ALIGNED_4( int16_t mvc[3][2] );
-
-    h->mb.i_partition = D_8x16;
-    a->i_cost8x16bi = 0;
-
-    for( int i = 0; i < 2; i++ )
-    {
-        int i_part_cost;
-        int i_part_cost_bi = 0;
-        intptr_t stride[2] = {8,8};
-        pixel *src[2];
-        x264_me_t m;
-        m.i_pixel = PIXEL_8x16;
-        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
-
-        for( int l = 0; l < 2; l++ )
-        {
-            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
-            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
-            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
-            lX->me8x16[i].cost = INT_MAX;
-            for( int j = 0; j < i_ref8s; j++ )
-            {
-                int i_ref = ref8[j];
-                m.i_ref_cost = REF_COST( l, i_ref );
-
-                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
-
-                CP32( mvc[0], lX->mvc[i_ref][0] );
-                CP32( mvc[1], lX->mvc[i_ref][i+1] );
-                CP32( mvc[2], lX->mvc[i_ref][i+3] );
-
-                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
-                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
-                x264_me_search( h, &m, mvc, 3 );
-                m.cost += m.i_ref_cost;
-
-                if( m.cost < lX->me8x16[i].cost )
-                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
-            }
-        }
-
-        /* BI mode */
-        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
-                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
-        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
-                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
-        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
-
-        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
-                        + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
-                        + a->l1.me8x16[i].i_ref_cost;
-
-        if( h->mb.b_chroma_me )
-            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
-
-        i_part_cost = a->l0.me8x16[i].cost;
-        a->i_mb_partition8x16[i] = D_L0_8x8;
-
-        if( a->l1.me8x16[i].cost < i_part_cost )
-        {
-            i_part_cost = a->l1.me8x16[i].cost;
-            a->i_mb_partition8x16[i] = D_L1_8x8;
-        }
-        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
-        {
-            i_part_cost = i_part_cost_bi;
-            a->i_mb_partition8x16[i] = D_BI_8x8;
-        }
-        a->i_cost8x16bi += i_part_cost;
-
-        /* Early termination based on the current SATD score of partition[0]
-           plus the estimated SATD score of partition[1] */
-        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
-            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
-        {
-            a->i_cost8x16bi = COST_MAX;
-            return;
-        }
-
-        x264_mb_cache_mv_b8x16( h, a, i, 0 );
-    }
-
-    /* mb type cost */
-    a->i_mb_type8x16 = B_L0_L0
-        + (a->i_mb_partition8x16[0]>>2) * 3
-        + (a->i_mb_partition8x16[1]>>2);
-    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
-}
-
-static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
-{
-    int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
-
-    h->mb.i_type = P_L0;
-    if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
-    {
-        h->mb.i_partition = D_16x16;
-        x264_analyse_update_cache( h, a );
-        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-
-    if( a->l0.i_cost16x8 < thresh )
-    {
-        h->mb.i_partition = D_16x8;
-        x264_analyse_update_cache( h, a );
-        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-    else
-        a->l0.i_cost16x8 = COST_MAX;
-
-    if( a->l0.i_cost8x16 < thresh )
-    {
-        h->mb.i_partition = D_8x16;
-        x264_analyse_update_cache( h, a );
-        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-    else
-        a->l0.i_cost8x16 = COST_MAX;
-
-    if( a->l0.i_cost8x8 < thresh )
-    {
-        h->mb.i_type = P_8x8;
-        h->mb.i_partition = D_8x8;
-        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
-        {
-            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
-            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
-            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
-            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
-            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
-             * for future blocks are those left over from previous RDO calls. */
-            for( int i = 0; i < 4; i++ )
-            {
-                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
-                int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
-                int subtype, btype = D_L0_8x8;
-                uint64_t bcost = COST_MAX64;
-                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
-                {
-                    uint64_t cost;
-                    if( costs[subtype] > sub8x8_thresh )
-                        continue;
-                    h->mb.i_sub_partition[i] = subtype;
-                    x264_mb_cache_mv_p8x8( h, a, i );
-                    if( subtype == btype )
-                        continue;
-                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
-                    COPY2_IF_LT( bcost, cost, btype, subtype );
-                }
-                if( h->mb.i_sub_partition[i] != btype )
-                {
-                    h->mb.i_sub_partition[i] = btype;
-                    x264_mb_cache_mv_p8x8( h, a, i );
-                }
-            }
-        }
-        else
-            x264_analyse_update_cache( h, a );
-        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-    else
-        a->l0.i_cost8x8 = COST_MAX;
-}
-
-static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
-{
-    int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
-
-    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
-    {
-        h->mb.i_type = B_DIRECT;
-        /* Assumes direct/skip MC is still in fdec */
-        /* Requires b-rdo to be done before intra analysis */
-        h->mb.b_skip_mc = 1;
-        x264_analyse_update_cache( h, a );
-        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
-        h->mb.b_skip_mc = 0;
-    }
-
-    //FIXME not all the update_cache calls are needed
-    h->mb.i_partition = D_16x16;
-    /* L0 */
-    if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
-    {
-        h->mb.i_type = B_L0_L0;
-        x264_analyse_update_cache( h, a );
-        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-
-    /* L1 */
-    if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
-    {
-        h->mb.i_type = B_L1_L1;
-        x264_analyse_update_cache( h, a );
-        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-
-    /* BI */
-    if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
-    {
-        h->mb.i_type = B_BI_BI;
-        x264_analyse_update_cache( h, a );
-        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-
-    /* 8x8 */
-    if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
-    {
-        h->mb.i_type = B_8x8;
-        h->mb.i_partition = D_8x8;
-        x264_analyse_update_cache( h, a );
-        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
-        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
-    }
-
-    /* 16x8 */
-    if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
-    {
-        h->mb.i_type = a->i_mb_type16x8;
-        h->mb.i_partition = D_16x8;
-        x264_analyse_update_cache( h, a );
-        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-
-    /* 8x16 */
-    if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
-    {
-        h->mb.i_type = a->i_mb_type8x16;
-        h->mb.i_partition = D_8x16;
-        x264_analyse_update_cache( h, a );
-        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
-}
-
-static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
-{
-    int i_biweight;
-
-    if( IS_INTRA(h->mb.i_type) )
-        return;
-
-    switch( h->mb.i_partition )
-    {
-        case D_16x16:
-            if( h->mb.i_type == B_BI_BI )
-            {
-                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
-                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
-            }
-            break;
-        case D_16x8:
-            for( int i = 0; i < 2; i++ )
-                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
-                {
-                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
-                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
-                }
-            break;
-        case D_8x16:
-            for( int i = 0; i < 2; i++ )
-                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
-                {
-                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
-                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
-                }
-            break;
-        case D_8x8:
-            for( int i = 0; i < 4; i++ )
-                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
-                {
-                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
-                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
-                }
-            break;
-    }
-}
-
-static inline void x264_mb_analyse_transform( x264_t *h )
-{
-    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
-    {
-        /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
-        x264_mb_mc( h );
-
-        int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
-        int i_cost8 = 0, i_cost4 = 0;
-        /* Not all platforms have a merged SATD function */
-        if( h->pixf.sa8d_satd[PIXEL_16x16] )
-        {
-            uint64_t cost = 0;
-            for( int p = 0; p < plane_count; p++ )
-            {
-                cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
-                                                        h->mb.pic.p_fdec[p], FDEC_STRIDE );
-
-            }
-            i_cost8 = (uint32_t)cost;
-            i_cost4 = (uint32_t)(cost >> 32);
-        }
-        else
-        {
-            for( int p = 0; p < plane_count; p++ )
-            {
-                i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
-                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
-                i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
-                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
-            }
-        }
-
-        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
-        h->mb.b_skip_mc = 1;
-    }
-}
-
-static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
-{
-    if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
-    {
-        uint32_t subpart_bak = M32( h->mb.i_sub_partition );
-        /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
-        if( h->mb.i_type == P_8x8 )
-            M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
-        else if( !x264_transform_allowed[h->mb.i_type] )
-            return;
-
-        x264_analyse_update_cache( h, a );
-        h->mb.b_transform_8x8 ^= 1;
-        /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
-        int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
-
-        if( *i_rd >= i_rd8 )
-        {
-            if( *i_rd > 0 )
-                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
-            *i_rd = i_rd8;
-        }
-        else
-        {
-            h->mb.b_transform_8x8 ^= 1;
-            M32( h->mb.i_sub_partition ) = subpart_bak;
-        }
-    }
-}
-
-/* Rate-distortion optimal QP selection.
- * FIXME: More than half of the benefit of this function seems to be
- * in the way it improves the coding of chroma DC (by decimating or
- * finding a better way to code a single DC coefficient.)
- * There must be a more efficient way to get that portion of the benefit
- * without doing full QP-RD, but RD-decimation doesn't seem to do the
- * trick. */
-static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
-{
-    int bcost, cost, failures, prevcost, origcost;
-    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
-    int last_qp_tried = 0;
-    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
-    int origcbp = h->mb.cbp[h->mb.i_mb_xy];
-
-    /* If CBP is already zero, don't raise the quantizer any higher. */
-    for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
-    {
-        /* Without psy-RD, require monotonicity when moving quant away from previous
-         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
-         * With psy-RD, allow 1 failure when moving quant away from previous quant,
-         * allow 2 failures when moving quant towards previous quant.
-         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
-        int threshold = (!!h->mb.i_psy_rd);
-        /* Raise the threshold for failures if we're moving towards the last QP. */
-        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
-            ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
-            threshold++;
-        h->mb.i_qp = orig_qp;
-        failures = 0;
-        prevcost = origcost;
-
-        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
-         * (up to a point) will too.  So, jump down to where the threshold will kick in
-         * and check the QP there.  If the CBP is still empty, skip the main loop.
-         * If it isn't empty, we would have ended up having to check this QP anyways,
-         * so as long as we store it for later lookup, we lose nothing. */
-        int already_checked_qp = -1;
-        int already_checked_cost = COST_MAX;
-        if( direction == -1 )
-        {
-            if( !origcbp )
-            {
-                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
-                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
-                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
-                if( !h->mb.cbp[h->mb.i_mb_xy] )
-                {
-                    /* If our empty-CBP block is lower QP than the last QP,
-                     * the last QP almost surely doesn't have a CBP either. */
-                    if( h->mb.i_last_qp > h->mb.i_qp )
-                        last_qp_tried = 1;
-                    break;
-                }
-                already_checked_qp = h->mb.i_qp;
-                h->mb.i_qp = orig_qp;
-            }
-        }
-
-        h->mb.i_qp += direction;
-        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
-        {
-            if( h->mb.i_last_qp == h->mb.i_qp )
-                last_qp_tried = 1;
-            if( h->mb.i_qp == already_checked_qp )
-                cost = already_checked_cost;
-            else
-            {
-                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
-                cost = x264_rd_cost_mb( h, a->i_lambda2 );
-                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
-            }
-
-            /* We can't assume that the costs are monotonic over QPs.
-             * Tie case-as-failure seems to give better results. */
-            if( cost < prevcost )
-                failures = 0;
-            else
-                failures++;
-            prevcost = cost;
-
-            if( failures > threshold )
-                break;
-            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
-                break;
-            h->mb.i_qp += direction;
-        }
-    }
-
-    /* Always try the last block's QP. */
-    if( !last_qp_tried )
-    {
-        h->mb.i_qp = h->mb.i_last_qp;
-        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
-        cost = x264_rd_cost_mb( h, a->i_lambda2 );
-        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
-    }
-
-    h->mb.i_qp = bqp;
-    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
-
-    /* Check transform again; decision from before may no longer be optimal. */
-    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
-        x264_mb_transform_8x8_allowed( h ) )
-    {
-        h->mb.b_transform_8x8 ^= 1;
-        cost = x264_rd_cost_mb( h, a->i_lambda2 );
-        if( cost > bcost )
-            h->mb.b_transform_8x8 ^= 1;
-    }
-}
-
-/*****************************************************************************
- * x264_macroblock_analyse:
- *****************************************************************************/
-void x264_macroblock_analyse( x264_t *h )
-{
-    x264_mb_analysis_t analysis;
-    int i_cost = COST_MAX;
-
-    h->mb.i_qp = x264_ratecontrol_mb_qp( h );
-    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
-     * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
-    if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
-        h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
-
-    if( h->param.analyse.b_mb_info )
-        h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
-    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
-
-    /*--------------------------- Do the analysis ---------------------------*/
-    if( h->sh.i_type == SLICE_TYPE_I )
-    {
-intra_analysis:
-        if( analysis.i_mbrd )
-            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
-        x264_mb_analyse_intra( h, &analysis, COST_MAX );
-        if( analysis.i_mbrd )
-            x264_intra_rd( h, &analysis, COST_MAX );
-
-        i_cost = analysis.i_satd_i16x16;
-        h->mb.i_type = I_16x16;
-        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
-        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
-        if( analysis.i_satd_pcm < i_cost )
-            h->mb.i_type = I_PCM;
-
-        else if( analysis.i_mbrd >= 2 )
-            x264_intra_rd_refine( h, &analysis );
-    }
-    else if( h->sh.i_type == SLICE_TYPE_P )
-    {
-        int b_skip = 0;
-
-        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
-
-        analysis.b_try_skip = 0;
-        if( analysis.b_force_intra )
-        {
-            if( !h->param.analyse.b_psy )
-            {
-                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
-                goto intra_analysis;
-            }
-        }
-        else
-        {
-            /* Special fast-skip logic using information from mb_info. */
-            if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
-            {
-                if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
-                    h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
-                {
-                    h->mb.i_partition = D_16x16;
-                    /* Use the P-SKIP MV if we can... */
-                    if( !M32(h->mb.cache.pskip_mv) )
-                    {
-                        b_skip = 1;
-                        h->mb.i_type = P_SKIP;
-                    }
-                    /* Otherwise, just force a 16x16 block. */
-                    else
-                    {
-                        h->mb.i_type = P_L0;
-                        analysis.l0.me16x16.i_ref = 0;
-                        M32( analysis.l0.me16x16.mv ) = 0;
-                    }
-                    goto skip_analysis;
-                }
-                /* Reset the information accordingly */
-                else if( h->param.analyse.b_mb_info_update )
-                    h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
-            }
-
-            int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
-            /* If the current macroblock is off the frame, just skip it. */
-            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
-                b_skip = 1;
-            /* Fast P_SKIP detection */
-            else if( h->param.analyse.b_fast_pskip )
-            {
-                if( skip_invalid )
-                    // FIXME don't need to check this if the reference frame is done
-                    {}
-                else if( h->param.analyse.i_subpel_refine >= 3 )
-                    analysis.b_try_skip = 1;
-                else if( h->mb.i_mb_type_left[0] == P_SKIP ||
-                         h->mb.i_mb_type_top == P_SKIP ||
-                         h->mb.i_mb_type_topleft == P_SKIP ||
-                         h->mb.i_mb_type_topright == P_SKIP )
-                    b_skip = x264_macroblock_probe_pskip( h );
-            }
-        }
-
-        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
-
-        if( b_skip )
-        {
-            h->mb.i_type = P_SKIP;
-            h->mb.i_partition = D_16x16;
-            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
-skip_analysis:
-            /* Set up MVs for future predictors */
-            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
-                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
-        }
-        else
-        {
-            const unsigned int flags = h->param.analyse.inter;
-            int i_type;
-            int i_partition;
-            int i_satd_inter, i_satd_intra;
-
-            x264_mb_analyse_load_costs( h, &analysis );
-
-            x264_mb_analyse_inter_p16x16( h, &analysis );
-
-            if( h->mb.i_type == P_SKIP )
-            {
-                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
-                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
-                return;
-            }
-
-            if( flags & X264_ANALYSE_PSUB16x16 )
-            {
-                if( h->param.analyse.b_mixed_references )
-                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
-                else
-                    x264_mb_analyse_inter_p8x8( h, &analysis );
-            }
-
-            /* Select best inter mode */
-            i_type = P_L0;
-            i_partition = D_16x16;
-            i_cost = analysis.l0.me16x16.cost;
-
-            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
-                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
-            {
-                i_type = P_8x8;
-                i_partition = D_8x8;
-                i_cost = analysis.l0.i_cost8x8;
-
-                /* Do sub 8x8 */
-                if( flags & X264_ANALYSE_PSUB8x8 )
-                {
-                    for( int i = 0; i < 4; i++ )
-                    {
-                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
-                        int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
-                        if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
-                        {
-                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
-                            h->mb.i_sub_partition[i] = D_L0_4x4;
-
-                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
-                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
-                                         h->mb.i_sub_partition[i], D_L0_8x4 );
-
-                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
-                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
-                                         h->mb.i_sub_partition[i], D_L0_4x8 );
-
-                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
-                        }
-                        x264_mb_cache_mv_p8x8( h, &analysis, i );
-                    }
-                    analysis.l0.i_cost8x8 = i_cost;
-                }
-            }
-
-            /* Now do 16x8/8x16 */
-            int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
-            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
-                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
-            {
-                int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
-                                      + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
-                analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
-
-                x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
-                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
-
-                i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
-                                  + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
-                analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
-
-                x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
-                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
-            }
-
-            h->mb.i_partition = i_partition;
-
-            /* refine qpel */
-            //FIXME mb_type costs?
-            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
-            {
-                /* refine later */
-            }
-            else if( i_partition == D_16x16 )
-            {
-                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
-                i_cost = analysis.l0.me16x16.cost;
-            }
-            else if( i_partition == D_16x8 )
-            {
-                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
-                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
-                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
-            }
-            else if( i_partition == D_8x16 )
-            {
-                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
-                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
-                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
-            }
-            else if( i_partition == D_8x8 )
-            {
-                i_cost = 0;
-                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                {
-                    switch( h->mb.i_sub_partition[i8x8] )
-                    {
-                        case D_L0_8x8:
-                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
-                            i_cost += analysis.l0.me8x8[i8x8].cost;
-                            break;
-                        case D_L0_8x4:
-                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
-                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
-                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
-                                      analysis.l0.me8x4[i8x8][1].cost;
-                            break;
-                        case D_L0_4x8:
-                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
-                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
-                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
-                                      analysis.l0.me4x8[i8x8][1].cost;
-                            break;
-
-                        case D_L0_4x4:
-                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
-                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
-                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
-                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
-                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
-                                      analysis.l0.me4x4[i8x8][1].cost +
-                                      analysis.l0.me4x4[i8x8][2].cost +
-                                      analysis.l0.me4x4[i8x8][3].cost;
-                            break;
-                        default:
-                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
-                            break;
-                    }
-                }
-            }
-
-            if( h->mb.b_chroma_me )
-            {
-                if( CHROMA444 )
-                {
-                    x264_mb_analyse_intra( h, &analysis, i_cost );
-                    x264_mb_analyse_intra_chroma( h, &analysis );
-                }
-                else
-                {
-                    x264_mb_analyse_intra_chroma( h, &analysis );
-                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
-                }
-                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
-                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
-                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
-            }
-            else
-                x264_mb_analyse_intra( h, &analysis, i_cost );
-
-            i_satd_inter = i_cost;
-            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
-                                      analysis.i_satd_i8x8,
-                                      analysis.i_satd_i4x4 );
-
-            if( analysis.i_mbrd )
-            {
-                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
-                i_type = P_L0;
-                i_partition = D_16x16;
-                i_cost = analysis.l0.i_rd16x16;
-                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
-                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
-                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
-                h->mb.i_type = i_type;
-                h->mb.i_partition = i_partition;
-                if( i_cost < COST_MAX )
-                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
-                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
-            }
-
-            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
-            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
-            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
-            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
-
-            h->mb.i_type = i_type;
-
-            if( analysis.b_force_intra && !IS_INTRA(i_type) )
-            {
-                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
-                 * it was an inter block. */
-                x264_analyse_update_cache( h, &analysis );
-                x264_macroblock_encode( h );
-                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
-                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
-                if( !CHROMA444 )
-                {
-                    int height = 16 >> CHROMA_V_SHIFT;
-                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
-                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
-                }
-                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
-                goto intra_analysis;
-            }
-
-            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
-            {
-                if( IS_INTRA( h->mb.i_type ) )
-                {
-                    x264_intra_rd_refine( h, &analysis );
-                }
-                else if( i_partition == D_16x16 )
-                {
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
-                    analysis.l0.me16x16.cost = i_cost;
-                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
-                }
-                else if( i_partition == D_16x8 )
-                {
-                    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
-                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
-                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
-                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
-                }
-                else if( i_partition == D_8x16 )
-                {
-                    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
-                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
-                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
-                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
-                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
-                }
-                else if( i_partition == D_8x8 )
-                {
-                    x264_analyse_update_cache( h, &analysis );
-                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                    {
-                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
-                        {
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
-                        }
-                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
-                        {
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
-                        }
-                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
-                        {
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
-                        }
-                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
-                        {
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
-                        }
-                    }
-                }
-            }
-        }
-    }
-    else if( h->sh.i_type == SLICE_TYPE_B )
-    {
-        int i_bskip_cost = COST_MAX;
-        int b_skip = 0;
-
-        if( analysis.i_mbrd )
-            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
-
-        h->mb.i_type = B_SKIP;
-        if( h->mb.b_direct_auto_write )
-        {
-            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
-            for( int i = 0; i < 2; i++ )
-            {
-                int b_changed = 1;
-                h->sh.b_direct_spatial_mv_pred ^= 1;
-                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
-                if( analysis.b_direct_available )
-                {
-                    if( b_changed )
-                    {
-                        x264_mb_mc( h );
-                        b_skip = x264_macroblock_probe_bskip( h );
-                    }
-                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
-                }
-                else
-                    b_skip = 0;
-            }
-        }
-        else
-            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
-
-        analysis.b_try_skip = 0;
-        if( analysis.b_direct_available )
-        {
-            if( !h->mb.b_direct_auto_write )
-                x264_mb_mc( h );
-            /* If the current macroblock is off the frame, just skip it. */
-            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
-                b_skip = 1;
-            else if( analysis.i_mbrd )
-            {
-                i_bskip_cost = ssd_mb( h );
-                /* 6 = minimum cavlc cost of a non-skipped MB */
-                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
-            }
-            else if( !h->mb.b_direct_auto_write )
-            {
-                /* Conditioning the probe on neighboring block types
-                 * doesn't seem to help speed or quality. */
-                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
-                if( h->param.analyse.i_subpel_refine < 3 )
-                    b_skip = analysis.b_try_skip;
-            }
-            /* Set up MVs for future predictors */
-            if( b_skip )
-            {
-                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
-                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
-                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
-                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
-            }
-        }
-
-        if( !b_skip )
-        {
-            const unsigned int flags = h->param.analyse.inter;
-            int i_type;
-            int i_partition;
-            int i_satd_inter;
-            h->mb.b_skip_mc = 0;
-            h->mb.i_type = B_DIRECT;
-
-            x264_mb_analyse_load_costs( h, &analysis );
-
-            /* select best inter mode */
-            /* direct must be first */
-            if( analysis.b_direct_available )
-                x264_mb_analyse_inter_direct( h, &analysis );
-
-            x264_mb_analyse_inter_b16x16( h, &analysis );
-
-            if( h->mb.i_type == B_SKIP )
-            {
-                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
-                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
-                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
-                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
-                return;
-            }
-
-            i_type = B_L0_L0;
-            i_partition = D_16x16;
-            i_cost = analysis.l0.me16x16.cost;
-            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
-            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
-            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
-
-            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
-            {
-                x264_mb_analyse_b_rd( h, &analysis, i_cost );
-                if( i_bskip_cost < analysis.i_rd16x16direct &&
-                    i_bskip_cost < analysis.i_rd16x16bi &&
-                    i_bskip_cost < analysis.l0.i_rd16x16 &&
-                    i_bskip_cost < analysis.l1.i_rd16x16 )
-                {
-                    h->mb.i_type = B_SKIP;
-                    x264_analyse_update_cache( h, &analysis );
-                    return;
-                }
-            }
-
-            if( flags & X264_ANALYSE_BSUB16x16 )
-            {
-                if( h->param.analyse.b_mixed_references )
-                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
-                else
-                    x264_mb_analyse_inter_b8x8( h, &analysis );
-
-                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
-
-                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
-                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
-                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
-                for( int i = 0; i < 2; i++ )
-                {
-                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
-                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
-                    // 16x8
-                    i_best_cost = COST_MAX;
-                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
-                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
-                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
-                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
-                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
-                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
-                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
-                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
-                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
-                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
-                    analysis.i_cost_est16x8[i] = i_best_cost;
-
-                    // 8x16
-                    i_best_cost = COST_MAX;
-                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
-                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
-                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
-                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
-                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
-                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
-                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
-                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
-                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
-                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
-                    analysis.i_cost_est8x16[i] = i_best_cost;
-                }
-                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
-                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
-                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
-                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
-                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
-                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
-
-                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
-                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
-                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
-                {
-                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
-                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
-                }
-                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
-                {
-                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
-                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
-                }
-                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
-                {
-                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
-                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
-                }
-            }
-
-            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
-            {
-                /* refine later */
-            }
-            /* refine qpel */
-            else if( i_partition == D_16x16 )
-            {
-                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
-                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
-                if( i_type == B_L0_L0 )
-                {
-                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
-                    i_cost = analysis.l0.me16x16.cost
-                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
-                }
-                else if( i_type == B_L1_L1 )
-                {
-                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
-                    i_cost = analysis.l1.me16x16.cost
-                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
-                }
-                else if( i_type == B_BI_BI )
-                {
-                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
-                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
-                }
-            }
-            else if( i_partition == D_16x8 )
-            {
-                for( int i = 0; i < 2; i++ )
-                {
-                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
-                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
-                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
-                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
-                }
-            }
-            else if( i_partition == D_8x16 )
-            {
-                for( int i = 0; i < 2; i++ )
-                {
-                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
-                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
-                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
-                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
-                }
-            }
-            else if( i_partition == D_8x8 )
-            {
-                for( int i = 0; i < 4; i++ )
-                {
-                    x264_me_t *m;
-                    int i_part_cost_old;
-                    int i_type_cost;
-                    int i_part_type = h->mb.i_sub_partition[i];
-                    int b_bidir = (i_part_type == D_BI_8x8);
-
-                    if( i_part_type == D_DIRECT_8x8 )
-                        continue;
-                    if( x264_mb_partition_listX_table[0][i_part_type] )
-                    {
-                        m = &analysis.l0.me8x8[i];
-                        i_part_cost_old = m->cost;
-                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
-                        m->cost -= i_type_cost;
-                        x264_me_refine_qpel( h, m );
-                        if( !b_bidir )
-                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
-                    }
-                    if( x264_mb_partition_listX_table[1][i_part_type] )
-                    {
-                        m = &analysis.l1.me8x8[i];
-                        i_part_cost_old = m->cost;
-                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
-                        m->cost -= i_type_cost;
-                        x264_me_refine_qpel( h, m );
-                        if( !b_bidir )
-                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
-                    }
-                    /* TODO: update mvp? */
-                }
-            }
-
-            i_satd_inter = i_cost;
-
-            if( analysis.i_mbrd )
-            {
-                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
-                i_type = B_SKIP;
-                i_cost = i_bskip_cost;
-                i_partition = D_16x16;
-                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
-                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
-                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
-                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
-                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
-                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
-                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
-
-                h->mb.i_type = i_type;
-                h->mb.i_partition = i_partition;
-            }
-
-            if( h->mb.b_chroma_me )
-            {
-                if( CHROMA444 )
-                {
-                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
-                    x264_mb_analyse_intra_chroma( h, &analysis );
-                }
-                else
-                {
-                    x264_mb_analyse_intra_chroma( h, &analysis );
-                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
-                }
-                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
-                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
-                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
-            }
-            else
-                x264_mb_analyse_intra( h, &analysis, i_satd_inter );
-
-            if( analysis.i_mbrd )
-            {
-                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
-                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
-            }
-
-            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
-            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
-            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
-            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
-
-            h->mb.i_type = i_type;
-            h->mb.i_partition = i_partition;
-
-            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
-                x264_intra_rd_refine( h, &analysis );
-            if( h->mb.i_subpel_refine >= 5 )
-                x264_refine_bidir( h, &analysis );
-
-            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
-            {
-                int i_biweight;
-                x264_analyse_update_cache( h, &analysis );
-
-                if( i_partition == D_16x16 )
-                {
-                    if( i_type == B_L0_L0 )
-                    {
-                        analysis.l0.me16x16.cost = i_cost;
-                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
-                    }
-                    else if( i_type == B_L1_L1 )
-                    {
-                        analysis.l1.me16x16.cost = i_cost;
-                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
-                    }
-                    else if( i_type == B_BI_BI )
-                    {
-                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
-                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
-                    }
-                }
-                else if( i_partition == D_16x8 )
-                {
-                    for( int i = 0; i < 2; i++ )
-                    {
-                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
-                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
-                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
-                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
-                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
-                        {
-                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
-                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
-                        }
-                    }
-                }
-                else if( i_partition == D_8x16 )
-                {
-                    for( int i = 0; i < 2; i++ )
-                    {
-                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
-                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
-                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
-                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
-                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
-                        {
-                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
-                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
-                        }
-                    }
-                }
-                else if( i_partition == D_8x8 )
-                {
-                    for( int i = 0; i < 4; i++ )
-                    {
-                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
-                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
-                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
-                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
-                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
-                        {
-                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
-                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    x264_analyse_update_cache( h, &analysis );
-
-    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
-     * without realizing it.  Check for this and account for it if necessary. */
-    if( analysis.i_mbrd >= 2 )
-    {
-        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
-        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
-        int list = check_mv_lists[h->mb.i_type] - 1;
-        if( list >= 0 && h->mb.i_partition != D_16x16 &&
-            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
-            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
-                h->mb.i_partition = D_16x16;
-    }
-
-    if( !analysis.i_mbrd )
-        x264_mb_analyse_transform( h );
-
-    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
-        x264_mb_analyse_qp_rd( h, &analysis );
-
-    h->mb.b_trellis = h->param.analyse.i_trellis;
-    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));
-
-    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
-        x264_psy_trellis_init( h, 0 );
-    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
-        h->mb.i_skip_intra = 0;
-}
-
-/*-------------------- Update MB from the analysis ----------------------*/
-static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
-{
-    switch( h->mb.i_type )
-    {
-        case I_4x4:
-            for( int i = 0; i < 16; i++ )
-                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
-
-            x264_mb_analyse_intra_chroma( h, a );
-            break;
-        case I_8x8:
-            for( int i = 0; i < 4; i++ )
-                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
-
-            x264_mb_analyse_intra_chroma( h, a );
-            break;
-        case I_16x16:
-            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
-            x264_mb_analyse_intra_chroma( h, a );
-            break;
-
-        case I_PCM:
-            break;
-
-        case P_L0:
-            switch( h->mb.i_partition )
-            {
-                case D_16x16:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
-                    break;
-
-                case D_16x8:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
-                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
-                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
-                    break;
-
-                case D_8x16:
-                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
-                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
-                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
-                    break;
-
-                default:
-                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
-                    break;
-            }
-            break;
-
-        case P_8x8:
-            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
-            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
-            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
-            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
-            for( int i = 0; i < 4; i++ )
-                x264_mb_cache_mv_p8x8( h, a, i );
-            break;
-
-        case P_SKIP:
-        {
-            h->mb.i_partition = D_16x16;
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
-            break;
-        }
-
-        case B_SKIP:
-        case B_DIRECT:
-            h->mb.i_partition = h->mb.cache.direct_partition;
-            x264_mb_load_mv_direct8x8( h, 0 );
-            x264_mb_load_mv_direct8x8( h, 1 );
-            x264_mb_load_mv_direct8x8( h, 2 );
-            x264_mb_load_mv_direct8x8( h, 3 );
-            break;
-
-        case B_8x8:
-            /* optimize: cache might not need to be rewritten */
-            for( int i = 0; i < 4; i++ )
-                x264_mb_cache_mv_b8x8( h, a, i, 1 );
-            break;
-
-        default: /* the rest of the B types */
-            switch( h->mb.i_partition )
-            {
-            case D_16x16:
-                switch( h->mb.i_type )
-                {
-                case B_L0_L0:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
-
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
-                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
-                    break;
-                case B_L1_L1:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
-                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
-
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
-                    break;
-                case B_BI_BI:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
-
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
-                    break;
-                }
-                break;
-            case D_16x8:
-                x264_mb_cache_mv_b16x8( h, a, 0, 1 );
-                x264_mb_cache_mv_b16x8( h, a, 1, 1 );
-                break;
-            case D_8x16:
-                x264_mb_cache_mv_b8x16( h, a, 0, 1 );
-                x264_mb_cache_mv_b8x16( h, a, 1, 1 );
-                break;
-            default:
-                x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
-                break;
-            }
-    }
-
-#ifndef NDEBUG
-    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
-    {
-        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
-        {
-            int completed;
-            int ref = h->mb.cache.ref[l][x264_scan8[0]];
-            if( ref < 0 )
-                continue;
-            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
-            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
-            {
-                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
-                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
-                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
-                                h->mb.cache.mv[l][x264_scan8[15]][0],
-                                h->mb.cache.mv[l][x264_scan8[15]][1] );
-                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
-                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
-                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
-                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
-                x264_mb_analyse_intra( h, a, COST_MAX );
-                h->mb.i_type = I_16x16;
-                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
-                x264_mb_analyse_intra_chroma( h, a );
-            }
-        }
-    }
-#endif
-}
-
-#include "slicetype.c"
-
diff --git a/android/src/main/libenc/jni/libx264/encoder/analyse.h b/android/src/main/libenc/jni/libx264/encoder/analyse.h
deleted file mode 100755
index 35d5110..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/analyse.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*****************************************************************************
- * analyse.h: macroblock analysis
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ANALYSE_H
-#define X264_ANALYSE_H
-
-int x264_analyse_init_costs( x264_t *h );
-void x264_analyse_free_costs( x264_t *h );
-void x264_analyse_weight_frame( x264_t *h, int end );
-void x264_macroblock_analyse( x264_t *h );
-void x264_slicetype_decide( x264_t *h );
-
-void x264_slicetype_analyse( x264_t *h, int intra_minigop );
-
-int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
-
-int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
-int  x264_lookahead_is_empty( x264_t *h );
-void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
-void x264_lookahead_get_frames( x264_t *h );
-void x264_lookahead_delete( x264_t *h );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/encoder/cabac.c b/android/src/main/libenc/jni/libx264/encoder/cabac.c
deleted file mode 100755
index f1149a5..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/cabac.c
+++ /dev/null
@@ -1,1284 +0,0 @@
-/*****************************************************************************
- * cabac.c: cabac bitstream writing
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-
-#ifndef RDO_SKIP_BS
-#define RDO_SKIP_BS 0
-#endif
-
-static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
-                    int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
-{
-    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
-    {
-        x264_cabac_encode_decision_noup( cb, ctx0, 0 );
-    }
-#if !RDO_SKIP_BS
-    else if( i_mb_type == I_PCM )
-    {
-        x264_cabac_encode_decision_noup( cb, ctx0, 1 );
-        x264_cabac_encode_flush( h, cb );
-    }
-#endif
-    else
-    {
-        int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
-
-        x264_cabac_encode_decision_noup( cb, ctx0, 1 );
-        x264_cabac_encode_terminal( cb );
-
-        x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma );
-        if( h->mb.i_cbp_chroma == 0 )
-            x264_cabac_encode_decision_noup( cb, ctx2, 0 );
-        else
-        {
-            x264_cabac_encode_decision( cb, ctx2, 1 );
-            x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 );
-        }
-        x264_cabac_encode_decision( cb, ctx4, i_pred>>1 );
-        x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 );
-    }
-}
-
-#if !RDO_SKIP_BS
-static void x264_cabac_field_decoding_flag( x264_t *h, x264_cabac_t *cb )
-{
-    int ctx = 0;
-    ctx += h->mb.field_decoding_flag & !!h->mb.i_mb_x;
-    ctx += (h->mb.i_mb_top_mbpair_xy >= 0
-            && h->mb.slice_table[h->mb.i_mb_top_mbpair_xy] == h->sh.i_first_mb
-            && h->mb.field[h->mb.i_mb_top_mbpair_xy]);
-
-    x264_cabac_encode_decision_noup( cb, 70 + ctx, MB_INTERLACED );
-    h->mb.field_decoding_flag = MB_INTERLACED;
-}
-#endif
-
-static void x264_cabac_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
-{
-    if( i_pred == i_mode )
-        x264_cabac_encode_decision( cb, 68, 1 );
-    else
-    {
-        x264_cabac_encode_decision( cb, 68, 0 );
-        if( i_mode > i_pred  )
-            i_mode--;
-        x264_cabac_encode_decision( cb, 69, (i_mode     )&0x01 );
-        x264_cabac_encode_decision( cb, 69, (i_mode >> 1)&0x01 );
-        x264_cabac_encode_decision( cb, 69, (i_mode >> 2)      );
-    }
-}
-
-static void x264_cabac_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
-{
-    int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
-    int ctx = 0;
-
-    /* No need to test for I4x4 or I_16x16 as cache_save handle that */
-    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 )
-        ctx++;
-    if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 )
-        ctx++;
-
-    x264_cabac_encode_decision_noup( cb, 64 + ctx, i_mode > 0 );
-    if( i_mode > 0 )
-    {
-        x264_cabac_encode_decision( cb, 64 + 3, i_mode > 1 );
-        if( i_mode > 1 )
-            x264_cabac_encode_decision_noup( cb, 64 + 3, i_mode > 2 );
-    }
-}
-
-static void x264_cabac_cbp_luma( x264_t *h, x264_cabac_t *cb )
-{
-    int cbp = h->mb.i_cbp_luma;
-    int cbp_l = h->mb.cache.i_cbp_left;
-    int cbp_t = h->mb.cache.i_cbp_top;
-    x264_cabac_encode_decision     ( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (cbp >> 0) & 1 );
-    x264_cabac_encode_decision     ( cb, 76 - ((cbp   >> 0) & 1) - ((cbp_t >> 2) & 2), (cbp >> 1) & 1 );
-    x264_cabac_encode_decision     ( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp   << 1) & 2), (cbp >> 2) & 1 );
-    x264_cabac_encode_decision_noup( cb, 76 - ((cbp   >> 2) & 1) - ((cbp   >> 0) & 2), (cbp >> 3) & 1 );
-}
-
-static void x264_cabac_cbp_chroma( x264_t *h, x264_cabac_t *cb )
-{
-    int cbp_a = h->mb.cache.i_cbp_left & 0x30;
-    int cbp_b = h->mb.cache.i_cbp_top  & 0x30;
-    int ctx = 0;
-
-    if( cbp_a && h->mb.cache.i_cbp_left != -1 ) ctx++;
-    if( cbp_b && h->mb.cache.i_cbp_top  != -1 ) ctx+=2;
-    if( h->mb.i_cbp_chroma == 0 )
-        x264_cabac_encode_decision_noup( cb, 77 + ctx, 0 );
-    else
-    {
-        x264_cabac_encode_decision_noup( cb, 77 + ctx, 1 );
-
-        ctx = 4;
-        if( cbp_a == 0x20 ) ctx++;
-        if( cbp_b == 0x20 ) ctx += 2;
-        x264_cabac_encode_decision_noup( cb, 77 + ctx, h->mb.i_cbp_chroma >> 1 );
-    }
-}
-
-static void x264_cabac_qp_delta( x264_t *h, x264_cabac_t *cb )
-{
-    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
-    int ctx;
-
-    /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely
-     * flat background area. Don't do this if it would raise the quantizer, since that could
-     * cause unexpected deblocking artifacts. */
-    if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp )
-    {
-#if !RDO_SKIP_BS
-        h->mb.i_qp = h->mb.i_last_qp;
-#endif
-        i_dqp = 0;
-    }
-
-    ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f));
-
-    if( i_dqp != 0 )
-    {
-        /* Faster than (i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp-1)).
-         * If you so much as sneeze on these lines, gcc will compile this suboptimally. */
-        i_dqp *= 2;
-        int val = 1 - i_dqp;
-        if( val < 0 ) val = i_dqp;
-        val--;
-        /* dqp is interpreted modulo (QP_MAX_SPEC+1) */
-        if( val >= QP_MAX_SPEC && val != QP_MAX_SPEC+1 )
-            val = 2*QP_MAX_SPEC+1 - val;
-        do
-        {
-            x264_cabac_encode_decision( cb, 60 + ctx, 1 );
-            ctx = 2+(ctx>>1);
-        } while( --val );
-    }
-    x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 );
-}
-
-#if !RDO_SKIP_BS
-void x264_cabac_mb_skip( x264_t *h, int b_skip )
-{
-    int ctx = h->mb.cache.i_neighbour_skip + 11;
-    if( h->sh.i_type != SLICE_TYPE_P )
-       ctx += 13;
-    x264_cabac_encode_decision( &h->cabac, ctx, b_skip );
-}
-#endif
-
-static inline void x264_cabac_subpartition_p( x264_cabac_t *cb, int i_sub )
-{
-    if( i_sub == D_L0_8x8 )
-    {
-        x264_cabac_encode_decision( cb, 21, 1 );
-        return;
-    }
-    x264_cabac_encode_decision( cb, 21, 0 );
-    if( i_sub == D_L0_8x4 )
-        x264_cabac_encode_decision( cb, 22, 0 );
-    else
-    {
-        x264_cabac_encode_decision( cb, 22, 1 );
-        x264_cabac_encode_decision( cb, 23, i_sub == D_L0_4x8 );
-    }
-}
-
-static ALWAYS_INLINE void x264_cabac_subpartition_b( x264_cabac_t *cb, int i_sub )
-{
-    if( i_sub == D_DIRECT_8x8 )
-    {
-        x264_cabac_encode_decision( cb, 36, 0 );
-        return;
-    }
-    x264_cabac_encode_decision( cb, 36, 1 );
-    if( i_sub == D_BI_8x8 )
-    {
-        x264_cabac_encode_decision( cb, 37, 1 );
-        x264_cabac_encode_decision( cb, 38, 0 );
-        x264_cabac_encode_decision( cb, 39, 0 );
-        x264_cabac_encode_decision( cb, 39, 0 );
-        return;
-    }
-    x264_cabac_encode_decision( cb, 37, 0 );
-    x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
-}
-
-static ALWAYS_INLINE void x264_cabac_transform_size( x264_t *h, x264_cabac_t *cb )
-{
-    int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
-    x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 );
-}
-
-static ALWAYS_INLINE void x264_cabac_ref_internal( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int bframe )
-{
-    const int i8 = x264_scan8[idx];
-    const int i_refa = h->mb.cache.ref[i_list][i8 - 1];
-    const int i_refb = h->mb.cache.ref[i_list][i8 - 8];
-    int ctx = 0;
-
-    if( i_refa > 0 && (!bframe || !h->mb.cache.skip[i8 - 1]) )
-        ctx++;
-    if( i_refb > 0 && (!bframe || !h->mb.cache.skip[i8 - 8]) )
-        ctx += 2;
-
-    for( int i_ref = h->mb.cache.ref[i_list][i8]; i_ref > 0; i_ref-- )
-    {
-        x264_cabac_encode_decision( cb, 54 + ctx, 1 );
-        ctx = (ctx>>2)+4;
-    }
-    x264_cabac_encode_decision( cb, 54 + ctx, 0 );
-}
-
-static NOINLINE void x264_cabac_ref_p( x264_t *h, x264_cabac_t *cb, int idx )
-{
-    x264_cabac_ref_internal( h, cb, 0, idx, 0 );
-}
-static NOINLINE void x264_cabac_ref_b( x264_t *h, x264_cabac_t *cb, int i_list, int idx )
-{
-    x264_cabac_ref_internal( h, cb, i_list, idx, 1 );
-}
-
-static ALWAYS_INLINE int x264_cabac_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
-{
-    int ctxbase = l ? 47 : 40;
-
-    if( mvd == 0 )
-    {
-        x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
-        return 0;
-    }
-
-    int i_abs = abs( mvd );
-    x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
-#if RDO_SKIP_BS
-    if( i_abs <= 3 )
-    {
-        for( int i = 1; i < i_abs; i++ )
-            x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 );
-        x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 );
-        x264_cabac_encode_bypass( cb, mvd >> 31 );
-    }
-    else
-    {
-        x264_cabac_encode_decision( cb, ctxbase + 3, 1 );
-        x264_cabac_encode_decision( cb, ctxbase + 4, 1 );
-        x264_cabac_encode_decision( cb, ctxbase + 5, 1 );
-        if( i_abs < 9 )
-        {
-            cb->f8_bits_encoded += x264_cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
-            cb->state[ctxbase+6] = x264_cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
-        }
-        else
-        {
-            cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]];
-            cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]];
-            x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
-        }
-    }
-#else
-    static const uint8_t ctxes[8] = { 3,4,5,6,6,6,6,6 };
-
-    if( i_abs < 9 )
-    {
-        for( int i = 1; i < i_abs; i++ )
-            x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
-        x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 );
-    }
-    else
-    {
-        for( int i = 1; i < 9; i++ )
-            x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
-        x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
-    }
-    x264_cabac_encode_bypass( cb, mvd >> 31 );
-#endif
-    /* Since we don't need to keep track of MVDs larger than 66, just cap the value.
-     * This lets us store MVDs as 8-bit values instead of 16-bit. */
-    return X264_MIN( i_abs, 66 );
-}
-
-static NOINLINE uint16_t x264_cabac_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
-{
-    ALIGNED_4( int16_t mvp[2] );
-    int mdx, mdy;
-
-    /* Calculate mvd */
-    x264_mb_predict_mv( h, i_list, idx, width, mvp );
-    mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
-    mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
-    uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
-                                       h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
-
-    /* encode */
-    mdx = x264_cabac_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
-    mdy = x264_cabac_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );
-
-    return pack8to16(mdx,mdy);
-}
-
-#define x264_cabac_mvd(h,cb,i_list,idx,width,height)\
-do\
-{\
-    uint16_t mvd = x264_cabac_mvd(h,cb,i_list,idx,width);\
-    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
-} while(0)
-
-static inline void x264_cabac_8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
-{
-    switch( h->mb.i_sub_partition[i] )
-    {
-        case D_L0_8x8:
-            x264_cabac_mvd( h, cb, 0, 4*i, 2, 2 );
-            break;
-        case D_L0_8x4:
-            x264_cabac_mvd( h, cb, 0, 4*i+0, 2, 1 );
-            x264_cabac_mvd( h, cb, 0, 4*i+2, 2, 1 );
-            break;
-        case D_L0_4x8:
-            x264_cabac_mvd( h, cb, 0, 4*i+0, 1, 2 );
-            x264_cabac_mvd( h, cb, 0, 4*i+1, 1, 2 );
-            break;
-        case D_L0_4x4:
-            x264_cabac_mvd( h, cb, 0, 4*i+0, 1, 1 );
-            x264_cabac_mvd( h, cb, 0, 4*i+1, 1, 1 );
-            x264_cabac_mvd( h, cb, 0, 4*i+2, 1, 1 );
-            x264_cabac_mvd( h, cb, 0, 4*i+3, 1, 1 );
-            break;
-        default:
-            assert(0);
-    }
-}
-
-static ALWAYS_INLINE void x264_cabac_mb_header_i( x264_t *h, x264_cabac_t *cb, int i_mb_type, int slice_type, int chroma )
-{
-    if( slice_type == SLICE_TYPE_I )
-    {
-        int ctx = 0;
-        if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != I_4x4 )
-            ctx++;
-        if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != I_4x4 )
-            ctx++;
-
-        x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
-    }
-    else if( slice_type == SLICE_TYPE_P )
-    {
-        /* prefix */
-        x264_cabac_encode_decision_noup( cb, 14, 1 );
-
-        /* suffix */
-        x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
-    }
-    else if( slice_type == SLICE_TYPE_B )
-    {
-        /* prefix */
-        x264_cabac_encode_decision_noup( cb, 27+3,   1 );
-        x264_cabac_encode_decision_noup( cb, 27+4,   1 );
-        x264_cabac_encode_decision( cb, 27+5,   1 );
-        x264_cabac_encode_decision( cb, 27+5,   0 );
-        x264_cabac_encode_decision( cb, 27+5,   1 );
-
-        /* suffix */
-        x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
-    }
-
-    if( i_mb_type == I_PCM )
-        return;
-
-    if( i_mb_type != I_16x16 )
-    {
-        if( h->pps->b_transform_8x8_mode )
-            x264_cabac_transform_size( h, cb );
-
-        int di = h->mb.b_transform_8x8 ? 4 : 1;
-        for( int i = 0; i < 16; i += di )
-        {
-            const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
-            const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
-            x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode );
-        }
-    }
-
-    if( chroma )
-        x264_cabac_intra_chroma_pred_mode( h, cb );
-}
-
-static ALWAYS_INLINE void x264_cabac_mb_header_p( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma )
-{
-    if( i_mb_type == P_L0 )
-    {
-        x264_cabac_encode_decision_noup( cb, 14, 0 );
-        if( h->mb.i_partition == D_16x16 )
-        {
-            x264_cabac_encode_decision_noup( cb, 15, 0 );
-            x264_cabac_encode_decision_noup( cb, 16, 0 );
-            if( h->mb.pic.i_fref[0] > 1 )
-                x264_cabac_ref_p( h, cb, 0 );
-            x264_cabac_mvd( h, cb, 0, 0, 4, 4 );
-        }
-        else if( h->mb.i_partition == D_16x8 )
-        {
-            x264_cabac_encode_decision_noup( cb, 15, 1 );
-            x264_cabac_encode_decision_noup( cb, 17, 1 );
-            if( h->mb.pic.i_fref[0] > 1 )
-            {
-                x264_cabac_ref_p( h, cb, 0 );
-                x264_cabac_ref_p( h, cb, 8 );
-            }
-            x264_cabac_mvd( h, cb, 0, 0, 4, 2 );
-            x264_cabac_mvd( h, cb, 0, 8, 4, 2 );
-        }
-        else //if( h->mb.i_partition == D_8x16 )
-        {
-            x264_cabac_encode_decision_noup( cb, 15, 1 );
-            x264_cabac_encode_decision_noup( cb, 17, 0 );
-            if( h->mb.pic.i_fref[0] > 1 )
-            {
-                x264_cabac_ref_p( h, cb, 0 );
-                x264_cabac_ref_p( h, cb, 4 );
-            }
-            x264_cabac_mvd( h, cb, 0, 0, 2, 4 );
-            x264_cabac_mvd( h, cb, 0, 4, 2, 4 );
-        }
-    }
-    else if( i_mb_type == P_8x8 )
-    {
-        x264_cabac_encode_decision_noup( cb, 14, 0 );
-        x264_cabac_encode_decision_noup( cb, 15, 0 );
-        x264_cabac_encode_decision_noup( cb, 16, 1 );
-
-        /* sub mb type */
-        for( int i = 0; i < 4; i++ )
-            x264_cabac_subpartition_p( cb, h->mb.i_sub_partition[i] );
-
-        /* ref 0 */
-        if( h->mb.pic.i_fref[0] > 1 )
-        {
-            x264_cabac_ref_p( h, cb,  0 );
-            x264_cabac_ref_p( h, cb,  4 );
-            x264_cabac_ref_p( h, cb,  8 );
-            x264_cabac_ref_p( h, cb, 12 );
-        }
-
-        for( int i = 0; i < 4; i++ )
-            x264_cabac_8x8_mvd( h, cb, i );
-    }
-    else /* intra */
-        x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_P, chroma );
-}
-
-static ALWAYS_INLINE void x264_cabac_mb_header_b( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma )
-{
-    int ctx = 0;
-    if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != B_SKIP && h->mb.i_mb_type_left[0] != B_DIRECT )
-        ctx++;
-    if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT )
-        ctx++;
-
-    if( i_mb_type == B_DIRECT )
-    {
-        x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
-        return;
-    }
-    x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
-
-    if( i_mb_type == B_8x8 )
-    {
-        x264_cabac_encode_decision_noup( cb, 27+3,   1 );
-        x264_cabac_encode_decision_noup( cb, 27+4,   1 );
-        x264_cabac_encode_decision( cb, 27+5,   1 );
-        x264_cabac_encode_decision( cb, 27+5,   1 );
-        x264_cabac_encode_decision_noup( cb, 27+5,   1 );
-
-        /* sub mb type */
-        for( int i = 0; i < 4; i++ )
-            x264_cabac_subpartition_b( cb, h->mb.i_sub_partition[i] );
-
-        /* ref */
-        if( h->mb.pic.i_fref[0] > 1 )
-            for( int i = 0; i < 4; i++ )
-                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
-                    x264_cabac_ref_b( h, cb, 0, 4*i );
-
-        if( h->mb.pic.i_fref[1] > 1 )
-            for( int i = 0; i < 4; i++ )
-                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
-                    x264_cabac_ref_b( h, cb, 1, 4*i );
-
-        for( int i = 0; i < 4; i++ )
-            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
-                x264_cabac_mvd( h, cb, 0, 4*i, 2, 2 );
-
-        for( int i = 0; i < 4; i++ )
-            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
-                x264_cabac_mvd( h, cb, 1, 4*i, 2, 2 );
-    }
-    else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI )
-    {
-        /* All B modes */
-        static const uint8_t i_mb_bits[9*3] =
-        {
-            0x31, 0x29, 0x4, /* L0 L0 */
-            0x35, 0x2d, 0,   /* L0 L1 */
-            0x43, 0x63, 0,   /* L0 BI */
-            0x3d, 0x2f, 0,   /* L1 L0 */
-            0x39, 0x25, 0x6, /* L1 L1 */
-            0x53, 0x73, 0,   /* L1 BI */
-            0x4b, 0x6b, 0,   /* BI L0 */
-            0x5b, 0x7b, 0,   /* BI L1 */
-            0x47, 0x67, 0x21 /* BI BI */
-        };
-
-        const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
-        int bits = i_mb_bits[idx];
-
-        x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
-        x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
-        if( bits != 1 )
-        {
-            x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
-            x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
-            x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
-            if( bits != 1 )
-                x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
-        }
-
-        const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
-        if( h->mb.pic.i_fref[0] > 1 )
-        {
-            if( b_list[0][0] )
-                x264_cabac_ref_b( h, cb, 0, 0 );
-            if( b_list[0][1] && h->mb.i_partition != D_16x16 )
-                x264_cabac_ref_b( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
-        }
-        if( h->mb.pic.i_fref[1] > 1 )
-        {
-            if( b_list[1][0] )
-                x264_cabac_ref_b( h, cb, 1, 0 );
-            if( b_list[1][1] && h->mb.i_partition != D_16x16 )
-                x264_cabac_ref_b( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
-        }
-        for( int i_list = 0; i_list < 2; i_list++ )
-        {
-            if( h->mb.i_partition == D_16x16 )
-            {
-                if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 4, 4 );
-            }
-            else if( h->mb.i_partition == D_16x8 )
-            {
-                if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 4, 2 );
-                if( b_list[i_list][1] ) x264_cabac_mvd( h, cb, i_list, 8, 4, 2 );
-            }
-            else //if( h->mb.i_partition == D_8x16 )
-            {
-                if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 2, 4 );
-                if( b_list[i_list][1] ) x264_cabac_mvd( h, cb, i_list, 4, 2, 4 );
-            }
-        }
-    }
-    else /* intra */
-        x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_B, chroma );
-}
-
-static int ALWAYS_INLINE x264_cabac_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra, int b_dc )
-{
-    static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020};
-
-    if( b_dc )
-    {
-        i_idx -= LUMA_DC;
-        if( i_cat == DCT_CHROMA_DC )
-        {
-            int i_nza = h->mb.cache.i_cbp_left != -1 ? (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1 : b_intra;
-            int i_nzb = h->mb.cache.i_cbp_top  != -1 ? (h->mb.cache.i_cbp_top  >> (8 + i_idx)) & 1 : b_intra;
-            return base_ctx[i_cat] + 2*i_nzb + i_nza;
-        }
-        else
-        {
-            int i_nza = (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1;
-            int i_nzb = (h->mb.cache.i_cbp_top  >> (8 + i_idx)) & 1;
-            return base_ctx[i_cat] + 2*i_nzb + i_nza;
-        }
-    }
-    else
-    {
-        int i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
-        int i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
-        if( x264_constant_p(b_intra) && !b_intra )
-            return base_ctx[i_cat] + ((2*i_nzb + i_nza)&0x7f);
-        else
-        {
-            i_nza &= 0x7f + (b_intra << 7);
-            i_nzb &= 0x7f + (b_intra << 7);
-            return base_ctx[i_cat] + 2*!!i_nzb + !!i_nza;
-        }
-    }
-}
-
-#if !RDO_SKIP_BS
-extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64];
-extern const uint8_t x264_last_coeff_flag_offset_8x8[63];
-extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7];
-extern const uint16_t x264_significant_coeff_flag_offset[2][16];
-extern const uint16_t x264_last_coeff_flag_offset[2][16];
-extern const uint16_t x264_coeff_abs_level_m1_offset[16];
-extern const uint8_t x264_count_cat_m1[14];
-#else
-/* Padded to [64] for easier addressing */
-const uint8_t x264_significant_coeff_flag_offset_8x8[2][64] =
-{{
-    0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
-    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
-    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
-   12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
-},{
-    0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
-    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
-    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
-    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14
-}};
-const uint8_t x264_last_coeff_flag_offset_8x8[63] =
-{
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
-};
-const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */
-const uint16_t x264_significant_coeff_flag_offset[2][16] =
-{
-    { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 },
-    { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 }
-};
-const uint16_t x264_last_coeff_flag_offset[2][16] =
-{
-    { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 },
-    { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 }
-};
-const uint16_t x264_coeff_abs_level_m1_offset[16] =
-{
-    227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
-};
-const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63};
-#endif
-
-// node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
-//           4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
-/* map node ctx => cabac ctx for level=1 */
-static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
-/* map node ctx => cabac ctx for level>1 */
-static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
-/* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that
- * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. */
-static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 };
-
-static const uint8_t coeff_abs_level_transition[2][8] = {
-/* update node ctx after coding a level=1 */
-    { 1, 2, 3, 3, 4, 5, 6, 7 },
-/* update node ctx after coding a level>1 */
-    { 4, 4, 4, 4, 5, 6, 7, 7 }
-};
-
-#if !RDO_SKIP_BS
-static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc )
-{
-    int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
-    int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
-    int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat];
-    int coeff_idx = -1, node_ctx = 0;
-    int last = h->quantf.coeff_last[ctx_block_cat]( l );
-    const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
-    dctcoef coeffs[64];
-
-#define WRITE_SIGMAP( sig_off, last_off )\
-{\
-    int i = 0;\
-    while( 1 )\
-    {\
-        if( l[i] )\
-        {\
-            coeffs[++coeff_idx] = l[i];\
-            x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\
-            if( i == last )\
-            {\
-                x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\
-                break;\
-            }\
-            else\
-                x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\
-        }\
-        else\
-            x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\
-        if( ++i == count_m1 )\
-        {\
-            coeffs[++coeff_idx] = l[i];\
-            break;\
-        }\
-    }\
-}
-
-    if( chroma422dc )
-    {
-        int count_m1 = 7;
-        WRITE_SIGMAP( x264_coeff_flag_offset_chroma_422_dc[i], x264_coeff_flag_offset_chroma_422_dc[i] )
-    }
-    else
-    {
-        int count_m1 = x264_count_cat_m1[ctx_block_cat];
-        if( count_m1 == 63 )
-        {
-            const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED];
-            WRITE_SIGMAP( sig_offset[i], x264_last_coeff_flag_offset_8x8[i] )
-        }
-        else
-            WRITE_SIGMAP( i, i )
-    }
-
-    do
-    {
-        /* write coeff_abs - 1 */
-        int coeff = coeffs[coeff_idx];
-        int abs_coeff = abs(coeff);
-        int coeff_sign = coeff >> 31;
-        int ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
-
-        if( abs_coeff > 1 )
-        {
-            x264_cabac_encode_decision( cb, ctx, 1 );
-            ctx = levelgt1_ctx[node_ctx] + ctx_level;
-            for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- )
-                x264_cabac_encode_decision( cb, ctx, 1 );
-            if( abs_coeff < 15 )
-                x264_cabac_encode_decision( cb, ctx, 0 );
-            else
-                x264_cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 );
-
-            node_ctx = coeff_abs_level_transition[1][node_ctx];
-        }
-        else
-        {
-            x264_cabac_encode_decision( cb, ctx, 0 );
-            node_ctx = coeff_abs_level_transition[0][node_ctx];
-        }
-
-        x264_cabac_encode_bypass( cb, coeff_sign );
-    } while( --coeff_idx >= 0 );
-}
-
-void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-    x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
-}
-
-static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-#if ARCH_X86_64 && HAVE_MMX
-    h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
-#else
-    x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
-#endif
-}
-static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-    /* Template a version specifically for chroma 4:2:2 DC in order to avoid
-     * slowing down everything else due to the added complexity. */
-    x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 1 );
-}
-#define x264_cabac_block_residual_8x8( h, cb, cat, l ) x264_cabac_block_residual( h, cb, cat, l )
-#else
-
-/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is
- * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there
- * is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */
-static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc )
-{
-    const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED];
-    int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
-    int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
-    int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat];
-    int last = h->quantf.coeff_last[ctx_block_cat]( l );
-    int coeff_abs = abs(l[last]);
-    int ctx = coeff_abs_level1_ctx[0] + ctx_level;
-    int node_ctx;
-    const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
-
-    if( last != (b_8x8 ? 63 : chroma422dc ? 7 : x264_count_cat_m1[ctx_block_cat]) )
-    {
-        x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] :
-                                    chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 );
-        x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[last] :
-                                    chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 );
-    }
-
-    if( coeff_abs > 1 )
-    {
-        x264_cabac_encode_decision( cb, ctx, 1 );
-        ctx = levelgt1_ctx[0] + ctx_level;
-        if( coeff_abs < 15 )
-        {
-            cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]];
-            cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
-        }
-        else
-        {
-            cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]];
-            cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]];
-            x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
-        }
-        node_ctx = coeff_abs_level_transition[1][0];
-    }
-    else
-    {
-        x264_cabac_encode_decision( cb, ctx, 0 );
-        node_ctx = coeff_abs_level_transition[0][0];
-        x264_cabac_encode_bypass( cb, 0 ); // sign
-    }
-
-    for( int i = last-1; i >= 0; i-- )
-    {
-        if( l[i] )
-        {
-            coeff_abs = abs(l[i]);
-            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
-                                        chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 1 );
-            x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[i] :
-                                        chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 );
-            ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
-
-            if( coeff_abs > 1 )
-            {
-                x264_cabac_encode_decision( cb, ctx, 1 );
-                ctx = levelgt1_ctx[node_ctx] + ctx_level;
-                if( coeff_abs < 15 )
-                {
-                    cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]];
-                    cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
-                }
-                else
-                {
-                    cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]];
-                    cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]];
-                    x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
-                }
-                node_ctx = coeff_abs_level_transition[1][node_ctx];
-            }
-            else
-            {
-                x264_cabac_encode_decision( cb, ctx, 0 );
-                node_ctx = coeff_abs_level_transition[0][node_ctx];
-                x264_cabac_encode_bypass( cb, 0 );
-            }
-        }
-        else
-            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
-                                        chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 );
-    }
-}
-
-void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-    x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 );
-}
-void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-    x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 );
-}
-
-static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-#if ARCH_X86_64 && HAVE_MMX
-    h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
-#else
-    x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l );
-#endif
-}
-static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-#if ARCH_X86_64 && HAVE_MMX
-    h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
-#else
-    x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l );
-#endif
-}
-
-static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
-    x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 );
-}
-#endif
-
-#define x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, b_dc, name )\
-do\
-{\
-    int ctxidxinc = x264_cabac_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra, b_dc );\
-    if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
-    {\
-        x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
-        x264_cabac_block_residual##name( h, cb, ctx_block_cat, l );\
-    }\
-    else\
-        x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
-} while(0)
-
-#define x264_cabac_block_residual_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
-    x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 1, )
-
-#define x264_cabac_block_residual_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
-    x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, )
-
-#define x264_cabac_block_residual_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
-    x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, _8x8 )
-
-#define x264_cabac_block_residual_422_dc_cbf( h, cb, ch, b_intra )\
-    x264_cabac_block_residual_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, 1, _422_dc )
-
-static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma )
-{
-    const int i_mb_type = h->mb.i_type;
-
-#if !RDO_SKIP_BS
-    const int i_mb_pos_start = x264_cabac_pos( cb );
-    int       i_mb_pos_tex;
-
-    if( SLICE_MBAFF &&
-        (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
-    {
-        x264_cabac_field_decoding_flag( h, cb );
-    }
-#endif
-
-    if( h->sh.i_type == SLICE_TYPE_P )
-        x264_cabac_mb_header_p( h, cb, i_mb_type, chroma );
-    else if( h->sh.i_type == SLICE_TYPE_B )
-        x264_cabac_mb_header_b( h, cb, i_mb_type, chroma );
-    else //if( h->sh.i_type == SLICE_TYPE_I )
-        x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_I, chroma );
-
-#if !RDO_SKIP_BS
-    i_mb_pos_tex = x264_cabac_pos( cb );
-    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
-
-    if( i_mb_type == I_PCM )
-    {
-        bs_t s;
-        bs_init( &s, cb->p, cb->p_end - cb->p );
-
-        for( int p = 0; p < plane_count; p++ )
-            for( int i = 0; i < 256; i++ )
-                bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
-        if( chroma )
-            for( int ch = 1; ch < 3; ch++ )
-                for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
-                    for( int j = 0; j < 8; j++ )
-                        bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
-
-        bs_flush( &s );
-        cb->p = s.p;
-        x264_cabac_encode_init_core( cb );
-
-        h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
-        return;
-    }
-#endif
-
-    if( i_mb_type != I_16x16 )
-    {
-        x264_cabac_cbp_luma( h, cb );
-        if( chroma )
-            x264_cabac_cbp_chroma( h, cb );
-    }
-
-    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
-    {
-        x264_cabac_transform_size( h, cb );
-    }
-
-    if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 )
-    {
-        const int b_intra = IS_INTRA( i_mb_type );
-        x264_cabac_qp_delta( h, cb );
-
-        /* write residual */
-        if( i_mb_type == I_16x16 )
-        {
-            /* DC Luma */
-            for( int p = 0; p < plane_count; p++ )
-            {
-                x264_cabac_block_residual_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );
-
-                /* AC Luma */
-                if( h->mb.i_cbp_luma )
-                    for( int i = p*16; i < p*16+16; i++ )
-                        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_AC][p], i, h->dct.luma4x4[i]+1, 1 );
-            }
-        }
-        else if( h->mb.b_transform_8x8 )
-        {
-            if( plane_count == 3 )
-            {
-                ALIGNED_4( uint8_t nnzbak[3][8] );
-
-/* Stupid nnz munging in the case that neighbors don't have
- * 8x8 transform enabled. */
-#define BACKUP( dst, src, res )\
-    dst = src;\
-    src = res;
-
-#define RESTORE( dst, src, res )\
-    src = dst;
-
-#define MUNGE_8x8_NNZ( MUNGE )\
-if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\
-{\
-    MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\
-    MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\
-    MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\
-    MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\
-    MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\
-    MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\
-}\
-if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\
-{\
-    MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\
-    MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\
-    MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\
-    MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\
-    MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\
-    MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\
-}\
-if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\
-{\
-    MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\
-    MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\
-    MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\
-}
-
-                MUNGE_8x8_NNZ( BACKUP )
-
-                for( int p = 0; p < 3; p++ )
-                    FOREACH_BIT( i, 0, h->mb.i_cbp_luma )
-                        x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra );
-
-                MUNGE_8x8_NNZ( RESTORE )
-            }
-            else
-            {
-                FOREACH_BIT( i, 0, h->mb.i_cbp_luma )
-                    x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] );
-            }
-        }
-        else
-        {
-            for( int p = 0; p < plane_count; p++ )
-                FOREACH_BIT( i8x8, 0, h->mb.i_cbp_luma )
-                    for( int i = 0; i < 4; i++ )
-                        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+i8x8*4+p*16, h->dct.luma4x4[i+i8x8*4+p*16], b_intra );
-        }
-
-        if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
-        {
-            if( CHROMA_FORMAT == CHROMA_422 )
-            {
-                x264_cabac_block_residual_422_dc_cbf( h, cb, 0, b_intra );
-                x264_cabac_block_residual_422_dc_cbf( h, cb, 1, b_intra );
-            }
-            else
-            {
-                x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
-                x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
-            }
-
-            if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
-            {
-                int step = 8 << CHROMA_V_SHIFT;
-                for( int i = 16; i < 3*16; i += step )
-                    for( int j = i; j < i+4; j++ )
-                        x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra );
-            }
-        }
-    }
-
-#if !RDO_SKIP_BS
-    h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
-#endif
-}
-
-void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
-{
-    if( CHROMA444 )
-        x264_macroblock_write_cabac_internal( h, cb, 3, 0 );
-    else
-        x264_macroblock_write_cabac_internal( h, cb, 1, 1 );
-}
-
-#if RDO_SKIP_BS
-/*****************************************************************************
- * RD only; doesn't generate a valid bitstream
- * doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref (never varies between calls, so no point in doing so)
- * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
- * works on all partition sizes except 16x16
- *****************************************************************************/
-static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
-{
-    const int i_mb_type = h->mb.i_type;
-    int b_8x16 = h->mb.i_partition == D_8x16;
-    int plane_count = CHROMA444 ? 3 : 1;
-
-    if( i_mb_type == P_8x8 )
-    {
-        x264_cabac_8x8_mvd( h, cb, i8 );
-        x264_cabac_subpartition_p( cb, h->mb.i_sub_partition[i8] );
-    }
-    else if( i_mb_type == P_L0 )
-        x264_cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
-    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
-    {
-        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
-        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 );
-    }
-    else //if( i_mb_type == B_8x8 )
-    {
-        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
-            x264_cabac_mvd( h, cb, 0, 4*i8, 2, 2 );
-        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
-            x264_cabac_mvd( h, cb, 1, 4*i8, 2, 2 );
-    }
-
-    for( int j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
-    {
-        if( h->mb.i_cbp_luma & (1 << i8) )
-        {
-            if( h->mb.b_transform_8x8 )
-            {
-                if( CHROMA444 )
-                    for( int p = 0; p < 3; p++ )
-                        x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 0 );
-                else
-                    x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] );
-            }
-            else
-                for( int p = 0; p < plane_count; p++ )
-                    for( int i4 = 0; i4 < 4; i4++ )
-                        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16], 0 );
-        }
-
-        if( h->mb.i_cbp_chroma )
-        {
-            if( CHROMA_FORMAT == CHROMA_422 )
-            {
-                int offset = (5*i8) & 0x09;
-                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 );
-                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 );
-                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 );
-                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 );
-            }
-            else
-            {
-                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
-                x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
-            }
-        }
-
-        i8 += x264_pixel_size[i_pixel].h >> 3;
-    }
-}
-
-static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
-{
-    int b_8x4 = i_pixel == PIXEL_8x4;
-    int plane_count = CHROMA444 ? 3 : 1;
-    if( i_pixel == PIXEL_4x4 )
-        x264_cabac_mvd( h, cb, 0, i4, 1, 1 );
-    else
-        x264_cabac_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
-    for( int p = 0; p < plane_count; p++ )
-    {
-        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4, h->dct.luma4x4[p*16+i4], 0 );
-        if( i_pixel != PIXEL_4x4 )
-            x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4], 0 );
-    }
-}
-
-static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode )
-{
-    const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 );
-    i_mode = x264_mb_pred_mode4x4_fix( i_mode );
-    x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode );
-    x264_cabac_cbp_luma( h, cb );
-    if( h->mb.i_cbp_luma & (1 << i8) )
-    {
-        if( CHROMA444 )
-            for( int p = 0; p < 3; p++ )
-                x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 1 );
-        else
-            x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] );
-    }
-}
-
-static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode )
-{
-    const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
-    int plane_count = CHROMA444 ? 3 : 1;
-    i_mode = x264_mb_pred_mode4x4_fix( i_mode );
-    x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode );
-    for( int p = 0; p < plane_count; p++ )
-        x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 );
-}
-
-static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
-{
-    x264_cabac_intra_chroma_pred_mode( h, cb );
-    x264_cabac_cbp_chroma( h, cb );
-    if( h->mb.i_cbp_chroma )
-    {
-        if( CHROMA_FORMAT == CHROMA_422 )
-        {
-            x264_cabac_block_residual_422_dc_cbf( h, cb, 0, 1 );
-            x264_cabac_block_residual_422_dc_cbf( h, cb, 1, 1 );
-        }
-        else
-        {
-            x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
-            x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
-        }
-
-        if( h->mb.i_cbp_chroma == 2 )
-        {
-            int step = 8 << CHROMA_V_SHIFT;
-            for( int i = 16; i < 3*16; i += step )
-                for( int j = i; j < i+4; j++ )
-                    x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 );
-        }
-    }
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/encoder/cavlc.c b/android/src/main/libenc/jni/libx264/encoder/cavlc.c
deleted file mode 100755
index 4344b95..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/cavlc.c
+++ /dev/null
@@ -1,722 +0,0 @@
-/*****************************************************************************
- * cavlc.c: cavlc bitstream writing
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-
-#ifndef RDO_SKIP_BS
-#define RDO_SKIP_BS 0
-#endif
-
-/* [400,420][inter,intra] */
-static const uint8_t cbp_to_golomb[2][2][48] =
-{
-    {{ 0,  1,  2,  5,  3,  6, 14, 10,  4, 15,  7, 11,  8, 12, 13,  9 },
-     { 1, 10, 11,  6, 12,  7, 14,  2, 13, 15,  8,  3,  9,  4,  5,  0 }},
-    {{ 0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
-       1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
-       6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 },
-     { 3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
-      16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
-      41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0 }}
-};
-
-static const uint8_t mb_type_b_to_golomb[3][9]=
-{
-    { 4,  8, 12, 10,  6, 14, 16, 18, 20 }, /* D_16x8 */
-    { 5,  9, 13, 11,  7, 15, 17, 19, 21 }, /* D_8x16 */
-    { 1, -1, -1, -1,  2, -1, -1, -1,  3 }  /* D_16x16 */
-};
-
-static const uint8_t subpartition_p_to_golomb[4]=
-{
-    3, 1, 2, 0
-};
-
-static const uint8_t subpartition_b_to_golomb[13]=
-{
-    10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
-};
-
-#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
-
-/****************************************************************************
- * x264_cavlc_block_residual:
- ****************************************************************************/
-static inline int x264_cavlc_block_residual_escape( x264_t *h, int i_suffix_length, int level )
-{
-    bs_t *s = &h->out.bs;
-    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
-    int i_level_prefix = 15;
-    int mask = level >> 31;
-    int abs_level = (level^mask)-mask;
-    int i_level_code = abs_level*2-mask-2;
-    if( ( i_level_code >> i_suffix_length ) < 15 )
-    {
-        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
-                 (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
-    }
-    else
-    {
-        i_level_code -= 15 << i_suffix_length;
-        if( i_suffix_length == 0 )
-            i_level_code -= 15;
-
-        /* If the prefix size exceeds 15, High Profile is required. */
-        if( i_level_code >= 1<<12 )
-        {
-            if( h->sps->i_profile_idc >= PROFILE_HIGH )
-            {
-                while( i_level_code > 1<<(i_level_prefix-3) )
-                {
-                    i_level_code -= 1<<(i_level_prefix-3);
-                    i_level_prefix++;
-                }
-            }
-            else
-            {
-#if RDO_SKIP_BS
-                /* Weight highly against overflows. */
-                s->i_bits_encoded += 2000;
-#else
-                /* We've had an overflow; note it down and re-encode the MB later. */
-                h->mb.b_overflow = 1;
-#endif
-            }
-        }
-        bs_write( s, i_level_prefix + 1, 1 );
-        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
-    }
-    if( i_suffix_length == 0 )
-        i_suffix_length++;
-    if( abs_level > next_suffix[i_suffix_length] )
-        i_suffix_length++;
-    return i_suffix_length;
-}
-
-static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dctcoef *l, int nC )
-{
-    bs_t *s = &h->out.bs;
-    static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
-    static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
-    x264_run_level_t runlevel;
-    int i_total, i_trailing, i_total_zero, i_suffix_length;
-    unsigned int i_sign;
-
-    /* level and run and total */
-    i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
-    x264_prefetch( &x264_run_before[runlevel.mask] );
-    i_total_zero = runlevel.last + 1 - i_total;
-
-    /* branchless i_trailing calculation */
-    runlevel.level[i_total+0] = 2;
-    runlevel.level[i_total+1] = 2;
-    i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
-               | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
-               | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
-    i_trailing = ctz_index[i_trailing];
-    i_sign = ((runlevel.level[2] >> 31) & 1)
-           | ((runlevel.level[1] >> 31) & 2)
-           | ((runlevel.level[0] >> 31) & 4);
-    i_sign >>= 3-i_trailing;
-
-    /* total/trailing */
-    bs_write_vlc( s, x264_coeff_token[nC][i_total-1][i_trailing] );
-
-    i_suffix_length = i_total > 10 && i_trailing < 3;
-    bs_write( s, i_trailing, i_sign );
-
-    if( i_trailing < i_total )
-    {
-        int val = runlevel.level[i_trailing];
-        int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
-        val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
-        val += LEVEL_TABLE_SIZE/2;
-
-        if( (unsigned)val_original < LEVEL_TABLE_SIZE )
-        {
-            bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
-            i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
-        }
-        else
-            i_suffix_length = x264_cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
-        for( int i = i_trailing+1; i < i_total; i++ )
-        {
-            val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
-            if( (unsigned)val < LEVEL_TABLE_SIZE )
-            {
-                bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
-                i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
-            }
-            else
-                i_suffix_length = x264_cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
-        }
-    }
-
-    if( ctx_block_cat == DCT_CHROMA_DC )
-    {
-        if( i_total < 8>>CHROMA_V_SHIFT )
-        {
-            vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
-                                                            : x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
-            bs_write_vlc( s, total_zeros );
-        }
-    }
-    else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
-        bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
-
-    int zero_run_code = x264_run_before[runlevel.mask];
-    bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
-
-    return i_total;
-}
-
-static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
-
-#define x264_cavlc_block_residual(h,cat,idx,l)\
-{\
-    int nC = cat == DCT_CHROMA_DC ? 5 - CHROMA_V_SHIFT\
-                                  : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
-    uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
-    if( !*nnz )\
-        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
-    else\
-        *nnz = x264_cavlc_block_residual_internal(h,cat,l,nC);\
-}
-
-static void x264_cavlc_qp_delta( x264_t *h )
-{
-    bs_t *s = &h->out.bs;
-    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
-
-    /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely
-     * flat background area. Don't do this if it would raise the quantizer, since that could
-     * cause unexpected deblocking artifacts. */
-    if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
-        && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]]
-        && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]]
-        && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]]
-        && h->mb.i_qp > h->mb.i_last_qp )
-    {
-#if !RDO_SKIP_BS
-        h->mb.i_qp = h->mb.i_last_qp;
-#endif
-        i_dqp = 0;
-    }
-
-    if( i_dqp )
-    {
-        if( i_dqp < -(QP_MAX_SPEC+1)/2 )
-            i_dqp += QP_MAX_SPEC+1;
-        else if( i_dqp > QP_MAX_SPEC/2 )
-            i_dqp -= QP_MAX_SPEC+1;
-    }
-    bs_write_se( s, i_dqp );
-}
-
-static void x264_cavlc_mvd( x264_t *h, int i_list, int idx, int width )
-{
-    bs_t *s = &h->out.bs;
-    ALIGNED_4( int16_t mvp[2] );
-    x264_mb_predict_mv( h, i_list, idx, width, mvp );
-    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
-    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
-}
-
-static inline void x264_cavlc_8x8_mvd( x264_t *h, int i )
-{
-    switch( h->mb.i_sub_partition[i] )
-    {
-        case D_L0_8x8:
-            x264_cavlc_mvd( h, 0, 4*i, 2 );
-            break;
-        case D_L0_8x4:
-            x264_cavlc_mvd( h, 0, 4*i+0, 2 );
-            x264_cavlc_mvd( h, 0, 4*i+2, 2 );
-            break;
-        case D_L0_4x8:
-            x264_cavlc_mvd( h, 0, 4*i+0, 1 );
-            x264_cavlc_mvd( h, 0, 4*i+1, 1 );
-            break;
-        case D_L0_4x4:
-            x264_cavlc_mvd( h, 0, 4*i+0, 1 );
-            x264_cavlc_mvd( h, 0, 4*i+1, 1 );
-            x264_cavlc_mvd( h, 0, 4*i+2, 1 );
-            x264_cavlc_mvd( h, 0, 4*i+3, 1 );
-            break;
-    }
-}
-
-static ALWAYS_INLINE void x264_cavlc_macroblock_luma_residual( x264_t *h, int plane_count )
-{
-    if( h->mb.b_transform_8x8 )
-    {
-        /* shuffle 8x8 dct coeffs into 4x4 lists */
-        for( int p = 0; p < plane_count; p++ )
-            for( int i8 = 0; i8 < 4; i8++ )
-                if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] )
-                    h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8],
-                                                     &h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] );
-    }
-
-    for( int p = 0; p < plane_count; p++ )
-        FOREACH_BIT( i8, 0, h->mb.i_cbp_luma )
-            for( int i4 = 0; i4 < 4; i4++ )
-                x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] );
-}
-
-#if RDO_SKIP_BS
-static ALWAYS_INLINE void x264_cavlc_partition_luma_residual( x264_t *h, int i8, int p )
-{
-    if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4]] )
-        h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4+p*16], h->dct.luma8x8[i8+p*4],
-                                         &h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] );
-
-    if( h->mb.i_cbp_luma & (1 << i8) )
-        for( int i4 = 0; i4 < 4; i4++ )
-            x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] );
-}
-#endif
-
-static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma )
-{
-    bs_t *s = &h->out.bs;
-    if( i_mb_type == I_16x16 )
-    {
-        bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
-                        h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
-    }
-    else //if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
-    {
-        int di = i_mb_type == I_8x8 ? 4 : 1;
-        bs_write_ue( s, i_mb_i_offset + 0 );
-        if( h->pps->b_transform_8x8_mode )
-            bs_write1( s, h->mb.b_transform_8x8 );
-
-        /* Prediction: Luma */
-        for( int i = 0; i < 16; i += di )
-        {
-            int i_pred = x264_mb_predict_intra4x4_mode( h, i );
-            int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
-
-            if( i_pred == i_mode )
-                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
-            else
-                bs_write( s, 4, i_mode - (i_mode > i_pred) );
-        }
-
-    }
-    if( chroma )
-        bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
-}
-
-static ALWAYS_INLINE void x264_cavlc_mb_header_p( x264_t *h, int i_mb_type, int chroma )
-{
-    bs_t *s = &h->out.bs;
-    if( i_mb_type == P_L0 )
-    {
-        if( h->mb.i_partition == D_16x16 )
-        {
-            bs_write1( s, 1 );
-
-            if( h->mb.pic.i_fref[0] > 1 )
-                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-            x264_cavlc_mvd( h, 0, 0, 4 );
-        }
-        else if( h->mb.i_partition == D_16x8 )
-        {
-            bs_write_ue( s, 1 );
-            if( h->mb.pic.i_fref[0] > 1 )
-            {
-                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
-            }
-            x264_cavlc_mvd( h, 0, 0, 4 );
-            x264_cavlc_mvd( h, 0, 8, 4 );
-        }
-        else if( h->mb.i_partition == D_8x16 )
-        {
-            bs_write_ue( s, 2 );
-            if( h->mb.pic.i_fref[0] > 1 )
-            {
-                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
-            }
-            x264_cavlc_mvd( h, 0, 0, 2 );
-            x264_cavlc_mvd( h, 0, 4, 2 );
-        }
-    }
-    else if( i_mb_type == P_8x8 )
-    {
-        int b_sub_ref;
-        if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
-             h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
-        {
-            bs_write_ue( s, 4 );
-            b_sub_ref = 0;
-        }
-        else
-        {
-            bs_write_ue( s, 3 );
-            b_sub_ref = 1;
-        }
-
-        /* sub mb type */
-        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
-            for( int i = 0; i < 4; i++ )
-                bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i] ] );
-        else
-            bs_write( s, 4, 0xf );
-
-        /* ref0 */
-        if( b_sub_ref )
-        {
-            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
-            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
-            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
-        }
-
-        for( int i = 0; i < 4; i++ )
-            x264_cavlc_8x8_mvd( h, i );
-    }
-    else //if( IS_INTRA( i_mb_type ) )
-        x264_cavlc_mb_header_i( h, i_mb_type, 5, chroma );
-}
-
-static ALWAYS_INLINE void x264_cavlc_mb_header_b( x264_t *h, int i_mb_type, int chroma )
-{
-    bs_t *s = &h->out.bs;
-    if( i_mb_type == B_8x8 )
-    {
-        bs_write_ue( s, 22 );
-
-        /* sub mb type */
-        for( int i = 0; i < 4; i++ )
-            bs_write_ue( s, subpartition_b_to_golomb[ h->mb.i_sub_partition[i] ] );
-
-        /* ref */
-        if( h->mb.pic.i_fref[0] > 1 )
-            for( int i = 0; i < 4; i++ )
-                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
-                    bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
-        if( h->mb.pic.i_fref[1] > 1 )
-            for( int i = 0; i < 4; i++ )
-                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
-                    bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
-
-        /* mvd */
-        for( int i = 0; i < 4; i++ )
-            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
-                x264_cavlc_mvd( h, 0, 4*i, 2 );
-        for( int i = 0; i < 4; i++ )
-            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
-                x264_cavlc_mvd( h, 1, 4*i, 2 );
-    }
-    else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI )
-    {
-        /* All B mode */
-        /* Motion Vector */
-        const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
-        const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
-        const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
-
-        bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
-        if( h->mb.i_partition == D_16x16 )
-        {
-            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
-            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
-            if( b_list[0][0] ) x264_cavlc_mvd( h, 0, 0, 4 );
-            if( b_list[1][0] ) x264_cavlc_mvd( h, 1, 0, 4 );
-        }
-        else
-        {
-            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
-            if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
-            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
-            if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
-            if( h->mb.i_partition == D_16x8 )
-            {
-                if( b_list[0][0] ) x264_cavlc_mvd( h, 0, 0, 4 );
-                if( b_list[0][1] ) x264_cavlc_mvd( h, 0, 8, 4 );
-                if( b_list[1][0] ) x264_cavlc_mvd( h, 1, 0, 4 );
-                if( b_list[1][1] ) x264_cavlc_mvd( h, 1, 8, 4 );
-            }
-            else //if( h->mb.i_partition == D_8x16 )
-            {
-                if( b_list[0][0] ) x264_cavlc_mvd( h, 0, 0, 2 );
-                if( b_list[0][1] ) x264_cavlc_mvd( h, 0, 4, 2 );
-                if( b_list[1][0] ) x264_cavlc_mvd( h, 1, 0, 2 );
-                if( b_list[1][1] ) x264_cavlc_mvd( h, 1, 4, 2 );
-            }
-        }
-    }
-    else if( i_mb_type == B_DIRECT )
-        bs_write1( s, 1 );
-    else //if( IS_INTRA( i_mb_type ) )
-        x264_cavlc_mb_header_i( h, i_mb_type, 23, chroma );
-}
-
-/*****************************************************************************
- * x264_macroblock_write:
- *****************************************************************************/
-void x264_macroblock_write_cavlc( x264_t *h )
-{
-    bs_t *s = &h->out.bs;
-    const int i_mb_type = h->mb.i_type;
-    int plane_count = CHROMA444 ? 3 : 1;
-    int chroma = !CHROMA444;
-
-#if RDO_SKIP_BS
-    s->i_bits_encoded = 0;
-#else
-    const int i_mb_pos_start = bs_pos( s );
-    int       i_mb_pos_tex;
-#endif
-
-    if( SLICE_MBAFF
-        && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
-    {
-        bs_write1( s, MB_INTERLACED );
-#if !RDO_SKIP_BS
-        h->mb.field_decoding_flag = MB_INTERLACED;
-#endif
-    }
-
-#if !RDO_SKIP_BS
-    if( i_mb_type == I_PCM )
-    {
-        static const uint8_t i_offsets[3] = {5,23,0};
-        uint8_t *p_start = s->p_start;
-        bs_write_ue( s, i_offsets[h->sh.i_type] + 25 );
-        i_mb_pos_tex = bs_pos( s );
-        h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
-
-        bs_align_0( s );
-
-        for( int p = 0; p < plane_count; p++ )
-            for( int i = 0; i < 256; i++ )
-                bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
-        if( chroma )
-            for( int ch = 1; ch < 3; ch++ )
-                for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
-                    for( int j = 0; j < 8; j++ )
-                        bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
-
-        bs_init( s, s->p, s->p_end - s->p );
-        s->p_start = p_start;
-
-        h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
-        return;
-    }
-#endif
-
-    if( h->sh.i_type == SLICE_TYPE_P )
-        x264_cavlc_mb_header_p( h, i_mb_type, chroma );
-    else if( h->sh.i_type == SLICE_TYPE_B )
-        x264_cavlc_mb_header_b( h, i_mb_type, chroma );
-    else //if( h->sh.i_type == SLICE_TYPE_I )
-        x264_cavlc_mb_header_i( h, i_mb_type, 0, chroma );
-
-#if !RDO_SKIP_BS
-    i_mb_pos_tex = bs_pos( s );
-    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
-#endif
-
-    /* Coded block pattern */
-    if( i_mb_type != I_16x16 )
-        bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
-
-    /* transform size 8x8 flag */
-    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
-        bs_write1( s, h->mb.b_transform_8x8 );
-
-    if( i_mb_type == I_16x16 )
-    {
-        x264_cavlc_qp_delta( h );
-
-        /* DC Luma */
-        for( int p = 0; p < plane_count; p++ )
-        {
-            x264_cavlc_block_residual( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] );
-
-            /* AC Luma */
-            if( h->mb.i_cbp_luma )
-                for( int i = p*16; i < p*16+16; i++ )
-                    x264_cavlc_block_residual( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
-        }
-    }
-    else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
-    {
-        x264_cavlc_qp_delta( h );
-        x264_cavlc_macroblock_luma_residual( h, plane_count );
-    }
-    if( h->mb.i_cbp_chroma )
-    {
-        /* Chroma DC residual present */
-        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
-        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
-        if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
-        {
-            int step = 8 << CHROMA_V_SHIFT;
-            for( int i = 16; i < 3*16; i += step )
-                for( int j = i; j < i+4; j++ )
-                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
-        }
-    }
-
-#if !RDO_SKIP_BS
-    h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
-#endif
-}
-
-#if RDO_SKIP_BS
-/*****************************************************************************
- * RD only; doesn't generate a valid bitstream
- * doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref (never varies between calls, so no point in doing so)
- * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
- * works on all partition sizes except 16x16
- *****************************************************************************/
-static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
-{
-    bs_t *s = &h->out.bs;
-    const int i_mb_type = h->mb.i_type;
-    int b_8x16 = h->mb.i_partition == D_8x16;
-    int plane_count = CHROMA444 ? 3 : 1;
-    int j;
-
-    h->out.bs.i_bits_encoded = 0;
-
-    if( i_mb_type == P_8x8 )
-    {
-        x264_cavlc_8x8_mvd( h, i8 );
-        bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
-    }
-    else if( i_mb_type == P_L0 )
-        x264_cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 );
-    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
-    {
-        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 );
-        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cavlc_mvd( h, 1, 4*i8, 4>>b_8x16 );
-    }
-    else //if( i_mb_type == B_8x8 )
-    {
-        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
-            x264_cavlc_mvd( h, 0, 4*i8, 2 );
-        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
-            x264_cavlc_mvd( h, 1, 4*i8, 2 );
-    }
-
-    for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
-    {
-        for( int p = 0; p < plane_count; p++ )
-            x264_cavlc_partition_luma_residual( h, i8, p );
-        if( h->mb.i_cbp_chroma )
-        {
-            if( CHROMA_FORMAT == CHROMA_422 )
-            {
-                int offset = (5*i8) & 0x09;
-                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 );
-                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 );
-                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 );
-                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 );
-            }
-            else
-            {
-                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
-                x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
-            }
-        }
-        i8 += x264_pixel_size[i_pixel].h >> 3;
-    }
-
-    return h->out.bs.i_bits_encoded;
-}
-
-static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
-{
-    int plane_count = CHROMA444 ? 3 : 1;
-    int b_8x4 = i_pixel == PIXEL_8x4;
-    h->out.bs.i_bits_encoded = 0;
-    x264_cavlc_mvd( h, 0, i4, 1+b_8x4 );
-    for( int p = 0; p < plane_count; p++ )
-    {
-        x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] );
-        if( i_pixel != PIXEL_4x4 )
-            x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4] );
-    }
-
-    return h->out.bs.i_bits_encoded;
-}
-
-static int x264_cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
-{
-    if( x264_mb_predict_intra4x4_mode( h, i4 ) == x264_mb_pred_mode4x4_fix( i_mode ) )
-        return 1;
-    else
-        return 4;
-}
-
-static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
-{
-    int plane_count = CHROMA444 ? 3 : 1;
-    h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
-    bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
-    for( int p = 0; p < plane_count; p++ )
-        x264_cavlc_partition_luma_residual( h, i8, p );
-    return h->out.bs.i_bits_encoded;
-}
-
-static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
-{
-    int plane_count = CHROMA444 ? 3 : 1;
-    h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, i4, i_mode );
-    for( int p = 0; p < plane_count; p++ )
-        x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] );
-    return h->out.bs.i_bits_encoded;
-}
-
-static int x264_chroma_size_cavlc( x264_t *h )
-{
-    h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
-    if( h->mb.i_cbp_chroma )
-    {
-        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
-        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
-
-        if( h->mb.i_cbp_chroma == 2 )
-        {
-            int step = 8 << CHROMA_V_SHIFT;
-            for( int i = 16; i < 3*16; i += step )
-                for( int j = i; j < i+4; j++ )
-                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
-        }
-    }
-    return h->out.bs.i_bits_encoded;
-}
-#endif
diff --git a/android/src/main/libenc/jni/libx264/encoder/encoder.c b/android/src/main/libenc/jni/libx264/encoder/encoder.c
deleted file mode 100755
index f733adf..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/encoder.c
+++ /dev/null
@@ -1,4424 +0,0 @@
-/*****************************************************************************
- * encoder.c: top-level encoder functions
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-
-#include "set.h"
-#include "analyse.h"
-#include "ratecontrol.h"
-#include "macroblock.h"
-#include "me.h"
-#if HAVE_INTEL_DISPATCHER
-#include "extras/intel_dispatcher.h"
-#endif
-
-//#define DEBUG_MB_TYPE
-
-#define bs_write_ue bs_write_ue_big
-
-static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
-                                   x264_nal_t **pp_nal, int *pi_nal,
-                                   x264_picture_t *pic_out );
-
-/****************************************************************************
- *
- ******************************* x264 libs **********************************
- *
- ****************************************************************************/
-static double x264_psnr( double sqe, double size )
-{
-    double mse = sqe / (PIXEL_MAX*PIXEL_MAX * size);
-    if( mse <= 0.0000000001 ) /* Max 100dB */
-        return 100;
-
-    return -10.0 * log10( mse );
-}
-
-static double x264_ssim( double ssim )
-{
-    double inv_ssim = 1 - ssim;
-    if( inv_ssim <= 0.0000000001 ) /* Max 100dB */
-        return 100;
-
-    return -10.0 * log10( inv_ssim );
-}
-
-static int x264_threadpool_wait_all( x264_t *h )
-{
-    for( int i = 0; i < h->param.i_threads; i++ )
-        if( h->thread[i]->b_thread_active )
-        {
-            h->thread[i]->b_thread_active = 0;
-            if( (intptr_t)x264_threadpool_wait( h->threadpool, h->thread[i] ) < 0 )
-                return -1;
-        }
-    return 0;
-}
-
-static void x264_frame_dump( x264_t *h )
-{
-    FILE *f = x264_fopen( h->param.psz_dump_yuv, "r+b" );
-    if( !f )
-        return;
-
-    /* Wait for the threads to finish deblocking */
-    if( h->param.b_sliced_threads )
-        x264_threadpool_wait_all( h );
-
-    /* Write the frame in display order */
-    int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * sizeof(pixel) );
-    if( !fseek( f, (int64_t)h->fdec->i_frame * frame_size, SEEK_SET ) )
-    {
-        for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
-            for( int y = 0; y < h->param.i_height; y++ )
-                fwrite( &h->fdec->plane[p][y*h->fdec->i_stride[p]], sizeof(pixel), h->param.i_width, f );
-        if( !CHROMA444 )
-        {
-            int cw = h->param.i_width>>1;
-            int ch = h->param.i_height>>CHROMA_V_SHIFT;
-            pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
-            if( planeu )
-            {
-                pixel *planev = planeu + cw*ch + 16;
-                h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
-                fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
-                fwrite( planev, 1, cw*ch*sizeof(pixel), f );
-                x264_free( planeu );
-            }
-        }
-    }
-    fclose( f );
-}
-
-/* Fill "default" values */
-static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
-                                    x264_sps_t *sps, x264_pps_t *pps,
-                                    int i_idr_pic_id, int i_frame, int i_qp )
-{
-    x264_param_t *param = &h->param;
-
-    /* First we fill all fields */
-    sh->sps = sps;
-    sh->pps = pps;
-
-    sh->i_first_mb  = 0;
-    sh->i_last_mb   = h->mb.i_mb_count - 1;
-    sh->i_pps_id    = pps->i_id;
-
-    sh->i_frame_num = i_frame;
-
-    sh->b_mbaff = PARAM_INTERLACED;
-    sh->b_field_pic = 0;    /* no field support for now */
-    sh->b_bottom_field = 0; /* not yet used */
-
-    sh->i_idr_pic_id = i_idr_pic_id;
-
-    /* poc stuff, fixed later */
-    sh->i_poc = 0;
-    sh->i_delta_poc_bottom = 0;
-    sh->i_delta_poc[0] = 0;
-    sh->i_delta_poc[1] = 0;
-
-    sh->i_redundant_pic_cnt = 0;
-
-    h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
-                                && h->param.i_bframe
-                                && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
-
-    if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B )
-    {
-        if( h->fref[1][0]->i_poc_l0ref0 == h->fref[0][0]->i_poc )
-        {
-            if( h->mb.b_direct_auto_write )
-                sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
-            else
-                sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
-        }
-        else
-        {
-            h->mb.b_direct_auto_write = 0;
-            sh->b_direct_spatial_mv_pred = 1;
-        }
-    }
-    /* else b_direct_spatial_mv_pred was read from the 2pass statsfile */
-
-    sh->b_num_ref_idx_override = 0;
-    sh->i_num_ref_idx_l0_active = 1;
-    sh->i_num_ref_idx_l1_active = 1;
-
-    sh->b_ref_pic_list_reordering[0] = h->b_ref_reorder[0];
-    sh->b_ref_pic_list_reordering[1] = h->b_ref_reorder[1];
-
-    /* If the ref list isn't in the default order, construct reordering header */
-    for( int list = 0; list < 2; list++ )
-    {
-        if( sh->b_ref_pic_list_reordering[list] )
-        {
-            int pred_frame_num = i_frame;
-            for( int i = 0; i < h->i_ref[list]; i++ )
-            {
-                int diff = h->fref[list][i]->i_frame_num - pred_frame_num;
-                sh->ref_pic_list_order[list][i].idc = ( diff > 0 );
-                sh->ref_pic_list_order[list][i].arg = (abs(diff) - 1) & ((1 << sps->i_log2_max_frame_num) - 1);
-                pred_frame_num = h->fref[list][i]->i_frame_num;
-            }
-        }
-    }
-
-    sh->i_cabac_init_idc = param->i_cabac_init_idc;
-
-    sh->i_qp = SPEC_QP(i_qp);
-    sh->i_qp_delta = sh->i_qp - pps->i_pic_init_qp;
-    sh->b_sp_for_swidth = 0;
-    sh->i_qs_delta = 0;
-
-    int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
-    /* If effective qp <= 15, deblocking would have no effect anyway */
-    if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
-        sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
-    else
-        sh->i_disable_deblocking_filter_idc = 1;
-    sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
-    sh->i_beta_offset = param->i_deblocking_filter_beta << 1;
-}
-
-static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal_ref_idc )
-{
-    if( sh->b_mbaff )
-    {
-        int first_x = sh->i_first_mb % sh->sps->i_mb_width;
-        int first_y = sh->i_first_mb / sh->sps->i_mb_width;
-        assert( (first_y&1) == 0 );
-        bs_write_ue( s, (2*first_x + sh->sps->i_mb_width*(first_y&~1) + (first_y&1)) >> 1 );
-    }
-    else
-        bs_write_ue( s, sh->i_first_mb );
-
-    bs_write_ue( s, sh->i_type + 5 );   /* same type things */
-    bs_write_ue( s, sh->i_pps_id );
-    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num & ((1<<sh->sps->i_log2_max_frame_num)-1) );
-
-    if( !sh->sps->b_frame_mbs_only )
-    {
-        bs_write1( s, sh->b_field_pic );
-        if( sh->b_field_pic )
-            bs_write1( s, sh->b_bottom_field );
-    }
-
-    if( sh->i_idr_pic_id >= 0 ) /* NAL IDR */
-        bs_write_ue( s, sh->i_idr_pic_id );
-
-    if( sh->sps->i_poc_type == 0 )
-    {
-        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc & ((1<<sh->sps->i_log2_max_poc_lsb)-1) );
-        if( sh->pps->b_pic_order && !sh->b_field_pic )
-            bs_write_se( s, sh->i_delta_poc_bottom );
-    }
-
-    if( sh->pps->b_redundant_pic_cnt )
-        bs_write_ue( s, sh->i_redundant_pic_cnt );
-
-    if( sh->i_type == SLICE_TYPE_B )
-        bs_write1( s, sh->b_direct_spatial_mv_pred );
-
-    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_B )
-    {
-        bs_write1( s, sh->b_num_ref_idx_override );
-        if( sh->b_num_ref_idx_override )
-        {
-            bs_write_ue( s, sh->i_num_ref_idx_l0_active - 1 );
-            if( sh->i_type == SLICE_TYPE_B )
-                bs_write_ue( s, sh->i_num_ref_idx_l1_active - 1 );
-        }
-    }
-
-    /* ref pic list reordering */
-    if( sh->i_type != SLICE_TYPE_I )
-    {
-        bs_write1( s, sh->b_ref_pic_list_reordering[0] );
-        if( sh->b_ref_pic_list_reordering[0] )
-        {
-            for( int i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
-            {
-                bs_write_ue( s, sh->ref_pic_list_order[0][i].idc );
-                bs_write_ue( s, sh->ref_pic_list_order[0][i].arg );
-            }
-            bs_write_ue( s, 3 );
-        }
-    }
-    if( sh->i_type == SLICE_TYPE_B )
-    {
-        bs_write1( s, sh->b_ref_pic_list_reordering[1] );
-        if( sh->b_ref_pic_list_reordering[1] )
-        {
-            for( int i = 0; i < sh->i_num_ref_idx_l1_active; i++ )
-            {
-                bs_write_ue( s, sh->ref_pic_list_order[1][i].idc );
-                bs_write_ue( s, sh->ref_pic_list_order[1][i].arg );
-            }
-            bs_write_ue( s, 3 );
-        }
-    }
-
-    sh->b_weighted_pred = 0;
-    if( sh->pps->b_weighted_pred && sh->i_type == SLICE_TYPE_P )
-    {
-        sh->b_weighted_pred = sh->weight[0][0].weightfn || sh->weight[0][1].weightfn || sh->weight[0][2].weightfn;
-        /* pred_weight_table() */
-        bs_write_ue( s, sh->weight[0][0].i_denom );
-        bs_write_ue( s, sh->weight[0][1].i_denom );
-        for( int i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
-        {
-            int luma_weight_l0_flag = !!sh->weight[i][0].weightfn;
-            int chroma_weight_l0_flag = !!sh->weight[i][1].weightfn || !!sh->weight[i][2].weightfn;
-            bs_write1( s, luma_weight_l0_flag );
-            if( luma_weight_l0_flag )
-            {
-                bs_write_se( s, sh->weight[i][0].i_scale );
-                bs_write_se( s, sh->weight[i][0].i_offset );
-            }
-            bs_write1( s, chroma_weight_l0_flag );
-            if( chroma_weight_l0_flag )
-            {
-                for( int j = 1; j < 3; j++ )
-                {
-                    bs_write_se( s, sh->weight[i][j].i_scale );
-                    bs_write_se( s, sh->weight[i][j].i_offset );
-                }
-            }
-        }
-    }
-    else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B )
-    {
-      /* TODO */
-    }
-
-    if( i_nal_ref_idc != 0 )
-    {
-        if( sh->i_idr_pic_id >= 0 )
-        {
-            bs_write1( s, 0 );  /* no output of prior pics flag */
-            bs_write1( s, 0 );  /* long term reference flag */
-        }
-        else
-        {
-            bs_write1( s, sh->i_mmco_command_count > 0 ); /* adaptive_ref_pic_marking_mode_flag */
-            if( sh->i_mmco_command_count > 0 )
-            {
-                for( int i = 0; i < sh->i_mmco_command_count; i++ )
-                {
-                    bs_write_ue( s, 1 ); /* mark short term ref as unused */
-                    bs_write_ue( s, sh->mmco[i].i_difference_of_pic_nums - 1 );
-                }
-                bs_write_ue( s, 0 ); /* end command list */
-            }
-        }
-    }
-
-    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I )
-        bs_write_ue( s, sh->i_cabac_init_idc );
-
-    bs_write_se( s, sh->i_qp_delta );      /* slice qp delta */
-
-    if( sh->pps->b_deblocking_filter_control )
-    {
-        bs_write_ue( s, sh->i_disable_deblocking_filter_idc );
-        if( sh->i_disable_deblocking_filter_idc != 1 )
-        {
-            bs_write_se( s, sh->i_alpha_c0_offset >> 1 );
-            bs_write_se( s, sh->i_beta_offset >> 1 );
-        }
-    }
-}
-
-/* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */
-/* reallocate, adding an arbitrary amount of space. */
-static int x264_bitstream_check_buffer_internal( x264_t *h, int size, int b_cabac, int i_nal )
-{
-    if( (b_cabac && (h->cabac.p_end - h->cabac.p < size)) ||
-        (h->out.bs.p_end - h->out.bs.p < size) )
-    {
-        int buf_size = h->out.i_bitstream + size;
-        uint8_t *buf = x264_malloc( buf_size );
-        if( !buf )
-            return -1;
-        int aligned_size = h->out.i_bitstream & ~15;
-        h->mc.memcpy_aligned( buf, h->out.p_bitstream, aligned_size );
-        memcpy( buf + aligned_size, h->out.p_bitstream + aligned_size, h->out.i_bitstream - aligned_size );
-
-        intptr_t delta = buf - h->out.p_bitstream;
-
-        h->out.bs.p_start += delta;
-        h->out.bs.p += delta;
-        h->out.bs.p_end = buf + buf_size;
-
-        h->cabac.p_start += delta;
-        h->cabac.p += delta;
-        h->cabac.p_end = buf + buf_size;
-
-        for( int i = 0; i <= i_nal; i++ )
-            h->out.nal[i].p_payload += delta;
-
-        x264_free( h->out.p_bitstream );
-        h->out.p_bitstream = buf;
-        h->out.i_bitstream = buf_size;
-    }
-    return 0;
-}
-
-static int x264_bitstream_check_buffer( x264_t *h )
-{
-    int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width;
-    return x264_bitstream_check_buffer_internal( h, max_row_size, h->param.b_cabac, h->out.i_nal );
-}
-
-static int x264_bitstream_check_buffer_filler( x264_t *h, int filler )
-{
-    filler += 32; // add padding for safety
-    return x264_bitstream_check_buffer_internal( h, filler, 0, -1 );
-}
-
-#if HAVE_THREAD
-static void x264_encoder_thread_init( x264_t *h )
-{
-    if( h->param.i_sync_lookahead )
-        x264_lower_thread_priority( 10 );
-}
-#endif
-
-/****************************************************************************
- *
- ****************************************************************************
- ****************************** External API*********************************
- ****************************************************************************
- *
- ****************************************************************************/
-
-static int x264_validate_parameters( x264_t *h, int b_open )
-{
-    if( !h->param.pf_log )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "pf_log not set! did you forget to call x264_param_default?\n" );
-        return -1;
-    }
-
-#if HAVE_MMX
-    if( b_open )
-    {
-        int cpuflags = x264_cpu_detect();
-        int fail = 0;
-#ifdef __SSE__
-        if( !(cpuflags & X264_CPU_SSE) )
-        {
-            x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n");
-            fail = 1;
-        }
-#else
-        if( !(cpuflags & X264_CPU_MMX2) )
-        {
-            x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n");
-            fail = 1;
-        }
-#endif
-        if( !fail && !(cpuflags & X264_CPU_CMOV) )
-        {
-            x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
-            fail = 1;
-        }
-        if( fail )
-        {
-            x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
-            return -1;
-        }
-    }
-#endif
-
-#if HAVE_INTERLACED
-    h->param.b_interlaced = !!PARAM_INTERLACED;
-#else
-    if( h->param.b_interlaced )
-    {
-        x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
-        return -1;
-    }
-#endif
-
-    if( h->param.i_width <= 0 || h->param.i_height <= 0 )
-    {
-        x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
-                  h->param.i_width, h->param.i_height );
-        return -1;
-    }
-
-    int i_csp = h->param.i_csp & X264_CSP_MASK;
-#if X264_CHROMA_FORMAT
-    if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 )
-    {
-        x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
-        return -1;
-    }
-    else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 )
-    {
-        x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
-        return -1;
-    }
-    else if( CHROMA_FORMAT != CHROMA_444 && i_csp >= X264_CSP_I444 && i_csp <= X264_CSP_RGB )
-    {
-        x264_log( h, X264_LOG_ERROR, "not compiled with 4:4:4 support\n" );
-        return -1;
-    }
-#endif
-    if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
-    {
-        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
-        return -1;
-    }
-
-    int w_mod = i_csp < X264_CSP_I444 ? 2 : 1;
-    int h_mod = (i_csp < X264_CSP_I422 ? 2 : 1) << PARAM_INTERLACED;
-    if( h->param.i_width % w_mod )
-    {
-        x264_log( h, X264_LOG_ERROR, "width not divisible by %d (%dx%d)\n",
-                  w_mod, h->param.i_width, h->param.i_height );
-        return -1;
-    }
-    if( h->param.i_height % h_mod )
-    {
-        x264_log( h, X264_LOG_ERROR, "height not divisible by %d (%dx%d)\n",
-                  h_mod, h->param.i_width, h->param.i_height );
-        return -1;
-    }
-
-    if( h->param.crop_rect.i_left   >= h->param.i_width ||
-        h->param.crop_rect.i_right  >= h->param.i_width ||
-        h->param.crop_rect.i_top    >= h->param.i_height ||
-        h->param.crop_rect.i_bottom >= h->param.i_height ||
-        h->param.crop_rect.i_left + h->param.crop_rect.i_right  >= h->param.i_width ||
-        h->param.crop_rect.i_top  + h->param.crop_rect.i_bottom >= h->param.i_height )
-    {
-        x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left,
-                  h->param.crop_rect.i_top, h->param.crop_rect.i_right,  h->param.crop_rect.i_bottom );
-        return -1;
-    }
-    if( h->param.crop_rect.i_left % w_mod || h->param.crop_rect.i_right  % w_mod ||
-        h->param.crop_rect.i_top  % h_mod || h->param.crop_rect.i_bottom % h_mod )
-    {
-        x264_log( h, X264_LOG_ERROR, "crop-rect %u,%u,%u,%u not divisible by %dx%d\n", h->param.crop_rect.i_left,
-                  h->param.crop_rect.i_top, h->param.crop_rect.i_right,  h->param.crop_rect.i_bottom, w_mod, h_mod );
-        return -1;
-    }
-
-    if( h->param.vui.i_sar_width <= 0 || h->param.vui.i_sar_height <= 0 )
-    {
-        h->param.vui.i_sar_width = 0;
-        h->param.vui.i_sar_height = 0;
-    }
-
-    if( h->param.i_threads == X264_THREADS_AUTO )
-    {
-        h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
-        /* Avoid too many threads as they don't improve performance and
-         * complicate VBV. Capped at an arbitrary 2 rows per thread. */
-        int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 2 );
-        h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
-    }
-    int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
-    if( h->param.i_threads > 1 )
-    {
-#if !HAVE_THREAD
-        x264_log( h, X264_LOG_WARNING, "not compiled with thread support!\n");
-        h->param.i_threads = 1;
-#endif
-        /* Avoid absurdly small thread slices as they can reduce performance
-         * and VBV compliance.  Capped at an arbitrary 4 rows per thread. */
-        if( h->param.b_sliced_threads )
-            h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads );
-    }
-    h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
-    if( h->param.i_threads == 1 )
-    {
-        h->param.b_sliced_threads = 0;
-        h->param.i_lookahead_threads = 1;
-    }
-    h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
-    if( h->i_thread_frames > 1 )
-        h->param.nalu_process = NULL;
-
-    if( h->param.b_opencl )
-    {
-#if !HAVE_OPENCL
-        x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" );
-        h->param.b_opencl = 0;
-#elif BIT_DEPTH > 8
-        x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" );
-        h->param.b_opencl = 0;
-#else
-        if( h->param.i_width < 32 || h->param.i_height < 32 )
-        {
-            x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" );
-            h->param.b_opencl = 0;
-        }
-#endif
-        if( h->param.opencl_device_id && h->param.i_opencl_device )
-        {
-            x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" );
-            h->param.i_opencl_device = 0;
-        }
-    }
-
-    h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
-    if( h->param.i_keyint_max == 1 )
-    {
-        h->param.b_intra_refresh = 0;
-        h->param.analyse.i_weighted_pred = 0;
-        h->param.i_frame_reference = 1;
-        h->param.i_dpb_size = 1;
-    }
-
-    if( h->param.i_frame_packing < -1 || h->param.i_frame_packing > 7 )
-    {
-        x264_log( h, X264_LOG_WARNING, "ignoring unknown frame packing value\n" );
-        h->param.i_frame_packing = -1;
-    }
-    if( h->param.i_frame_packing == 7 &&
-        ((h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right)  % 3 ||
-         (h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom) % 3) )
-    {
-        x264_log( h, X264_LOG_ERROR, "cropped resolution %dx%d not compatible with tile format frame packing\n",
-                  h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right,
-                  h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom );
-        return -1;
-    }
-
-    /* Detect default ffmpeg settings and terminate with an error. */
-    if( b_open )
-    {
-        int score = 0;
-        score += h->param.analyse.i_me_range == 0;
-        score += h->param.rc.i_qp_step == 3;
-        score += h->param.i_keyint_max == 12;
-        score += h->param.rc.i_qp_min == 2;
-        score += h->param.rc.i_qp_max == 31;
-        score += h->param.rc.f_qcompress == 0.5;
-        score += fabs(h->param.rc.f_ip_factor - 1.25) < 0.01;
-        score += fabs(h->param.rc.f_pb_factor - 1.25) < 0.01;
-        score += h->param.analyse.inter == 0 && h->param.analyse.i_subpel_refine == 8;
-        if( score >= 5 )
-        {
-            x264_log( h, X264_LOG_ERROR, "broken ffmpeg default settings detected\n" );
-            x264_log( h, X264_LOG_ERROR, "use an encoding preset (e.g. -vpre medium)\n" );
-            x264_log( h, X264_LOG_ERROR, "preset usage: -vpre <speed> -vpre <profile>\n" );
-            x264_log( h, X264_LOG_ERROR, "speed presets are listed in x264 --help\n" );
-            x264_log( h, X264_LOG_ERROR, "profile is optional; x264 defaults to high\n" );
-            return -1;
-        }
-    }
-
-    if( h->param.rc.i_rc_method < 0 || h->param.rc.i_rc_method > 2 )
-    {
-        x264_log( h, X264_LOG_ERROR, "no ratecontrol method specified\n" );
-        return -1;
-    }
-
-    if( PARAM_INTERLACED )
-        h->param.b_pic_struct = 1;
-
-    if( h->param.i_avcintra_class )
-    {
-        if( BIT_DEPTH != 10 )
-        {
-            x264_log( h, X264_LOG_ERROR, "%2d-bit AVC-Intra is not widely compatible\n", BIT_DEPTH );
-            x264_log( h, X264_LOG_ERROR, "10-bit x264 is required to encode AVC-Intra\n" );
-            return -1;
-        }
-
-        int type = h->param.i_avcintra_class == 200 ? 2 :
-                   h->param.i_avcintra_class == 100 ? 1 :
-                   h->param.i_avcintra_class == 50 ? 0 : -1;
-        if( type < 0 )
-        {
-            x264_log( h, X264_LOG_ERROR, "Invalid AVC-Intra class\n" );
-            return -1;
-        }
-
-        /* [50/100/200][res][fps] */
-        static const struct
-        {
-            uint16_t fps_num;
-            uint16_t fps_den;
-            uint8_t interlaced;
-            uint16_t frame_size;
-            const uint8_t *cqm_4ic;
-            const uint8_t *cqm_8iy;
-        } avcintra_lut[3][2][7] =
-        {
-            {{{ 60000, 1001, 0,  912, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              {    50,    1, 0, 1100, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              { 30000, 1001, 0,  912, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              {    25,    1, 0, 1100, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              { 24000, 1001, 0,  912, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }},
-             {{ 30000, 1001, 1, 1820, x264_cqm_avci50_4ic, x264_cqm_avci50_1080i_8iy },
-              {    25,    1, 1, 2196, x264_cqm_avci50_4ic, x264_cqm_avci50_1080i_8iy },
-              { 60000, 1001, 0, 1820, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              { 30000, 1001, 0, 1820, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              {    50,    1, 0, 2196, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              {    25,    1, 0, 2196, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy },
-              { 24000, 1001, 0, 1820, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }}},
-            {{{ 60000, 1001, 0, 1848, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy },
-              {    50,    1, 0, 2224, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy },
-              { 30000, 1001, 0, 1848, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy },
-              {    25,    1, 0, 2224, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy },
-              { 24000, 1001, 0, 1848, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }},
-             {{ 30000, 1001, 1, 3692, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy },
-              {    25,    1, 1, 4444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy },
-              { 60000, 1001, 0, 3692, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              { 30000, 1001, 0, 3692, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              {    50,    1, 0, 4444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              {    25,    1, 0, 4444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              { 24000, 1001, 0, 3692, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }}},
-            {{{ 60000, 1001, 0, 3724, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy },
-              {    50,    1, 0, 4472, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }},
-             {{ 30000, 1001, 1, 7444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy },
-              {    25,    1, 1, 8940, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy },
-              { 60000, 1001, 0, 7444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              { 30000, 1001, 0, 7444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              {    50,    1, 0, 8940, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              {    25,    1, 0, 8940, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy },
-              { 24000, 1001, 0, 7444, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }}}
-        };
-
-        int res = -1;
-        if( i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 && !type )
-        {
-            if(      h->param.i_width == 1440 && h->param.i_height == 1080 ) res =  1;
-            else if( h->param.i_width ==  960 && h->param.i_height ==  720 ) res =  0;
-        }
-        else if( i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 && type )
-        {
-            if(      h->param.i_width == 1920 && h->param.i_height == 1080 ) res =  1;
-            else if( h->param.i_width == 1280 && h->param.i_height ==  720 ) res =  0;
-        }
-        else
-        {
-            x264_log( h, X264_LOG_ERROR, "Invalid colorspace for AVC-Intra %d\n", h->param.i_avcintra_class );
-            return -1;
-        }
-
-        if( res < 0 )
-        {
-            x264_log( h, X264_LOG_ERROR, "Resolution %dx%d invalid for AVC-Intra %d\n",
-                      h->param.i_width, h->param.i_height, h->param.i_avcintra_class );
-            return -1;
-        }
-
-        if( h->param.nalu_process )
-        {
-            x264_log( h, X264_LOG_ERROR, "nalu_process is not supported in AVC-Intra mode\n" );
-            return -1;
-        }
-
-        if( !h->param.b_repeat_headers )
-        {
-            x264_log( h, X264_LOG_ERROR, "Separate headers not supported in AVC-Intra mode\n" );
-            return -1;
-        }
-
-        int i;
-        uint32_t fps_num = h->param.i_fps_num, fps_den = h->param.i_fps_den;
-        x264_reduce_fraction( &fps_num, &fps_den );
-        for( i = 0; i < 7; i++ )
-        {
-            if( avcintra_lut[type][res][i].fps_num == fps_num &&
-                avcintra_lut[type][res][i].fps_den == fps_den &&
-                avcintra_lut[type][res][i].interlaced == PARAM_INTERLACED )
-            {
-                break;
-            }
-        }
-        if( i == 7 )
-        {
-            x264_log( h, X264_LOG_ERROR, "FPS %d/%d%c not compatible with AVC-Intra\n",
-                      h->param.i_fps_num, h->param.i_fps_den, PARAM_INTERLACED ? 'i' : 'p' );
-            return -1;
-        }
-
-        h->param.i_keyint_max = 1;
-        h->param.b_intra_refresh = 0;
-        h->param.analyse.i_weighted_pred = 0;
-        h->param.i_frame_reference = 1;
-        h->param.i_dpb_size = 1;
-
-        h->param.b_bluray_compat = 0;
-        h->param.b_vfr_input = 0;
-        h->param.b_aud = 1;
-        h->param.vui.i_chroma_loc = 0;
-        h->param.i_nal_hrd = X264_NAL_HRD_NONE;
-        h->param.b_deblocking_filter = 0;
-        h->param.b_stitchable = 1;
-        h->param.b_pic_struct = 0;
-        h->param.analyse.b_transform_8x8 = 1;
-        h->param.analyse.intra = X264_ANALYSE_I8x8;
-        h->param.analyse.i_chroma_qp_offset = res && type ? 3 : 4;
-        h->param.b_cabac = !type;
-        h->param.rc.i_vbv_buffer_size = avcintra_lut[type][res][i].frame_size;
-        h->param.rc.i_vbv_max_bitrate =
-        h->param.rc.i_bitrate = h->param.rc.i_vbv_buffer_size * fps_num / fps_den;
-        h->param.rc.i_rc_method = X264_RC_ABR;
-        h->param.rc.f_vbv_buffer_init = 1.0;
-        h->param.rc.b_filler = 1;
-        h->param.i_cqm_preset = X264_CQM_CUSTOM;
-        memcpy( h->param.cqm_4iy, x264_cqm_jvt4i, sizeof(h->param.cqm_4iy) );
-        memcpy( h->param.cqm_4ic, avcintra_lut[type][res][i].cqm_4ic, sizeof(h->param.cqm_4ic) );
-        memcpy( h->param.cqm_8iy, avcintra_lut[type][res][i].cqm_8iy, sizeof(h->param.cqm_8iy) );
-
-        /* Need exactly 10 slices of equal MB count... why?  $deity knows... */
-        h->param.i_slice_max_mbs = ((h->param.i_width + 15) / 16) * ((h->param.i_height + 15) / 16) / 10;
-        h->param.i_slice_max_size = 0;
-        /* The slice structure only allows a maximum of 2 threads for 1080i/p
-         * and 1 or 5 threads for 720p */
-        if( h->param.b_sliced_threads )
-        {
-            if( res )
-                h->param.i_threads = X264_MIN( 2, h->param.i_threads );
-            else
-            {
-                h->param.i_threads = X264_MIN( 5, h->param.i_threads );
-                if( h->param.i_threads < 5 )
-                    h->param.i_threads = 1;
-            }
-        }
-
-        if( type )
-            h->param.vui.i_sar_width = h->param.vui.i_sar_height = 1;
-        else
-        {
-            h->param.vui.i_sar_width  = 4;
-            h->param.vui.i_sar_height = 3;
-        }
-
-        /* Official encoder doesn't appear to go under 13
-         * and Avid cannot handle negative QPs */
-        h->param.rc.i_qp_min = X264_MAX( h->param.rc.i_qp_min, QP_BD_OFFSET + 1 );
-    }
-
-    h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, -QP_BD_OFFSET, 51 );
-    h->param.rc.f_rf_constant_max = x264_clip3f( h->param.rc.f_rf_constant_max, -QP_BD_OFFSET, 51 );
-    h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
-    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 11 );
-    h->param.rc.f_ip_factor = X264_MAX( h->param.rc.f_ip_factor, 0.01f );
-    h->param.rc.f_pb_factor = X264_MAX( h->param.rc.f_pb_factor, 0.01f );
-    if( h->param.rc.i_rc_method == X264_RC_CRF )
-    {
-        h->param.rc.i_qp_constant = h->param.rc.f_rf_constant + QP_BD_OFFSET;
-        h->param.rc.i_bitrate = 0;
-    }
-    if( b_open && (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF)
-        && h->param.rc.i_qp_constant == 0 )
-    {
-        h->mb.b_lossless = 1;
-        h->param.i_cqm_preset = X264_CQM_FLAT;
-        h->param.psz_cqm_file = NULL;
-        h->param.rc.i_rc_method = X264_RC_CQP;
-        h->param.rc.f_ip_factor = 1;
-        h->param.rc.f_pb_factor = 1;
-        h->param.analyse.b_psnr = 0;
-        h->param.analyse.b_ssim = 0;
-        h->param.analyse.i_chroma_qp_offset = 0;
-        h->param.analyse.i_trellis = 0;
-        h->param.analyse.b_fast_pskip = 0;
-        h->param.analyse.i_noise_reduction = 0;
-        h->param.analyse.b_psy = 0;
-        h->param.i_bframe = 0;
-        /* 8x8dct is not useful without RD in CAVLC lossless */
-        if( !h->param.b_cabac && h->param.analyse.i_subpel_refine < 6 )
-            h->param.analyse.b_transform_8x8 = 0;
-        h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
-        h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
-    }
-    if( h->param.rc.i_rc_method == X264_RC_CQP )
-    {
-        float qp_p = h->param.rc.i_qp_constant;
-        float qp_i = qp_p - 6*log2f( h->param.rc.f_ip_factor );
-        float qp_b = qp_p + 6*log2f( h->param.rc.f_pb_factor );
-        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, QP_MAX );
-        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX );
-        h->param.rc.i_aq_mode = 0;
-        h->param.rc.b_mb_tree = 0;
-        h->param.rc.i_bitrate = 0;
-    }
-    h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX );
-    h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
-    h->param.rc.i_qp_step = x264_clip3( h->param.rc.i_qp_step, 2, QP_MAX );
-    h->param.rc.i_bitrate = x264_clip3( h->param.rc.i_bitrate, 0, 2000000 );
-    if( h->param.rc.i_rc_method == X264_RC_ABR && !h->param.rc.i_bitrate )
-    {
-        x264_log( h, X264_LOG_ERROR, "bitrate not specified\n" );
-        return -1;
-    }
-    h->param.rc.i_vbv_buffer_size = x264_clip3( h->param.rc.i_vbv_buffer_size, 0, 2000000 );
-    h->param.rc.i_vbv_max_bitrate = x264_clip3( h->param.rc.i_vbv_max_bitrate, 0, 2000000 );
-    h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init, 0, 2000000 );
-    if( h->param.rc.i_vbv_buffer_size )
-    {
-        if( h->param.rc.i_rc_method == X264_RC_CQP )
-        {
-            x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
-            h->param.rc.i_vbv_max_bitrate = 0;
-            h->param.rc.i_vbv_buffer_size = 0;
-        }
-        else if( h->param.rc.i_vbv_max_bitrate == 0 )
-        {
-            if( h->param.rc.i_rc_method == X264_RC_ABR )
-            {
-                x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" );
-                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
-            }
-            else
-            {
-                x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" );
-                h->param.rc.i_vbv_buffer_size = 0;
-            }
-        }
-        else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
-                 h->param.rc.i_rc_method == X264_RC_ABR )
-        {
-            x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" );
-            h->param.rc.i_bitrate = h->param.rc.i_vbv_max_bitrate;
-        }
-    }
-    else if( h->param.rc.i_vbv_max_bitrate )
-    {
-        x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" );
-        h->param.rc.i_vbv_max_bitrate = 0;
-    }
-
-    h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
-    h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
-    h->param.i_slice_min_mbs = X264_MAX( h->param.i_slice_min_mbs, 0 );
-    if( h->param.i_slice_max_mbs )
-        h->param.i_slice_min_mbs = X264_MIN( h->param.i_slice_min_mbs, h->param.i_slice_max_mbs/2 );
-    else if( !h->param.i_slice_max_size )
-        h->param.i_slice_min_mbs = 0;
-    if( PARAM_INTERLACED && h->param.i_slice_min_mbs )
-    {
-        x264_log( h, X264_LOG_WARNING, "interlace + slice-min-mbs is not implemented\n" );
-        h->param.i_slice_min_mbs = 0;
-    }
-    int mb_width = (h->param.i_width+15)/16;
-    if( h->param.i_slice_min_mbs > mb_width )
-    {
-        x264_log( h, X264_LOG_WARNING, "slice-min-mbs > row mb size (%d) not implemented\n", mb_width );
-        h->param.i_slice_min_mbs = mb_width;
-    }
-
-    int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED);
-    if( h->param.b_sliced_threads )
-        h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
-    else
-    {
-        h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
-        if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
-            h->param.i_slice_count = 0;
-    }
-    if( h->param.i_slice_count_max > 0 )
-        h->param.i_slice_count_max = X264_MAX( h->param.i_slice_count, h->param.i_slice_count_max );
-
-    if( h->param.b_bluray_compat )
-    {
-        h->param.i_bframe_pyramid = X264_MIN( X264_B_PYRAMID_STRICT, h->param.i_bframe_pyramid );
-        h->param.i_bframe = X264_MIN( h->param.i_bframe, 3 );
-        h->param.b_aud = 1;
-        h->param.i_nal_hrd = X264_MAX( h->param.i_nal_hrd, X264_NAL_HRD_VBR );
-        h->param.i_slice_max_size = 0;
-        h->param.i_slice_max_mbs = 0;
-        h->param.b_intra_refresh = 0;
-        h->param.i_frame_reference = X264_MIN( h->param.i_frame_reference, 6 );
-        h->param.i_dpb_size = X264_MIN( h->param.i_dpb_size, 6 );
-        /* Don't use I-frames, because Blu-ray treats them the same as IDR. */
-        h->param.i_keyint_min = 1;
-        /* Due to the proliferation of broken players that don't handle dupes properly. */
-        h->param.analyse.i_weighted_pred = X264_MIN( h->param.analyse.i_weighted_pred, X264_WEIGHTP_SIMPLE );
-        if( h->param.b_fake_interlaced )
-            h->param.b_pic_struct = 1;
-    }
-
-    h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, X264_REF_MAX );
-    h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, X264_REF_MAX );
-    if( h->param.i_scenecut_threshold < 0 )
-        h->param.i_scenecut_threshold = 0;
-    h->param.analyse.i_direct_mv_pred = x264_clip3( h->param.analyse.i_direct_mv_pred, X264_DIRECT_PRED_NONE, X264_DIRECT_PRED_AUTO );
-    if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
-    {
-        x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" );
-        h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
-    }
-    h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_MIN( X264_BFRAME_MAX, h->param.i_keyint_max-1 ) );
-    h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
-    if( h->param.i_bframe <= 1 )
-        h->param.i_bframe_pyramid = X264_B_PYRAMID_NONE;
-    h->param.i_bframe_pyramid = x264_clip3( h->param.i_bframe_pyramid, X264_B_PYRAMID_NONE, X264_B_PYRAMID_NORMAL );
-    h->param.i_bframe_adaptive = x264_clip3( h->param.i_bframe_adaptive, X264_B_ADAPT_NONE, X264_B_ADAPT_TRELLIS );
-    if( !h->param.i_bframe )
-    {
-        h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
-        h->param.analyse.i_direct_mv_pred = 0;
-        h->param.analyse.b_weighted_bipred = 0;
-        h->param.b_open_gop = 0;
-    }
-    if( h->param.b_intra_refresh && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL )
-    {
-        x264_log( h, X264_LOG_WARNING, "b-pyramid normal + intra-refresh is not supported\n" );
-        h->param.i_bframe_pyramid = X264_B_PYRAMID_STRICT;
-    }
-    if( h->param.b_intra_refresh && (h->param.i_frame_reference > 1 || h->param.i_dpb_size > 1) )
-    {
-        x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
-        h->param.i_frame_reference = 1;
-        h->param.i_dpb_size = 1;
-    }
-    if( h->param.b_intra_refresh && h->param.b_open_gop )
-    {
-        x264_log( h, X264_LOG_WARNING, "intra-refresh is not compatible with open-gop\n" );
-        h->param.b_open_gop = 0;
-    }
-    if( !h->param.i_fps_num || !h->param.i_fps_den )
-    {
-        h->param.i_fps_num = 25;
-        h->param.i_fps_den = 1;
-    }
-    float fps = (float)h->param.i_fps_num / h->param.i_fps_den;
-    if( h->param.i_keyint_min == X264_KEYINT_MIN_AUTO )
-        h->param.i_keyint_min = X264_MIN( h->param.i_keyint_max / 10, (int)fps );
-    h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
-    h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
-    {
-        int maxrate = X264_MAX( h->param.rc.i_vbv_max_bitrate, h->param.rc.i_bitrate );
-        float bufsize = maxrate ? (float)h->param.rc.i_vbv_buffer_size / maxrate : 0;
-        h->param.rc.i_lookahead = X264_MIN( h->param.rc.i_lookahead, X264_MAX( h->param.i_keyint_max, bufsize*fps ) );
-    }
-
-    if( !h->param.i_timebase_num || !h->param.i_timebase_den || !(h->param.b_vfr_input || h->param.b_pulldown) )
-    {
-        h->param.i_timebase_num = h->param.i_fps_den;
-        h->param.i_timebase_den = h->param.i_fps_num;
-    }
-
-    h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 );
-    if( h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 )
-        h->param.rc.b_mb_tree = 0;
-    if( (!h->param.b_intra_refresh && h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE) &&
-        !h->param.rc.i_lookahead && h->param.rc.b_mb_tree )
-    {
-        x264_log( h, X264_LOG_WARNING, "lookaheadless mb-tree requires intra refresh or infinite keyint\n" );
-        h->param.rc.b_mb_tree = 0;
-    }
-    if( b_open && h->param.rc.b_stat_read )
-        h->param.rc.i_lookahead = 0;
-#if HAVE_THREAD
-    if( h->param.i_sync_lookahead < 0 )
-        h->param.i_sync_lookahead = h->param.i_bframe + 1;
-    h->param.i_sync_lookahead = X264_MIN( h->param.i_sync_lookahead, X264_LOOKAHEAD_MAX );
-    if( h->param.rc.b_stat_read || h->i_thread_frames == 1 )
-        h->param.i_sync_lookahead = 0;
-#else
-    h->param.i_sync_lookahead = 0;
-#endif
-
-    h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
-    h->param.i_deblocking_filter_beta    = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
-    h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
-    h->param.analyse.i_luma_deadzone[1] = x264_clip3( h->param.analyse.i_luma_deadzone[1], 0, 32 );
-
-    h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, 0, 2 );
-
-    if( h->param.i_cqm_preset < X264_CQM_FLAT || h->param.i_cqm_preset > X264_CQM_CUSTOM )
-        h->param.i_cqm_preset = X264_CQM_FLAT;
-
-    if( h->param.analyse.i_me_method < X264_ME_DIA ||
-        h->param.analyse.i_me_method > X264_ME_TESA )
-        h->param.analyse.i_me_method = X264_ME_HEX;
-    h->param.analyse.i_me_range = x264_clip3( h->param.analyse.i_me_range, 4, 1024 );
-    if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
-        h->param.analyse.i_me_range = 16;
-    if( h->param.analyse.i_me_method == X264_ME_TESA &&
-        (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
-        h->param.analyse.i_me_method = X264_ME_ESA;
-    h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
-    h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16|
-                              X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
-    h->param.analyse.intra &= X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
-    if( !(h->param.analyse.inter & X264_ANALYSE_PSUB16x16) )
-        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
-    if( !h->param.analyse.b_transform_8x8 )
-    {
-        h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
-        h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
-    }
-    h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
-    h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 3 );
-    h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
-    if( h->param.rc.f_aq_strength == 0 )
-        h->param.rc.i_aq_mode = 0;
-
-    if( h->param.i_log_level < X264_LOG_INFO )
-    {
-        h->param.analyse.b_psnr = 0;
-        h->param.analyse.b_ssim = 0;
-    }
-    /* Warn users trying to measure PSNR/SSIM with psy opts on. */
-    if( b_open && (h->param.analyse.b_psnr || h->param.analyse.b_ssim) )
-    {
-        char *s = NULL;
-
-        if( h->param.analyse.b_psy )
-        {
-            s = h->param.analyse.b_psnr ? "psnr" : "ssim";
-            x264_log( h, X264_LOG_WARNING, "--%s used with psy on: results will be invalid!\n", s );
-        }
-        else if( !h->param.rc.i_aq_mode && h->param.analyse.b_ssim )
-        {
-            x264_log( h, X264_LOG_WARNING, "--ssim used with AQ off: results will be invalid!\n" );
-            s = "ssim";
-        }
-        else if(  h->param.rc.i_aq_mode && h->param.analyse.b_psnr )
-        {
-            x264_log( h, X264_LOG_WARNING, "--psnr used with AQ on: results will be invalid!\n" );
-            s = "psnr";
-        }
-        if( s )
-            x264_log( h, X264_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s );
-    }
-
-    if( !h->param.analyse.b_psy )
-    {
-        h->param.analyse.f_psy_rd = 0;
-        h->param.analyse.f_psy_trellis = 0;
-    }
-    h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
-    h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
-    h->mb.i_psy_rd = h->param.analyse.i_subpel_refine >= 6 ? FIX8( h->param.analyse.f_psy_rd ) : 0;
-    h->mb.i_psy_trellis = h->param.analyse.i_trellis ? FIX8( h->param.analyse.f_psy_trellis / 4 ) : 0;
-    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -32, 32);
-    /* In 4:4:4 mode, chroma gets twice as much resolution, so we can halve its quality. */
-    if( b_open && i_csp >= X264_CSP_I444 && i_csp < X264_CSP_BGR && h->param.analyse.b_psy )
-        h->param.analyse.i_chroma_qp_offset += 6;
-    /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */
-    /* so we lower the chroma QP offset to compensate */
-    if( b_open && h->mb.i_psy_rd && !h->param.i_avcintra_class )
-        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;
-    /* Psy trellis has a similar effect. */
-    if( b_open && h->mb.i_psy_trellis && !h->param.i_avcintra_class )
-        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
-    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
-    /* MB-tree requires AQ to be on, even if the strength is zero. */
-    if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree )
-    {
-        h->param.rc.i_aq_mode = 1;
-        h->param.rc.f_aq_strength = 0;
-    }
-    h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
-    if( h->param.analyse.i_subpel_refine >= 10 && (h->param.analyse.i_trellis != 2 || !h->param.rc.i_aq_mode) )
-        h->param.analyse.i_subpel_refine = 9;
-
-    if( b_open )
-    {
-        const x264_level_t *l = x264_levels;
-        if( h->param.i_level_idc < 0 )
-        {
-            int maxrate_bak = h->param.rc.i_vbv_max_bitrate;
-            if( h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.i_vbv_buffer_size <= 0 )
-                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate * 2;
-            x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
-            do h->param.i_level_idc = l->level_idc;
-                while( l[1].level_idc && x264_validate_levels( h, 0 ) && l++ );
-            h->param.rc.i_vbv_max_bitrate = maxrate_bak;
-        }
-        else
-        {
-            while( l->level_idc && l->level_idc != h->param.i_level_idc )
-                l++;
-            if( l->level_idc == 0 )
-            {
-                x264_log( h, X264_LOG_ERROR, "invalid level_idc: %d\n", h->param.i_level_idc );
-                return -1;
-            }
-        }
-        if( h->param.analyse.i_mv_range <= 0 )
-            h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED;
-        else
-            h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED);
-    }
-
-    h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
-
-    if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
-    {
-        if( h->param.b_sliced_threads )
-            h->param.i_lookahead_threads = h->param.i_threads;
-        else
-        {
-            /* If we're using much slower lookahead settings than encoding settings, it helps a lot to use
-             * more lookahead threads.  This typically happens in the first pass of a two-pass encode, so
-             * try to guess at this sort of case.
-             *
-             * Tuned by a little bit of real encoding with the various presets. */
-            int badapt = h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS;
-            int subme = X264_MIN( h->param.analyse.i_subpel_refine / 3, 3 ) + (h->param.analyse.i_subpel_refine > 1);
-            int bframes = X264_MIN( (h->param.i_bframe - 1) / 3, 3 );
-
-            /* [b-adapt 0/1 vs 2][quantized subme][quantized bframes] */
-            static const uint8_t lookahead_thread_div[2][5][4] =
-            {{{6,6,6,6}, {3,3,3,3}, {4,4,4,4}, {6,6,6,6}, {12,12,12,12}},
-             {{3,2,1,1}, {2,1,1,1}, {4,3,2,1}, {6,4,3,2}, {12, 9, 6, 4}}};
-
-            h->param.i_lookahead_threads = h->param.i_threads / lookahead_thread_div[badapt][subme][bframes];
-            /* Since too many lookahead threads significantly degrades lookahead accuracy, limit auto
-             * lookahead threads to about 8 macroblock rows high each at worst.  This number is chosen
-             * pretty much arbitrarily. */
-            h->param.i_lookahead_threads = X264_MIN( h->param.i_lookahead_threads, h->param.i_height / 128 );
-        }
-    }
-    h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
-
-    if( PARAM_INTERLACED )
-    {
-        if( h->param.analyse.i_me_method >= X264_ME_ESA )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
-            h->param.analyse.i_me_method = X264_ME_UMH;
-        }
-        if( h->param.analyse.i_weighted_pred > 0 )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
-            h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
-        }
-    }
-
-    if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy )
-        h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
-
-    if( h->i_thread_frames > 1 )
-    {
-        int r = h->param.analyse.i_mv_range_thread;
-        int r2;
-        if( r <= 0 )
-        {
-            // half of the available space is reserved and divided evenly among the threads,
-            // the rest is allocated to whichever thread is far enough ahead to use it.
-            // reserving more space increases quality for some videos, but costs more time
-            // in thread synchronization.
-            int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->i_thread_frames - X264_THREAD_HEIGHT;
-            r = max_range / 2;
-        }
-        r = X264_MAX( r, h->param.analyse.i_me_range );
-        r = X264_MIN( r, h->param.analyse.i_mv_range );
-        // round up to use the whole mb row
-        r2 = (r & ~15) + ((-X264_THREAD_HEIGHT) & 15);
-        if( r2 < r )
-            r2 += 16;
-        x264_log( h, X264_LOG_DEBUG, "using mv_range_thread = %d\n", r2 );
-        h->param.analyse.i_mv_range_thread = r2;
-    }
-
-    if( h->param.rc.f_rate_tolerance < 0 )
-        h->param.rc.f_rate_tolerance = 0;
-    if( h->param.rc.f_qblur < 0 )
-        h->param.rc.f_qblur = 0;
-    if( h->param.rc.f_complexity_blur < 0 )
-        h->param.rc.f_complexity_blur = 0;
-
-    h->param.i_sps_id &= 31;
-
-    h->param.i_nal_hrd = x264_clip3( h->param.i_nal_hrd, X264_NAL_HRD_NONE, X264_NAL_HRD_CBR );
-
-    if( h->param.i_nal_hrd && !h->param.rc.i_vbv_buffer_size )
-    {
-        x264_log( h, X264_LOG_WARNING, "NAL HRD parameters require VBV parameters\n" );
-        h->param.i_nal_hrd = X264_NAL_HRD_NONE;
-    }
-
-    if( h->param.i_nal_hrd == X264_NAL_HRD_CBR &&
-       (h->param.rc.i_bitrate != h->param.rc.i_vbv_max_bitrate || !h->param.rc.i_vbv_max_bitrate) )
-    {
-        x264_log( h, X264_LOG_WARNING, "CBR HRD requires constant bitrate\n" );
-        h->param.i_nal_hrd = X264_NAL_HRD_VBR;
-    }
-
-    if( h->param.i_nal_hrd == X264_NAL_HRD_CBR )
-        h->param.rc.b_filler = 1;
-
-    /* ensure the booleans are 0 or 1 so they can be used in math */
-#define BOOLIFY(x) h->param.x = !!h->param.x
-    BOOLIFY( b_cabac );
-    BOOLIFY( b_constrained_intra );
-    BOOLIFY( b_deblocking_filter );
-    BOOLIFY( b_deterministic );
-    BOOLIFY( b_sliced_threads );
-    BOOLIFY( b_interlaced );
-    BOOLIFY( b_intra_refresh );
-    BOOLIFY( b_aud );
-    BOOLIFY( b_repeat_headers );
-    BOOLIFY( b_annexb );
-    BOOLIFY( b_vfr_input );
-    BOOLIFY( b_pulldown );
-    BOOLIFY( b_tff );
-    BOOLIFY( b_pic_struct );
-    BOOLIFY( b_fake_interlaced );
-    BOOLIFY( b_open_gop );
-    BOOLIFY( b_bluray_compat );
-    BOOLIFY( b_stitchable );
-    BOOLIFY( b_full_recon );
-    BOOLIFY( b_opencl );
-    BOOLIFY( analyse.b_transform_8x8 );
-    BOOLIFY( analyse.b_weighted_bipred );
-    BOOLIFY( analyse.b_chroma_me );
-    BOOLIFY( analyse.b_mixed_references );
-    BOOLIFY( analyse.b_fast_pskip );
-    BOOLIFY( analyse.b_dct_decimate );
-    BOOLIFY( analyse.b_psy );
-    BOOLIFY( analyse.b_psnr );
-    BOOLIFY( analyse.b_ssim );
-    BOOLIFY( rc.b_stat_write );
-    BOOLIFY( rc.b_stat_read );
-    BOOLIFY( rc.b_mb_tree );
-    BOOLIFY( rc.b_filler );
-#undef BOOLIFY
-
-    return 0;
-}
-
-static void mbcmp_init( x264_t *h )
-{
-    int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
-    memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
-    memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
-    h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
-    h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c;
-    h->pixf.intra_mbcmp_x3_8x8c  = satd ? h->pixf.intra_satd_x3_8x8c  : h->pixf.intra_sad_x3_8x8c;
-    h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
-    h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
-    h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
-                               : satd ? h->pixf.intra_satd_x9_4x4 : h->pixf.intra_sad_x9_4x4;
-    h->pixf.intra_mbcmp_x9_8x8 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
-                               : satd ? h->pixf.intra_sa8d_x9_8x8 : h->pixf.intra_sad_x9_8x8;
-    satd &= h->param.analyse.i_me_method == X264_ME_TESA;
-    memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
-    memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
-    memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
-}
-
-static void chroma_dsp_init( x264_t *h )
-{
-    memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) );
-
-    switch( CHROMA_FORMAT )
-    {
-        case CHROMA_420:
-            memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) );
-            h->mc.prefetch_fenc = h->mc.prefetch_fenc_420;
-            h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420;
-            h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra;
-            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff;
-            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff;
-            h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c;
-            h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4;
-            h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4;
-            break;
-        case CHROMA_422:
-            memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) );
-            h->mc.prefetch_fenc = h->mc.prefetch_fenc_422;
-            h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422;
-            h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra;
-            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff;
-            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff;
-            h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c;
-            h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8;
-            h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8;
-            break;
-        case CHROMA_444:
-            h->mc.prefetch_fenc = h->mc.prefetch_fenc_422; /* FIXME: doesn't cover V plane */
-            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff;
-            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff;
-            break;
-    }
-}
-
-static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
-{
-    /* VUI */
-    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
-    {
-        uint32_t i_w = param->vui.i_sar_width;
-        uint32_t i_h = param->vui.i_sar_height;
-        uint32_t old_w = h->param.vui.i_sar_width;
-        uint32_t old_h = h->param.vui.i_sar_height;
-
-        x264_reduce_fraction( &i_w, &i_h );
-
-        while( i_w > 65535 || i_h > 65535 )
-        {
-            i_w /= 2;
-            i_h /= 2;
-        }
-
-        x264_reduce_fraction( &i_w, &i_h );
-
-        if( i_w != old_w || i_h != old_h || initial )
-        {
-            h->param.vui.i_sar_width = 0;
-            h->param.vui.i_sar_height = 0;
-            if( i_w == 0 || i_h == 0 )
-                x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" );
-            else
-            {
-                x264_log( h, initial?X264_LOG_INFO:X264_LOG_DEBUG, "using SAR=%d/%d\n", i_w, i_h );
-                h->param.vui.i_sar_width = i_w;
-                h->param.vui.i_sar_height = i_h;
-            }
-        }
-    }
-}
-
-/****************************************************************************
- * x264_encoder_open:
- ****************************************************************************/
-x264_t *x264_encoder_open( x264_param_t *param )
-{
-    x264_t *h;
-    char buf[1000], *p;
-    int i_slicetype_length;
-
-    CHECKED_MALLOCZERO( h, sizeof(x264_t) );
-
-    /* Create a copy of param */
-    memcpy( &h->param, param, sizeof(x264_param_t) );
-
-    if( param->param_free )
-        param->param_free( param );
-
-#if HAVE_INTEL_DISPATCHER
-    x264_intel_dispatcher_override();
-#endif
-
-    if( x264_threading_init() )
-    {
-        x264_log( h, X264_LOG_ERROR, "unable to initialize threading\n" );
-        goto fail;
-    }
-
-    if( x264_validate_parameters( h, 1 ) < 0 )
-        goto fail;
-
-    if( h->param.psz_cqm_file )
-        if( x264_cqm_parse_file( h, h->param.psz_cqm_file ) < 0 )
-            goto fail;
-
-    if( h->param.rc.psz_stat_out )
-        h->param.rc.psz_stat_out = strdup( h->param.rc.psz_stat_out );
-    if( h->param.rc.psz_stat_in )
-        h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in );
-
-    x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
-    x264_reduce_fraction( &h->param.i_timebase_num, &h->param.i_timebase_den );
-
-    /* Init x264_t */
-    h->i_frame = -1;
-    h->i_frame_num = 0;
-
-    if( h->param.i_avcintra_class )
-        h->i_idr_pic_id = 5;
-    else
-        h->i_idr_pic_id = 0;
-
-    if( (uint64_t)h->param.i_timebase_den * 2 > UINT32_MAX )
-    {
-        x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %u exceeds H.264 maximum\n", h->param.i_timebase_den );
-        goto fail;
-    }
-
-    x264_set_aspect_ratio( h, &h->param, 1 );
-
-    x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
-    x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps );
-
-    x264_validate_levels( h, 1 );
-
-    h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
-
-    if( x264_cqm_init( h ) < 0 )
-        goto fail;
-
-    h->mb.i_mb_width = h->sps->i_mb_width;
-    h->mb.i_mb_height = h->sps->i_mb_height;
-    h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height;
-
-    h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422;
-    h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420;
-
-    /* Adaptive MBAFF and subme 0 are not supported as we require halving motion
-     * vectors during prediction, resulting in hpel mvs.
-     * The chosen solution is to make MBAFF non-adaptive in this case. */
-    h->mb.b_adaptive_mbaff = PARAM_INTERLACED && h->param.analyse.i_subpel_refine;
-
-    /* Init frames. */
-    if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && !h->param.rc.b_stat_read )
-        h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4;
-    else
-        h->frames.i_delay = h->param.i_bframe;
-    if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
-        h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
-    i_slicetype_length = h->frames.i_delay;
-    h->frames.i_delay += h->i_thread_frames - 1;
-    h->frames.i_delay += h->param.i_sync_lookahead;
-    h->frames.i_delay += h->param.b_vfr_input;
-    h->frames.i_bframe_delay = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 2 : 1) : 0;
-
-    h->frames.i_max_ref0 = h->param.i_frame_reference;
-    h->frames.i_max_ref1 = X264_MIN( h->sps->vui.i_num_reorder_frames, h->param.i_frame_reference );
-    h->frames.i_max_dpb  = h->sps->vui.i_max_dec_frame_buffering;
-    h->frames.b_have_lowres = !h->param.rc.b_stat_read
-        && ( h->param.rc.i_rc_method == X264_RC_ABR
-          || h->param.rc.i_rc_method == X264_RC_CRF
-          || h->param.i_bframe_adaptive
-          || h->param.i_scenecut_threshold
-          || h->param.rc.b_mb_tree
-          || h->param.analyse.i_weighted_pred );
-    h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
-    h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
-
-    h->frames.i_last_idr =
-    h->frames.i_last_keyframe = - h->param.i_keyint_max;
-    h->frames.i_input    = 0;
-    h->frames.i_largest_pts = h->frames.i_second_largest_pts = -1;
-    h->frames.i_poc_last_open_gop = -1;
-
-    CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
-    /* Allocate room for max refs plus a few extra just in case. */
-    CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + X264_REF_MAX + 4) * sizeof(x264_frame_t *) );
-    CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
-                        + h->i_thread_frames + 3) * sizeof(x264_frame_t *) );
-    if( h->param.analyse.i_weighted_pred > 0 )
-        CHECKED_MALLOCZERO( h->frames.blank_unused, h->i_thread_frames * 4 * sizeof(x264_frame_t *) );
-    h->i_ref[0] = h->i_ref[1] = 0;
-    h->i_cpb_delay = h->i_coded_fields = h->i_disp_fields = 0;
-    h->i_prev_duration = ((uint64_t)h->param.i_fps_den * h->sps->vui.i_time_scale) / ((uint64_t)h->param.i_fps_num * h->sps->vui.i_num_units_in_tick);
-    h->i_disp_fields_last_frame = -1;
-    x264_rdo_init();
-
-    /* init CPU functions */
-    x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
-    x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
-    x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
-    x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
-    x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
-    x264_pixel_init( h->param.cpu, &h->pixf );
-    x264_dct_init( h->param.cpu, &h->dctf );
-    x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
-    memcpy( &h->zigzagf, PARAM_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) );
-    x264_mc_init( h->param.cpu, &h->mc, h->param.b_cpu_independent );
-    x264_quant_init( h, h->param.cpu, &h->quantf );
-    x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
-    x264_bitstream_init( h->param.cpu, &h->bsf );
-    if( h->param.b_cabac )
-        x264_cabac_init( h );
-    else
-        x264_stack_align( x264_cavlc_init, h );
-
-    mbcmp_init( h );
-    chroma_dsp_init( h );
-
-    p = buf + sprintf( buf, "using cpu capabilities:" );
-    for( int i = 0; x264_cpu_names[i].flags; i++ )
-    {
-        if( !strcmp(x264_cpu_names[i].name, "SSE")
-            && h->param.cpu & (X264_CPU_SSE2) )
-            continue;
-        if( !strcmp(x264_cpu_names[i].name, "SSE2")
-            && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
-            continue;
-        if( !strcmp(x264_cpu_names[i].name, "SSE3")
-            && (h->param.cpu & X264_CPU_SSSE3 || !(h->param.cpu & X264_CPU_CACHELINE_64)) )
-            continue;
-        if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
-            && (h->param.cpu & X264_CPU_SSE42) )
-            continue;
-        if( !strcmp(x264_cpu_names[i].name, "BMI1")
-            && (h->param.cpu & X264_CPU_BMI2) )
-            continue;
-        if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
-            && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
-            p += sprintf( p, " %s", x264_cpu_names[i].name );
-    }
-    if( !h->param.cpu )
-        p += sprintf( p, " none!" );
-    x264_log( h, X264_LOG_INFO, "%s\n", buf );
-
-    if( x264_analyse_init_costs( h ) )
-        goto fail;
-
-    static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
-    /* Checks for known miscompilation issues. */
-    if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] )
-    {
-        x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
-        goto fail;
-    }
-
-    /* Must be volatile or else GCC will optimize it out. */
-    volatile int temp = 392;
-    if( x264_clz( temp ) != 23 )
-    {
-        x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" );
-#if ARCH_X86 || ARCH_X86_64
-        x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a/LZCNT-targeted build on a CPU that\n" );
-        x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" );
-#endif
-        goto fail;
-    }
-
-    h->out.i_nal = 0;
-    h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
-        * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
-          : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
-
-    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */
-    CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );
-
-    CHECKED_MALLOC( h->reconfig_h, sizeof(x264_t) );
-
-    if( h->param.i_threads > 1 &&
-        x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
-        goto fail;
-    if( h->param.i_lookahead_threads > 1 &&
-        x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, NULL, NULL ) )
-        goto fail;
-
-#if HAVE_OPENCL
-    if( h->param.b_opencl )
-    {
-        h->opencl.ocl = x264_opencl_load_library();
-        if( !h->opencl.ocl )
-        {
-            x264_log( h, X264_LOG_WARNING, "failed to load OpenCL\n" );
-            h->param.b_opencl = 0;
-        }
-    }
-#endif
-
-    h->thread[0] = h;
-    for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
-        CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
-    if( h->param.i_lookahead_threads > 1 )
-        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-        {
-            CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) );
-            *h->lookahead_thread[i] = *h;
-        }
-    *h->reconfig_h = *h;
-
-    for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        int init_nal_count = h->param.i_slice_count + 3;
-        int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
-        if( i > 0 )
-            *h->thread[i] = *h;
-
-        if( x264_pthread_mutex_init( &h->thread[i]->mutex, NULL ) )
-            goto fail;
-        if( x264_pthread_cond_init( &h->thread[i]->cv, NULL ) )
-            goto fail;
-
-        if( allocate_threadlocal_data )
-        {
-            h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
-            if( !h->thread[i]->fdec )
-                goto fail;
-        }
-        else
-            h->thread[i]->fdec = h->thread[0]->fdec;
-
-        CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
-        /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
-        CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
-        h->thread[i]->out.i_nals_allocated = init_nal_count;
-
-        if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
-            goto fail;
-    }
-
-#if HAVE_OPENCL
-    if( h->param.b_opencl && x264_opencl_lookahead_init( h ) < 0 )
-        h->param.b_opencl = 0;
-#endif
-
-    if( x264_lookahead_init( h, i_slicetype_length ) )
-        goto fail;
-
-    for( int i = 0; i < h->param.i_threads; i++ )
-        if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
-            goto fail;
-
-    if( x264_ratecontrol_new( h ) < 0 )
-        goto fail;
-
-    if( h->param.i_nal_hrd )
-    {
-        x264_log( h, X264_LOG_DEBUG, "HRD bitrate: %i bits/sec\n", h->sps->vui.hrd.i_bit_rate_unscaled );
-        x264_log( h, X264_LOG_DEBUG, "CPB size: %i bits\n", h->sps->vui.hrd.i_cpb_size_unscaled );
-    }
-
-    if( h->param.psz_dump_yuv )
-    {
-        /* create or truncate the reconstructed video file */
-        FILE *f = x264_fopen( h->param.psz_dump_yuv, "w" );
-        if( !f )
-        {
-            x264_log( h, X264_LOG_ERROR, "dump_yuv: can't write to %s\n", h->param.psz_dump_yuv );
-            goto fail;
-        }
-        else if( !x264_is_regular_file( f ) )
-        {
-            x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv );
-            fclose( f );
-            goto fail;
-        }
-        fclose( f );
-    }
-
-    const char *profile = h->sps->i_profile_idc == PROFILE_BASELINE ? "Constrained Baseline" :
-                          h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
-                          h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
-                          h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") :
-                          h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 == 1 ? "High 4:2:2 Intra" : "High 4:2:2") :
-                          h->sps->b_constraint_set3 == 1 ? "High 4:4:4 Intra" : "High 4:4:4 Predictive";
-    char level[4];
-    snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
-    if( h->sps->i_level_idc == 9 || ( h->sps->i_level_idc == 11 && h->sps->b_constraint_set3 &&
-        (h->sps->i_profile_idc == PROFILE_BASELINE || h->sps->i_profile_idc == PROFILE_MAIN) ) )
-        strcpy( level, "1b" );
-
-    if( h->sps->i_profile_idc < PROFILE_HIGH10 )
-    {
-        x264_log( h, X264_LOG_INFO, "profile %s, level %s\n",
-            profile, level );
-    }
-    else
-    {
-        static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" };
-        x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n",
-            profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH );
-    }
-
-    return h;
-fail:
-    x264_free( h );
-    return NULL;
-}
-
-/****************************************************************************/
-static int x264_encoder_try_reconfig( x264_t *h, x264_param_t *param, int *rc_reconfig )
-{
-    *rc_reconfig = 0;
-    x264_set_aspect_ratio( h, param, 0 );
-#define COPY(var) h->param.var = param->var
-    COPY( i_frame_reference ); // but never uses more refs than initially specified
-    COPY( i_bframe_bias );
-    if( h->param.i_scenecut_threshold )
-        COPY( i_scenecut_threshold ); // can't turn it on or off, only vary the threshold
-    COPY( b_deblocking_filter );
-    COPY( i_deblocking_filter_alphac0 );
-    COPY( i_deblocking_filter_beta );
-    COPY( i_frame_packing );
-    COPY( analyse.inter );
-    COPY( analyse.intra );
-    COPY( analyse.i_direct_mv_pred );
-    /* Scratch buffer prevents me_range from being increased for esa/tesa */
-    if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
-        COPY( analyse.i_me_range );
-    COPY( analyse.i_noise_reduction );
-    /* We can't switch out of subme=0 during encoding. */
-    if( h->param.analyse.i_subpel_refine )
-        COPY( analyse.i_subpel_refine );
-    COPY( analyse.i_trellis );
-    COPY( analyse.b_chroma_me );
-    COPY( analyse.b_dct_decimate );
-    COPY( analyse.b_fast_pskip );
-    COPY( analyse.b_mixed_references );
-    COPY( analyse.f_psy_rd );
-    COPY( analyse.f_psy_trellis );
-    COPY( crop_rect );
-    // can only twiddle these if they were enabled to begin with:
-    if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
-        COPY( analyse.i_me_method );
-    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
-        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
-    if( h->pps->b_transform_8x8_mode )
-        COPY( analyse.b_transform_8x8 );
-    if( h->frames.i_max_ref1 > 1 )
-        COPY( i_bframe_pyramid );
-    COPY( i_slice_max_size );
-    COPY( i_slice_max_mbs );
-    COPY( i_slice_min_mbs );
-    COPY( i_slice_count );
-    COPY( i_slice_count_max );
-    COPY( b_tff );
-
-    /* VBV can't be turned on if it wasn't on to begin with */
-    if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
-          param->rc.i_vbv_max_bitrate > 0 &&   param->rc.i_vbv_buffer_size > 0 )
-    {
-        *rc_reconfig |= h->param.rc.i_vbv_max_bitrate != param->rc.i_vbv_max_bitrate;
-        *rc_reconfig |= h->param.rc.i_vbv_buffer_size != param->rc.i_vbv_buffer_size;
-        *rc_reconfig |= h->param.rc.i_bitrate != param->rc.i_bitrate;
-        COPY( rc.i_vbv_max_bitrate );
-        COPY( rc.i_vbv_buffer_size );
-        COPY( rc.i_bitrate );
-    }
-    *rc_reconfig |= h->param.rc.f_rf_constant != param->rc.f_rf_constant;
-    *rc_reconfig |= h->param.rc.f_rf_constant_max != param->rc.f_rf_constant_max;
-    COPY( rc.f_rf_constant );
-    COPY( rc.f_rf_constant_max );
-#undef COPY
-
-    return x264_validate_parameters( h, 0 );
-}
-
-int x264_encoder_reconfig_apply( x264_t *h, x264_param_t *param )
-{
-    int rc_reconfig;
-    int ret = x264_encoder_try_reconfig( h, param, &rc_reconfig );
-
-    mbcmp_init( h );
-    if( !ret )
-        x264_sps_init_reconfigurable( h->sps, &h->param );
-
-    /* Supported reconfiguration options (1-pass only):
-     * vbv-maxrate
-     * vbv-bufsize
-     * crf
-     * bitrate (CBR only) */
-    if( !ret && rc_reconfig )
-        x264_ratecontrol_init_reconfigurable( h, 0 );
-
-    return ret;
-}
-
-/****************************************************************************
- * x264_encoder_reconfig:
- ****************************************************************************/
-int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
-{
-    h = h->thread[h->thread[0]->i_thread_phase];
-    x264_param_t param_save = h->reconfig_h->param;
-    h->reconfig_h->param = h->param;
-
-    int rc_reconfig;
-    int ret = x264_encoder_try_reconfig( h->reconfig_h, param, &rc_reconfig );
-    if( !ret )
-        h->reconfig = 1;
-    else
-        h->reconfig_h->param = param_save;
-
-    return ret;
-}
-
-/****************************************************************************
- * x264_encoder_parameters:
- ****************************************************************************/
-void x264_encoder_parameters( x264_t *h, x264_param_t *param )
-{
-    memcpy( param, &h->thread[h->i_thread_phase]->param, sizeof(x264_param_t) );
-}
-
-/* internal usage */
-static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
-{
-    x264_nal_t *nal = &h->out.nal[h->out.i_nal];
-
-    nal->i_ref_idc        = i_ref_idc;
-    nal->i_type           = i_type;
-    nal->b_long_startcode = 1;
-
-    nal->i_payload= 0;
-    nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
-    nal->i_padding= 0;
-}
-
-/* if number of allocated nals is not enough, re-allocate a larger one. */
-static int x264_nal_check_buffer( x264_t *h )
-{
-    if( h->out.i_nal >= h->out.i_nals_allocated )
-    {
-        x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
-        if( !new_out )
-            return -1;
-        memcpy( new_out, h->out.nal, sizeof(x264_nal_t) * (h->out.i_nals_allocated) );
-        x264_free( h->out.nal );
-        h->out.nal = new_out;
-        h->out.i_nals_allocated *= 2;
-    }
-    return 0;
-}
-
-static int x264_nal_end( x264_t *h )
-{
-    x264_nal_t *nal = &h->out.nal[h->out.i_nal];
-    uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
-    nal->i_payload = end - nal->p_payload;
-    /* Assembly implementation of nal_escape reads past the end of the input.
-     * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */
-    memset( end, 0xff, 64 );
-    if( h->param.nalu_process )
-        h->param.nalu_process( h, nal, h->fenc->opaque );
-    h->out.i_nal++;
-
-    return x264_nal_check_buffer( h );
-}
-
-static int x264_check_encapsulated_buffer( x264_t *h, x264_t *h0, int start,
-                                           int previous_nal_size, int necessary_size )
-{
-    if( h0->nal_buffer_size < necessary_size )
-    {
-        necessary_size *= 2;
-        uint8_t *buf = x264_malloc( necessary_size );
-        if( !buf )
-            return -1;
-        if( previous_nal_size )
-            memcpy( buf, h0->nal_buffer, previous_nal_size );
-
-        intptr_t delta = buf - h0->nal_buffer;
-        for( int i = 0; i < start; i++ )
-            h->out.nal[i].p_payload += delta;
-
-        x264_free( h0->nal_buffer );
-        h0->nal_buffer = buf;
-        h0->nal_buffer_size = necessary_size;
-    }
-
-    return 0;
-}
-
-static int x264_encoder_encapsulate_nals( x264_t *h, int start )
-{
-    x264_t *h0 = h->thread[0];
-    int nal_size = 0, previous_nal_size = 0;
-
-    if( h->param.nalu_process )
-    {
-        for( int i = start; i < h->out.i_nal; i++ )
-            nal_size += h->out.nal[i].i_payload;
-        return nal_size;
-    }
-
-    for( int i = 0; i < start; i++ )
-        previous_nal_size += h->out.nal[i].i_payload;
-
-    for( int i = start; i < h->out.i_nal; i++ )
-        nal_size += h->out.nal[i].i_payload;
-
-    /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
-    int necessary_size = previous_nal_size + nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64;
-    for( int i = start; i < h->out.i_nal; i++ )
-        necessary_size += h->out.nal[i].i_padding;
-    if( x264_check_encapsulated_buffer( h, h0, start, previous_nal_size, necessary_size ) )
-        return -1;
-
-    uint8_t *nal_buffer = h0->nal_buffer + previous_nal_size;
-
-    for( int i = start; i < h->out.i_nal; i++ )
-    {
-        h->out.nal[i].b_long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS ||
-                                         h->param.i_avcintra_class;
-        x264_nal_encode( h, nal_buffer, &h->out.nal[i] );
-        nal_buffer += h->out.nal[i].i_payload;
-    }
-
-    x264_emms();
-
-    return nal_buffer - (h0->nal_buffer + previous_nal_size);
-}
-
-/****************************************************************************
- * x264_encoder_headers:
- ****************************************************************************/
-int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
-{
-    int frame_size = 0;
-    /* init bitstream context */
-    h->out.i_nal = 0;
-    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
-
-    /* Write SEI, SPS and PPS. */
-
-    /* generate sequence parameters */
-    x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
-    x264_sps_write( &h->out.bs, h->sps );
-    if( x264_nal_end( h ) )
-        return -1;
-
-    /* generate picture parameters */
-    x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-    x264_pps_write( &h->out.bs, h->sps, h->pps );
-    if( x264_nal_end( h ) )
-        return -1;
-
-    /* identify ourselves */
-    x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-    if( x264_sei_version_write( h, &h->out.bs ) )
-        return -1;
-    if( x264_nal_end( h ) )
-        return -1;
-
-    frame_size = x264_encoder_encapsulate_nals( h, 0 );
-    if( frame_size < 0 )
-        return -1;
-
-    /* now set output*/
-    *pi_nal = h->out.i_nal;
-    *pp_nal = &h->out.nal[0];
-    h->out.i_nal = 0;
-
-    return frame_size;
-}
-
-/* Check to see whether we have chosen a reference list ordering different
- * from the standard's default. */
-static inline void x264_reference_check_reorder( x264_t *h )
-{
-    /* The reorder check doesn't check for missing frames, so just
-     * force a reorder if one of the reference list is corrupt. */
-    for( int i = 0; h->frames.reference[i]; i++ )
-        if( h->frames.reference[i]->b_corrupt )
-        {
-            h->b_ref_reorder[0] = 1;
-            return;
-        }
-    for( int list = 0; list <= (h->sh.i_type == SLICE_TYPE_B); list++ )
-        for( int i = 0; i < h->i_ref[list] - 1; i++ )
-        {
-            int framenum_diff = h->fref[list][i+1]->i_frame_num - h->fref[list][i]->i_frame_num;
-            int poc_diff = h->fref[list][i+1]->i_poc - h->fref[list][i]->i_poc;
-            /* P and B-frames use different default orders. */
-            if( h->sh.i_type == SLICE_TYPE_P ? framenum_diff > 0 : list == 1 ? poc_diff < 0 : poc_diff > 0 )
-            {
-                h->b_ref_reorder[list] = 1;
-                return;
-            }
-        }
-}
-
-/* return -1 on failure, else return the index of the new reference frame */
-int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w )
-{
-    int i = h->i_ref[0];
-    int j = 1;
-    x264_frame_t *newframe;
-    if( i <= 1 ) /* empty list, definitely can't duplicate frame */
-        return -1;
-
-    //Duplication is only used in X264_WEIGHTP_SMART
-    if( h->param.analyse.i_weighted_pred != X264_WEIGHTP_SMART )
-        return -1;
-
-    /* Duplication is a hack to compensate for crappy rounding in motion compensation.
-     * With high bit depth, it's not worth doing, so turn it off except in the case of
-     * unweighted dupes. */
-    if( BIT_DEPTH > 8 && w != x264_weight_none )
-        return -1;
-
-    newframe = x264_frame_pop_blank_unused( h );
-    if( !newframe )
-        return -1;
-
-    //FIXME: probably don't need to copy everything
-    *newframe = *h->fref[0][i_ref];
-    newframe->i_reference_count = 1;
-    newframe->orig = h->fref[0][i_ref];
-    newframe->b_duplicate = 1;
-    memcpy( h->fenc->weight[j], w, sizeof(h->fenc->weight[i]) );
-
-    /* shift the frames to make space for the dupe. */
-    h->b_ref_reorder[0] = 1;
-    if( h->i_ref[0] < X264_REF_MAX )
-        ++h->i_ref[0];
-    h->fref[0][X264_REF_MAX-1] = NULL;
-    x264_frame_unshift( &h->fref[0][j], newframe );
-
-    return j;
-}
-
-static void x264_weighted_pred_init( x264_t *h )
-{
-    /* for now no analysis and set all weights to nothing */
-    for( int i_ref = 0; i_ref < h->i_ref[0]; i_ref++ )
-        h->fenc->weighted[i_ref] = h->fref[0][i_ref]->filtered[0][0];
-
-    // FIXME: This only supports weighting of one reference frame
-    // and duplicates of that frame.
-    h->fenc->i_lines_weighted = 0;
-
-    for( int i_ref = 0; i_ref < (h->i_ref[0] << SLICE_MBAFF); i_ref++ )
-        for( int i = 0; i < 3; i++ )
-            h->sh.weight[i_ref][i].weightfn = NULL;
-
-
-    if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 )
-        return;
-
-    int i_padv = PADV << PARAM_INTERLACED;
-    int denom = -1;
-    int weightplane[2] = { 0, 0 };
-    int buffer_next = 0;
-    for( int i = 0; i < 3; i++ )
-    {
-        for( int j = 0; j < h->i_ref[0]; j++ )
-        {
-            if( h->fenc->weight[j][i].weightfn )
-            {
-                h->sh.weight[j][i] = h->fenc->weight[j][i];
-                // if weight is useless, don't write it to stream
-                if( h->sh.weight[j][i].i_scale == 1<<h->sh.weight[j][i].i_denom && h->sh.weight[j][i].i_offset == 0 )
-                    h->sh.weight[j][i].weightfn = NULL;
-                else
-                {
-                    if( !weightplane[!!i] )
-                    {
-                        weightplane[!!i] = 1;
-                        h->sh.weight[0][!!i].i_denom = denom = h->sh.weight[j][i].i_denom;
-                        assert( x264_clip3( denom, 0, 7 ) == denom );
-                    }
-
-                    assert( h->sh.weight[j][i].i_denom == denom );
-                    if( !i )
-                    {
-                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH;
-                        //scale full resolution frame
-                        if( h->param.i_threads == 1 )
-                        {
-                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH;
-                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
-                            int stride = h->fenc->i_stride[0];
-                            int width = h->fenc->i_width[0] + PADH*2;
-                            int height = h->fenc->i_lines[0] + i_padv*2;
-                            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
-                            h->fenc->i_lines_weighted = height;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    if( weightplane[1] )
-        for( int i = 0; i < h->i_ref[0]; i++ )
-        {
-            if( h->sh.weight[i][1].weightfn && !h->sh.weight[i][2].weightfn )
-            {
-                h->sh.weight[i][2].i_scale = 1 << h->sh.weight[0][1].i_denom;
-                h->sh.weight[i][2].i_offset = 0;
-            }
-            else if( h->sh.weight[i][2].weightfn && !h->sh.weight[i][1].weightfn )
-            {
-                h->sh.weight[i][1].i_scale = 1 << h->sh.weight[0][1].i_denom;
-                h->sh.weight[i][1].i_offset = 0;
-            }
-        }
-
-    if( !weightplane[0] )
-        h->sh.weight[0][0].i_denom = 0;
-    if( !weightplane[1] )
-        h->sh.weight[0][1].i_denom = 0;
-    h->sh.weight[0][2].i_denom = h->sh.weight[0][1].i_denom;
-}
-
-static inline int x264_reference_distance( x264_t *h, x264_frame_t *frame )
-{
-    if( h->param.i_frame_packing == 5 )
-        return abs((h->fenc->i_frame&~1) - (frame->i_frame&~1)) +
-                  ((h->fenc->i_frame&1) != (frame->i_frame&1));
-    else
-        return abs(h->fenc->i_frame - frame->i_frame);
-}
-
-static inline void x264_reference_build_list( x264_t *h, int i_poc )
-{
-    int b_ok;
-
-    /* build ref list 0/1 */
-    h->mb.pic.i_fref[0] = h->i_ref[0] = 0;
-    h->mb.pic.i_fref[1] = h->i_ref[1] = 0;
-    if( h->sh.i_type == SLICE_TYPE_I )
-        return;
-
-    for( int i = 0; h->frames.reference[i]; i++ )
-    {
-        if( h->frames.reference[i]->b_corrupt )
-            continue;
-        if( h->frames.reference[i]->i_poc < i_poc )
-            h->fref[0][h->i_ref[0]++] = h->frames.reference[i];
-        else if( h->frames.reference[i]->i_poc > i_poc )
-            h->fref[1][h->i_ref[1]++] = h->frames.reference[i];
-    }
-
-    if( h->sh.i_mmco_remove_from_end )
-    {
-        /* Order ref0 for MMCO remove */
-        do
-        {
-            b_ok = 1;
-            for( int i = 0; i < h->i_ref[0] - 1; i++ )
-            {
-                if( h->fref[0][i]->i_frame < h->fref[0][i+1]->i_frame )
-                {
-                    XCHG( x264_frame_t*, h->fref[0][i], h->fref[0][i+1] );
-                    b_ok = 0;
-                    break;
-                }
-            }
-        } while( !b_ok );
-
-        for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- )
-        {
-            int diff = h->i_frame_num - h->fref[0][i]->i_frame_num;
-            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc;
-            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
-        }
-    }
-
-    /* Order reference lists by distance from the current frame. */
-    for( int list = 0; list < 2; list++ )
-    {
-        h->fref_nearest[list] = h->fref[list][0];
-        do
-        {
-            b_ok = 1;
-            for( int i = 0; i < h->i_ref[list] - 1; i++ )
-            {
-                if( list ? h->fref[list][i+1]->i_poc < h->fref_nearest[list]->i_poc
-                         : h->fref[list][i+1]->i_poc > h->fref_nearest[list]->i_poc )
-                    h->fref_nearest[list] = h->fref[list][i+1];
-                if( x264_reference_distance( h, h->fref[list][i] ) > x264_reference_distance( h, h->fref[list][i+1] ) )
-                {
-                    XCHG( x264_frame_t*, h->fref[list][i], h->fref[list][i+1] );
-                    b_ok = 0;
-                    break;
-                }
-            }
-        } while( !b_ok );
-    }
-
-    x264_reference_check_reorder( h );
-
-    h->i_ref[1] = X264_MIN( h->i_ref[1], h->frames.i_max_ref1 );
-    h->i_ref[0] = X264_MIN( h->i_ref[0], h->frames.i_max_ref0 );
-    h->i_ref[0] = X264_MIN( h->i_ref[0], h->param.i_frame_reference ); // if reconfig() has lowered the limit
-
-    /* For Blu-ray compliance, don't reference frames outside of the minigop. */
-    if( IS_X264_TYPE_B( h->fenc->i_type ) && h->param.b_bluray_compat )
-        h->i_ref[0] = X264_MIN( h->i_ref[0], IS_X264_TYPE_B( h->fref[0][0]->i_type ) + 1 );
-
-    /* add duplicates */
-    if( h->fenc->i_type == X264_TYPE_P )
-    {
-        int idx = -1;
-        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
-        {
-            x264_weight_t w[3];
-            w[1].weightfn = w[2].weightfn = NULL;
-            if( h->param.rc.b_stat_read )
-                x264_ratecontrol_set_weights( h, h->fenc );
-
-            if( !h->fenc->weight[0][0].weightfn )
-            {
-                h->fenc->weight[0][0].i_denom = 0;
-                SET_WEIGHT( w[0], 1, 1, 0, -1 );
-                idx = x264_weighted_reference_duplicate( h, 0, w );
-            }
-            else
-            {
-                if( h->fenc->weight[0][0].i_scale == 1<<h->fenc->weight[0][0].i_denom )
-                {
-                    SET_WEIGHT( h->fenc->weight[0][0], 1, 1, 0, h->fenc->weight[0][0].i_offset );
-                }
-                x264_weighted_reference_duplicate( h, 0, x264_weight_none );
-                if( h->fenc->weight[0][0].i_offset > -128 )
-                {
-                    w[0] = h->fenc->weight[0][0];
-                    w[0].i_offset--;
-                    h->mc.weight_cache( h, &w[0] );
-                    idx = x264_weighted_reference_duplicate( h, 0, w );
-                }
-            }
-        }
-        h->mb.ref_blind_dupe = idx;
-    }
-
-    assert( h->i_ref[0] + h->i_ref[1] <= X264_REF_MAX );
-    h->mb.pic.i_fref[0] = h->i_ref[0];
-    h->mb.pic.i_fref[1] = h->i_ref[1];
-}
-
-static void x264_fdec_filter_row( x264_t *h, int mb_y, int pass )
-{
-    /* mb_y is the mb to be encoded next, not the mb to be filtered here */
-    int b_hpel = h->fdec->b_kept_as_ref;
-    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
-    int b_end = mb_y == h->i_threadslice_end;
-    int b_measure_quality = 1;
-    int min_y = mb_y - (1 << SLICE_MBAFF);
-    int b_start = min_y == h->i_threadslice_start;
-    /* Even in interlaced mode, deblocking never modifies more than 4 pixels
-     * above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs. */
-    int minpix_y = min_y*16 - 4 * !b_start;
-    int maxpix_y = mb_y*16 - 4 * !b_end;
-    b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
-    if( h->param.b_sliced_threads )
-    {
-        switch( pass )
-        {
-            /* During encode: only do deblock if asked for */
-            default:
-            case 0:
-                b_deblock &= h->param.b_full_recon;
-                b_hpel = 0;
-                break;
-            /* During post-encode pass: do deblock if not done yet, do hpel for all
-             * rows except those between slices. */
-            case 1:
-                b_deblock &= !h->param.b_full_recon;
-                b_hpel &= !(b_start && min_y > 0);
-                b_measure_quality = 0;
-                break;
-            /* Final pass: do the rows between slices in sequence. */
-            case 2:
-                b_deblock = 0;
-                b_measure_quality = 0;
-                break;
-        }
-    }
-    if( mb_y & SLICE_MBAFF )
-        return;
-    if( min_y < h->i_threadslice_start )
-        return;
-
-    if( b_deblock )
-        for( int y = min_y; y < mb_y; y += (1 << SLICE_MBAFF) )
-            x264_frame_deblock_row( h, y );
-
-    /* FIXME: Prediction requires different borders for interlaced/progressive mc,
-     * but the actual image data is equivalent. For now, maintain this
-     * consistency by copying deblocked pixels between planes. */
-    if( PARAM_INTERLACED && (!h->param.b_sliced_threads || pass == 1) )
-        for( int p = 0; p < h->fdec->i_plane; p++ )
-            for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ )
-                memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
-                        h->fdec->plane[p]     + i*h->fdec->i_stride[p],
-                        h->mb.i_mb_width*16*sizeof(pixel) );
-
-    if( h->fdec->b_kept_as_ref && (!h->param.b_sliced_threads || pass == 1) )
-        x264_frame_expand_border( h, h->fdec, min_y );
-    if( b_hpel )
-    {
-        int end = mb_y == h->mb.i_mb_height;
-        /* Can't do hpel until the previous slice is done encoding. */
-        if( h->param.analyse.i_subpel_refine )
-        {
-            x264_frame_filter( h, h->fdec, min_y, end );
-            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
-        }
-    }
-
-    if( SLICE_MBAFF && pass == 0 )
-        for( int i = 0; i < 3; i++ )
-        {
-            XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );
-            XCHG( pixel *, h->intra_border_backup[1][i], h->intra_border_backup[4][i] );
-        }
-
-    if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
-        x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << SLICE_MBAFF)) );
-
-    if( b_measure_quality )
-    {
-        maxpix_y = X264_MIN( maxpix_y, h->param.i_height );
-        if( h->param.analyse.b_psnr )
-        {
-            for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
-                h->stat.frame.i_ssd[p] += x264_pixel_ssd_wxh( &h->pixf,
-                    h->fdec->plane[p] + minpix_y * h->fdec->i_stride[p], h->fdec->i_stride[p],
-                    h->fenc->plane[p] + minpix_y * h->fenc->i_stride[p], h->fenc->i_stride[p],
-                    h->param.i_width, maxpix_y-minpix_y );
-            if( !CHROMA444 )
-            {
-                uint64_t ssd_u, ssd_v;
-                int v_shift = CHROMA_V_SHIFT;
-                x264_pixel_ssd_nv12( &h->pixf,
-                    h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1],
-                    h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1],
-                    h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v );
-                h->stat.frame.i_ssd[1] += ssd_u;
-                h->stat.frame.i_ssd[2] += ssd_v;
-            }
-        }
-
-        if( h->param.analyse.b_ssim )
-        {
-            int ssim_cnt;
-            x264_emms();
-            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
-             * and overlap by 4 */
-            minpix_y += b_start ? 2 : -6;
-            h->stat.frame.f_ssim +=
-                x264_pixel_ssim_wxh( &h->pixf,
-                    h->fdec->plane[0] + 2+minpix_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
-                    h->fenc->plane[0] + 2+minpix_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                    h->param.i_width-2, maxpix_y-minpix_y, h->scratch_buffer, &ssim_cnt );
-            h->stat.frame.i_ssim_cnt += ssim_cnt;
-        }
-    }
-}
-
-static inline int x264_reference_update( x264_t *h )
-{
-    if( !h->fdec->b_kept_as_ref )
-    {
-        if( h->i_thread_frames > 1 )
-        {
-            x264_frame_push_unused( h, h->fdec );
-            h->fdec = x264_frame_pop_unused( h, 1 );
-            if( !h->fdec )
-                return -1;
-        }
-        return 0;
-    }
-
-    /* apply mmco from previous frame. */
-    for( int i = 0; i < h->sh.i_mmco_command_count; i++ )
-        for( int j = 0; h->frames.reference[j]; j++ )
-            if( h->frames.reference[j]->i_poc == h->sh.mmco[i].i_poc )
-                x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[j] ) );
-
-    /* move frame in the buffer */
-    x264_frame_push( h->frames.reference, h->fdec );
-    if( h->frames.reference[h->sps->i_num_ref_frames] )
-        x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
-    h->fdec = x264_frame_pop_unused( h, 1 );
-    if( !h->fdec )
-        return -1;
-    return 0;
-}
-
-static inline void x264_reference_reset( x264_t *h )
-{
-    while( h->frames.reference[0] )
-        x264_frame_push_unused( h, x264_frame_pop( h->frames.reference ) );
-    h->fdec->i_poc =
-    h->fenc->i_poc = 0;
-}
-
-static inline void x264_reference_hierarchy_reset( x264_t *h )
-{
-    int ref;
-    int b_hasdelayframe = 0;
-
-    /* look for delay frames -- chain must only contain frames that are disposable */
-    for( int i = 0; h->frames.current[i] && IS_DISPOSABLE( h->frames.current[i]->i_type ); i++ )
-        b_hasdelayframe |= h->frames.current[i]->i_coded
-                        != h->frames.current[i]->i_frame + h->sps->vui.i_num_reorder_frames;
-
-    /* This function must handle b-pyramid and clear frames for open-gop */
-    if( h->param.i_bframe_pyramid != X264_B_PYRAMID_STRICT && !b_hasdelayframe && h->frames.i_poc_last_open_gop == -1 )
-        return;
-
-    /* Remove last BREF. There will never be old BREFs in the
-     * dpb during a BREF decode when pyramid == STRICT */
-    for( ref = 0; h->frames.reference[ref]; ref++ )
-    {
-        if( ( h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT
-            && h->frames.reference[ref]->i_type == X264_TYPE_BREF )
-            || ( h->frames.reference[ref]->i_poc < h->frames.i_poc_last_open_gop
-            && h->sh.i_type != SLICE_TYPE_B ) )
-        {
-            int diff = h->i_frame_num - h->frames.reference[ref]->i_frame_num;
-            h->sh.mmco[h->sh.i_mmco_command_count].i_difference_of_pic_nums = diff;
-            h->sh.mmco[h->sh.i_mmco_command_count++].i_poc = h->frames.reference[ref]->i_poc;
-            x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[ref] ) );
-            h->b_ref_reorder[0] = 1;
-            ref--;
-        }
-    }
-
-    /* Prepare room in the dpb for the delayed display time of the later b-frame's */
-    if( h->param.i_bframe_pyramid )
-        h->sh.i_mmco_remove_from_end = X264_MAX( ref + 2 - h->frames.i_max_dpb, 0 );
-}
-
-static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
-{
-    /* ------------------------ Create slice header  ----------------------- */
-    if( i_nal_type == NAL_SLICE_IDR )
-    {
-        x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
-
-        /* alternate id */
-        if( h->param.i_avcintra_class )
-        {
-            switch( h->i_idr_pic_id )
-            {
-                case 5:
-                    h->i_idr_pic_id = 3;
-                    break;
-                case 3:
-                    h->i_idr_pic_id = 4;
-                    break;
-                case 4:
-                default:
-                    h->i_idr_pic_id = 5;
-                    break;
-            }
-        }
-        else
-            h->i_idr_pic_id ^= 1;
-    }
-    else
-    {
-        x264_slice_header_init( h, &h->sh, h->sps, h->pps, -1, h->i_frame_num, i_global_qp );
-
-        h->sh.i_num_ref_idx_l0_active = h->i_ref[0] <= 0 ? 1 : h->i_ref[0];
-        h->sh.i_num_ref_idx_l1_active = h->i_ref[1] <= 0 ? 1 : h->i_ref[1];
-        if( h->sh.i_num_ref_idx_l0_active != h->pps->i_num_ref_idx_l0_default_active ||
-            (h->sh.i_type == SLICE_TYPE_B && h->sh.i_num_ref_idx_l1_active != h->pps->i_num_ref_idx_l1_default_active) )
-        {
-            h->sh.b_num_ref_idx_override = 1;
-        }
-    }
-
-    if( h->fenc->i_type == X264_TYPE_BREF && h->param.b_bluray_compat && h->sh.i_mmco_command_count )
-    {
-        h->b_sh_backup = 1;
-        h->sh_backup = h->sh;
-    }
-
-    h->fdec->i_frame_num = h->sh.i_frame_num;
-
-    if( h->sps->i_poc_type == 0 )
-    {
-        h->sh.i_poc = h->fdec->i_poc;
-        if( PARAM_INTERLACED )
-        {
-            h->sh.i_delta_poc_bottom = h->param.b_tff ? 1 : -1;
-            h->sh.i_poc += h->sh.i_delta_poc_bottom == -1;
-        }
-        else
-            h->sh.i_delta_poc_bottom = 0;
-        h->fdec->i_delta_poc[0] = h->sh.i_delta_poc_bottom == -1;
-        h->fdec->i_delta_poc[1] = h->sh.i_delta_poc_bottom ==  1;
-    }
-    else
-    {
-        /* Nothing to do ? */
-    }
-
-    x264_macroblock_slice_init( h );
-}
-
-typedef struct
-{
-    int skip;
-    uint8_t cabac_prevbyte;
-    bs_t bs;
-    x264_cabac_t cabac;
-    x264_frame_stat_t stat;
-    int last_qp;
-    int last_dqp;
-    int field_decoding_flag;
-} x264_bs_bak_t;
-
-static ALWAYS_INLINE void x264_bitstream_backup( x264_t *h, x264_bs_bak_t *bak, int i_skip, int full )
-{
-    if( full )
-    {
-        bak->stat = h->stat.frame;
-        bak->last_qp = h->mb.i_last_qp;
-        bak->last_dqp = h->mb.i_last_dqp;
-        bak->field_decoding_flag = h->mb.field_decoding_flag;
-    }
-    else
-    {
-        bak->stat.i_mv_bits = h->stat.frame.i_mv_bits;
-        bak->stat.i_tex_bits = h->stat.frame.i_tex_bits;
-    }
-    /* In the per-MB backup, we don't need the contexts because flushing the CABAC
-     * encoder has no context dependency and in this case, a slice is ended (and
-     * thus the content of all contexts are thrown away). */
-    if( h->param.b_cabac )
-    {
-        if( full )
-            memcpy( &bak->cabac, &h->cabac, sizeof(x264_cabac_t) );
-        else
-            memcpy( &bak->cabac, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
-        /* x264's CABAC writer modifies the previous byte during carry, so it has to be
-         * backed up. */
-        bak->cabac_prevbyte = h->cabac.p[-1];
-    }
-    else
-    {
-        bak->bs = h->out.bs;
-        bak->skip = i_skip;
-    }
-}
-
-static ALWAYS_INLINE void x264_bitstream_restore( x264_t *h, x264_bs_bak_t *bak, int *skip, int full )
-{
-    if( full )
-    {
-        h->stat.frame = bak->stat;
-        h->mb.i_last_qp = bak->last_qp;
-        h->mb.i_last_dqp = bak->last_dqp;
-        h->mb.field_decoding_flag = bak->field_decoding_flag;
-    }
-    else
-    {
-        h->stat.frame.i_mv_bits = bak->stat.i_mv_bits;
-        h->stat.frame.i_tex_bits = bak->stat.i_tex_bits;
-    }
-    if( h->param.b_cabac )
-    {
-        if( full )
-            memcpy( &h->cabac, &bak->cabac, sizeof(x264_cabac_t) );
-        else
-            memcpy( &h->cabac, &bak->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
-        h->cabac.p[-1] = bak->cabac_prevbyte;
-    }
-    else
-    {
-        h->out.bs = bak->bs;
-        *skip = bak->skip;
-    }
-}
-
-static intptr_t x264_slice_write( x264_t *h )
-{
-    int i_skip;
-    int mb_xy, i_mb_x, i_mb_y;
-    /* NALUs other than the first use a 3-byte startcode.
-     * Add one extra byte for the rbsp, and one more for the final CABAC putbyte.
-     * Then add an extra 5 bytes just in case, to account for random NAL escapes and
-     * other inaccuracies. */
-    int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 1 + h->param.b_cabac + 5;
-    int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : 0;
-    int back_up_bitstream_cavlc = !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH;
-    int back_up_bitstream = slice_max_size || back_up_bitstream_cavlc;
-    int starting_bits = bs_pos(&h->out.bs);
-    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
-    int b_hpel = h->fdec->b_kept_as_ref;
-    int orig_last_mb = h->sh.i_last_mb;
-    int thread_last_mb = h->i_threadslice_end * h->mb.i_mb_width - 1;
-    uint8_t *last_emu_check;
-#define BS_BAK_SLICE_MAX_SIZE 0
-#define BS_BAK_CAVLC_OVERFLOW 1
-#define BS_BAK_SLICE_MIN_MBS  2
-#define BS_BAK_ROW_VBV        3
-    x264_bs_bak_t bs_bak[4];
-    b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
-    bs_realign( &h->out.bs );
-
-    /* Slice */
-    x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
-    h->out.nal[h->out.i_nal].i_first_mb = h->sh.i_first_mb;
-
-    /* Slice header */
-    x264_macroblock_thread_init( h );
-
-    /* Set the QP equal to the first QP in the slice for more accurate CABAC initialization. */
-    h->mb.i_mb_xy = h->sh.i_first_mb;
-    h->sh.i_qp = x264_ratecontrol_mb_qp( h );
-    h->sh.i_qp = SPEC_QP( h->sh.i_qp );
-    h->sh.i_qp_delta = h->sh.i_qp - h->pps->i_pic_init_qp;
-
-    x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
-    if( h->param.b_cabac )
-    {
-        /* alignment needed */
-        bs_align_1( &h->out.bs );
-
-        /* init cabac */
-        x264_cabac_context_init( h, &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc );
-        x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end );
-        last_emu_check = h->cabac.p;
-    }
-    else
-        last_emu_check = h->out.bs.p;
-    h->mb.i_last_qp = h->sh.i_qp;
-    h->mb.i_last_dqp = 0;
-    h->mb.field_decoding_flag = 0;
-
-    i_mb_y = h->sh.i_first_mb / h->mb.i_mb_width;
-    i_mb_x = h->sh.i_first_mb % h->mb.i_mb_width;
-    i_skip = 0;
-
-    while( 1 )
-    {
-        mb_xy = i_mb_x + i_mb_y * h->mb.i_mb_width;
-        int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
-
-        if( i_mb_x == 0 )
-        {
-            if( x264_bitstream_check_buffer( h ) )
-                return -1;
-            if( !(i_mb_y & SLICE_MBAFF) && h->param.rc.i_vbv_buffer_size )
-                x264_bitstream_backup( h, &bs_bak[BS_BAK_ROW_VBV], i_skip, 1 );
-            if( !h->mb.b_reencode_mb )
-                x264_fdec_filter_row( h, i_mb_y, 0 );
-        }
-
-        if( back_up_bitstream )
-        {
-            if( back_up_bitstream_cavlc )
-                x264_bitstream_backup( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], i_skip, 0 );
-            if( slice_max_size && !(i_mb_y & SLICE_MBAFF) )
-            {
-                x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 );
-                if( (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs )
-                    x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 );
-            }
-        }
-
-        if( PARAM_INTERLACED )
-        {
-            if( h->mb.b_adaptive_mbaff )
-            {
-                if( !(i_mb_y&1) )
-                {
-                    /* FIXME: VSAD is fast but fairly poor at choosing the best interlace type. */
-                    h->mb.b_interlaced = x264_field_vsad( h, i_mb_x, i_mb_y );
-                    memcpy( &h->zigzagf, MB_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) );
-                    if( !MB_INTERLACED && (i_mb_y+2) == h->mb.i_mb_height )
-                        x264_expand_border_mbpair( h, i_mb_x, i_mb_y );
-                }
-            }
-            h->mb.field[mb_xy] = MB_INTERLACED;
-        }
-
-        /* load cache */
-        if( SLICE_MBAFF )
-            x264_macroblock_cache_load_interlaced( h, i_mb_x, i_mb_y );
-        else
-            x264_macroblock_cache_load_progressive( h, i_mb_x, i_mb_y );
-
-        x264_macroblock_analyse( h );
-
-        /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
-reencode:
-        x264_macroblock_encode( h );
-
-        if( h->param.b_cabac )
-        {
-            if( mb_xy > h->sh.i_first_mb && !(SLICE_MBAFF && (i_mb_y&1)) )
-                x264_cabac_encode_terminal( &h->cabac );
-
-            if( IS_SKIP( h->mb.i_type ) )
-                x264_cabac_mb_skip( h, 1 );
-            else
-            {
-                if( h->sh.i_type != SLICE_TYPE_I )
-                    x264_cabac_mb_skip( h, 0 );
-                x264_macroblock_write_cabac( h, &h->cabac );
-            }
-        }
-        else
-        {
-            if( IS_SKIP( h->mb.i_type ) )
-                i_skip++;
-            else
-            {
-                if( h->sh.i_type != SLICE_TYPE_I )
-                {
-                    bs_write_ue( &h->out.bs, i_skip );  /* skip run */
-                    i_skip = 0;
-                }
-                x264_macroblock_write_cavlc( h );
-                /* If there was a CAVLC level code overflow, try again at a higher QP. */
-                if( h->mb.b_overflow )
-                {
-                    h->mb.i_chroma_qp = h->chroma_qp_table[++h->mb.i_qp];
-                    h->mb.i_skip_intra = 0;
-                    h->mb.b_skip_mc = 0;
-                    h->mb.b_overflow = 0;
-                    x264_bitstream_restore( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], &i_skip, 0 );
-                    goto reencode;
-                }
-            }
-        }
-
-        int total_bits = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
-        int mb_size = total_bits - mb_spos;
-
-        if( slice_max_size && (!SLICE_MBAFF || (i_mb_y&1)) )
-        {
-            /* Count the skip run, just in case. */
-            if( !h->param.b_cabac )
-                total_bits += bs_size_ue_big( i_skip );
-            /* Check for escape bytes. */
-            uint8_t *end = h->param.b_cabac ? h->cabac.p : h->out.bs.p;
-            for( ; last_emu_check < end - 2; last_emu_check++ )
-                if( last_emu_check[0] == 0 && last_emu_check[1] == 0 && last_emu_check[2] <= 3 )
-                {
-                    slice_max_size -= 8;
-                    last_emu_check++;
-                }
-            /* We'll just re-encode this last macroblock if we go over the max slice size. */
-            if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb )
-            {
-                if( !x264_frame_new_slice( h, h->fdec ) )
-                {
-                    /* Handle the most obnoxious slice-min-mbs edge case: we need to end the slice
-                     * because it's gone over the maximum size, but doing so would violate slice-min-mbs.
-                     * If possible, roll back to the last checkpoint and try again.
-                     * We could try raising QP, but that would break in the case where a slice spans multiple
-                     * rows, which the re-encoding infrastructure can't currently handle. */
-                    if( mb_xy <= thread_last_mb && (thread_last_mb+1-mb_xy) < h->param.i_slice_min_mbs )
-                    {
-                        if( thread_last_mb-h->param.i_slice_min_mbs < h->sh.i_first_mb+h->param.i_slice_min_mbs )
-                        {
-                            x264_log( h, X264_LOG_WARNING, "slice-max-size violated (frame %d, cause: slice-min-mbs)\n", h->i_frame );
-                            slice_max_size = 0;
-                            goto cont;
-                        }
-                        x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], &i_skip, 0 );
-                        h->mb.b_reencode_mb = 1;
-                        h->sh.i_last_mb = thread_last_mb-h->param.i_slice_min_mbs;
-                        break;
-                    }
-                    if( mb_xy-SLICE_MBAFF*h->mb.i_mb_stride != h->sh.i_first_mb )
-                    {
-                        x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 );
-                        h->mb.b_reencode_mb = 1;
-                        if( SLICE_MBAFF )
-                        {
-                            // set to bottom of previous mbpair
-                            if( i_mb_x )
-                                h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1));
-                            else
-                                h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1;
-                        }
-                        else
-                            h->sh.i_last_mb = mb_xy-1;
-                        break;
-                    }
-                    else
-                        h->sh.i_last_mb = mb_xy;
-                }
-                else
-                    slice_max_size = 0;
-            }
-        }
-cont:
-        h->mb.b_reencode_mb = 0;
-
-        /* save cache */
-        x264_macroblock_cache_save( h );
-
-        if( x264_ratecontrol_mb( h, mb_size ) < 0 )
-        {
-            x264_bitstream_restore( h, &bs_bak[BS_BAK_ROW_VBV], &i_skip, 1 );
-            h->mb.b_reencode_mb = 1;
-            i_mb_x = 0;
-            i_mb_y = i_mb_y - SLICE_MBAFF;
-            h->mb.i_mb_prev_xy = i_mb_y * h->mb.i_mb_stride - 1;
-            h->sh.i_last_mb = orig_last_mb;
-            continue;
-        }
-
-        /* accumulate mb stats */
-        h->stat.frame.i_mb_count[h->mb.i_type]++;
-
-        int b_intra = IS_INTRA( h->mb.i_type );
-        int b_skip = IS_SKIP( h->mb.i_type );
-        if( h->param.i_log_level >= X264_LOG_INFO || h->param.rc.b_stat_write )
-        {
-            if( !b_intra && !b_skip && !IS_DIRECT( h->mb.i_type ) )
-            {
-                if( h->mb.i_partition != D_8x8 )
-                        h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
-                    else
-                        for( int i = 0; i < 4; i++ )
-                            h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
-                if( h->param.i_frame_reference > 1 )
-                    for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
-                        for( int i = 0; i < 4; i++ )
-                        {
-                            int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ];
-                            if( i_ref >= 0 )
-                                h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
-                        }
-            }
-        }
-
-        if( h->param.i_log_level >= X264_LOG_INFO )
-        {
-            if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
-            {
-                if( CHROMA444 )
-                {
-                    for( int i = 0; i < 4; i++ )
-                        if( h->mb.i_cbp_luma & (1 << i) )
-                            for( int p = 0; p < 3; p++ )
-                            {
-                                int s8 = i*4+p*16;
-                                int nnz8x8 = M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+0] )
-                                           | M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+8] );
-                                h->stat.frame.i_mb_cbp[!b_intra + p*2] += !!nnz8x8;
-                            }
-                }
-                else
-                {
-                    int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
-                               + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
-                    h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
-                    h->stat.frame.i_mb_cbp[!b_intra + 2] += !!h->mb.i_cbp_chroma;
-                    h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma >> 1;
-                }
-            }
-            if( h->mb.i_cbp_luma && !b_intra )
-            {
-                h->stat.frame.i_mb_count_8x8dct[0] ++;
-                h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
-            }
-            if( b_intra && h->mb.i_type != I_PCM )
-            {
-                if( h->mb.i_type == I_16x16 )
-                    h->stat.frame.i_mb_pred_mode[0][h->mb.i_intra16x16_pred_mode]++;
-                else if( h->mb.i_type == I_8x8 )
-                    for( int i = 0; i < 16; i += 4 )
-                        h->stat.frame.i_mb_pred_mode[1][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
-                else //if( h->mb.i_type == I_4x4 )
-                    for( int i = 0; i < 16; i++ )
-                        h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
-                h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++;
-            }
-            h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED;
-        }
-
-        /* calculate deblock strength values (actual deblocking is done per-row along with hpel) */
-        if( b_deblock )
-            x264_macroblock_deblock_strength( h );
-
-        if( mb_xy == h->sh.i_last_mb )
-            break;
-
-        if( SLICE_MBAFF )
-        {
-            i_mb_x += i_mb_y & 1;
-            i_mb_y ^= i_mb_x < h->mb.i_mb_width;
-        }
-        else
-            i_mb_x++;
-        if( i_mb_x == h->mb.i_mb_width )
-        {
-            i_mb_y++;
-            i_mb_x = 0;
-        }
-    }
-    if( h->sh.i_last_mb < h->sh.i_first_mb )
-        return 0;
-
-    h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb;
-
-    if( h->param.b_cabac )
-    {
-        x264_cabac_encode_flush( h, &h->cabac );
-        h->out.bs.p = h->cabac.p;
-    }
-    else
-    {
-        if( i_skip > 0 )
-            bs_write_ue( &h->out.bs, i_skip );  /* last skip run */
-        /* rbsp_slice_trailing_bits */
-        bs_rbsp_trailing( &h->out.bs );
-        bs_flush( &h->out.bs );
-    }
-    if( x264_nal_end( h ) )
-        return -1;
-
-    if( h->sh.i_last_mb == (h->i_threadslice_end * h->mb.i_mb_width - 1) )
-    {
-        h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
-                                  + (h->out.i_nal*NALU_OVERHEAD * 8)
-                                  - h->stat.frame.i_tex_bits
-                                  - h->stat.frame.i_mv_bits;
-        x264_fdec_filter_row( h, h->i_threadslice_end, 0 );
-
-        if( h->param.b_sliced_threads )
-        {
-            /* Tell the main thread we're done. */
-            x264_threadslice_cond_broadcast( h, 1 );
-            /* Do hpel now */
-            for( int mb_y = h->i_threadslice_start; mb_y <= h->i_threadslice_end; mb_y++ )
-                x264_fdec_filter_row( h, mb_y, 1 );
-            x264_threadslice_cond_broadcast( h, 2 );
-            /* Do the first row of hpel, now that the previous slice is done */
-            if( h->i_thread_idx > 0 )
-            {
-                x264_threadslice_cond_wait( h->thread[h->i_thread_idx-1], 2 );
-                x264_fdec_filter_row( h, h->i_threadslice_start + (1 << SLICE_MBAFF), 2 );
-            }
-        }
-
-        /* Free mb info after the last thread's done using it */
-        if( h->fdec->mb_info_free && (!h->param.b_sliced_threads || h->i_thread_idx == (h->param.i_threads-1)) )
-        {
-            h->fdec->mb_info_free( h->fdec->mb_info );
-            h->fdec->mb_info = NULL;
-            h->fdec->mb_info_free = NULL;
-        }
-    }
-
-    return 0;
-}
-
-static void x264_thread_sync_context( x264_t *dst, x264_t *src )
-{
-    if( dst == src )
-        return;
-
-    // reference counting
-    for( x264_frame_t **f = src->frames.reference; *f; f++ )
-        (*f)->i_reference_count++;
-    for( x264_frame_t **f = dst->frames.reference; *f; f++ )
-        x264_frame_push_unused( src, *f );
-    src->fdec->i_reference_count++;
-    x264_frame_push_unused( src, dst->fdec );
-
-    // copy everything except the per-thread pointers and the constants.
-    memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.base) - offsetof(x264_t, i_frame) );
-    dst->param = src->param;
-    dst->stat = src->stat;
-    dst->pixf = src->pixf;
-    dst->reconfig = src->reconfig;
-}
-
-static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
-{
-    if( dst != src )
-        memcpy( &dst->stat, &src->stat, offsetof(x264_t, stat.frame) - offsetof(x264_t, stat) );
-}
-
-static void *x264_slices_write( x264_t *h )
-{
-    int i_slice_num = 0;
-    int last_thread_mb = h->sh.i_last_mb;
-
-    /* init stats */
-    memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
-    h->mb.b_reencode_mb = 0;
-    while( h->sh.i_first_mb + SLICE_MBAFF*h->mb.i_mb_stride <= last_thread_mb )
-    {
-        h->sh.i_last_mb = last_thread_mb;
-        if( !i_slice_num || !x264_frame_new_slice( h, h->fdec ) )
-        {
-            if( h->param.i_slice_max_mbs )
-            {
-                if( SLICE_MBAFF )
-                {
-                    // convert first to mbaff form, add slice-max-mbs, then convert back to normal form
-                    int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width)
-                        + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width)
-                        + h->param.i_slice_max_mbs - 1;
-                    int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2;
-                    int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1;
-                    h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y;
-                }
-                else
-                {
-                    h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
-                    if( h->sh.i_last_mb < last_thread_mb && last_thread_mb - h->sh.i_last_mb < h->param.i_slice_min_mbs )
-                        h->sh.i_last_mb = last_thread_mb - h->param.i_slice_min_mbs;
-                }
-                i_slice_num++;
-            }
-            else if( h->param.i_slice_count && !h->param.b_sliced_threads )
-            {
-                int height = h->mb.i_mb_height >> PARAM_INTERLACED;
-                int width = h->mb.i_mb_width << PARAM_INTERLACED;
-                i_slice_num++;
-                h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
-            }
-        }
-        h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
-        if( x264_stack_align( x264_slice_write, h ) )
-            goto fail;
-        h->sh.i_first_mb = h->sh.i_last_mb + 1;
-        // if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order
-        if( SLICE_MBAFF && h->sh.i_first_mb % h->mb.i_mb_width )
-            h->sh.i_first_mb -= h->mb.i_mb_stride;
-    }
-
-    return (void *)0;
-
-fail:
-    /* Tell other threads we're done, so they wouldn't wait for it */
-    if( h->param.b_sliced_threads )
-        x264_threadslice_cond_broadcast( h, 2 );
-    return (void *)-1;
-}
-
-static int x264_threaded_slices_write( x264_t *h )
-{
-    /* set first/last mb and sync contexts */
-    for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        x264_t *t = h->thread[i];
-        if( i )
-        {
-            t->param = h->param;
-            memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) );
-        }
-        int height = h->mb.i_mb_height >> PARAM_INTERLACED;
-        t->i_threadslice_start = ((height *  i    + h->param.i_slice_count/2) / h->param.i_threads) << PARAM_INTERLACED;
-        t->i_threadslice_end   = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << PARAM_INTERLACED;
-        t->sh.i_first_mb = t->i_threadslice_start * h->mb.i_mb_width;
-        t->sh.i_last_mb  =   t->i_threadslice_end * h->mb.i_mb_width - 1;
-    }
-
-    x264_stack_align( x264_analyse_weight_frame, h, h->mb.i_mb_height*16 + 16 );
-
-    x264_threads_distribute_ratecontrol( h );
-
-    /* setup */
-    for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        h->thread[i]->i_thread_idx = i;
-        h->thread[i]->b_thread_active = 1;
-        x264_threadslice_cond_broadcast( h->thread[i], 0 );
-    }
-    /* dispatch */
-    for( int i = 0; i < h->param.i_threads; i++ )
-        x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h->thread[i] );
-    /* wait */
-    for( int i = 0; i < h->param.i_threads; i++ )
-        x264_threadslice_cond_wait( h->thread[i], 1 );
-
-    x264_threads_merge_ratecontrol( h );
-
-    for( int i = 1; i < h->param.i_threads; i++ )
-    {
-        x264_t *t = h->thread[i];
-        for( int j = 0; j < t->out.i_nal; j++ )
-        {
-            h->out.nal[h->out.i_nal] = t->out.nal[j];
-            h->out.i_nal++;
-            x264_nal_check_buffer( h );
-        }
-        /* All entries in stat.frame are ints except for ssd/ssim. */
-        for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
-            ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
-        for( int j = 0; j < 3; j++ )
-            h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
-        h->stat.frame.f_ssim += t->stat.frame.f_ssim;
-        h->stat.frame.i_ssim_cnt += t->stat.frame.i_ssim_cnt;
-    }
-
-    return 0;
-}
-
-void x264_encoder_intra_refresh( x264_t *h )
-{
-    h = h->thread[h->i_thread_phase];
-    h->b_queued_intra_refresh = 1;
-}
-
-int x264_encoder_invalidate_reference( x264_t *h, int64_t pts )
-{
-    if( h->param.i_bframe )
-    {
-        x264_log( h, X264_LOG_ERROR, "x264_encoder_invalidate_reference is not supported with B-frames enabled\n" );
-        return -1;
-    }
-    if( h->param.b_intra_refresh )
-    {
-        x264_log( h, X264_LOG_ERROR, "x264_encoder_invalidate_reference is not supported with intra refresh enabled\n" );
-        return -1;
-    }
-    h = h->thread[h->i_thread_phase];
-    if( pts >= h->i_last_idr_pts )
-    {
-        for( int i = 0; h->frames.reference[i]; i++ )
-            if( pts <= h->frames.reference[i]->i_pts )
-                h->frames.reference[i]->b_corrupt = 1;
-        if( pts <= h->fdec->i_pts )
-            h->fdec->b_corrupt = 1;
-    }
-    return 0;
-}
-
-/****************************************************************************
- * x264_encoder_encode:
- *  XXX: i_poc   : is the poc of the current given picture
- *       i_frame : is the number of the frame being coded
- *  ex:  type frame poc
- *       I      0   2*0
- *       P      1   2*3
- *       B      2   2*1
- *       B      3   2*2
- *       P      4   2*6
- *       B      5   2*4
- *       B      6   2*5
- ****************************************************************************/
-int     x264_encoder_encode( x264_t *h,
-                             x264_nal_t **pp_nal, int *pi_nal,
-                             x264_picture_t *pic_in,
-                             x264_picture_t *pic_out )
-{
-    x264_t *thread_current, *thread_prev, *thread_oldest;
-    int i_nal_type, i_nal_ref_idc, i_global_qp;
-    int overhead = NALU_OVERHEAD;
-
-#if HAVE_OPENCL
-    if( h->opencl.b_fatal_error )
-        return -1;
-#endif
-
-    if( h->i_thread_frames > 1 )
-    {
-        thread_prev    = h->thread[ h->i_thread_phase ];
-        h->i_thread_phase = (h->i_thread_phase + 1) % h->i_thread_frames;
-        thread_current = h->thread[ h->i_thread_phase ];
-        thread_oldest  = h->thread[ (h->i_thread_phase + 1) % h->i_thread_frames ];
-        x264_thread_sync_context( thread_current, thread_prev );
-        x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest );
-        h = thread_current;
-    }
-    else
-    {
-        thread_current =
-        thread_oldest  = h;
-    }
-    h->i_cpb_delay_pir_offset = h->i_cpb_delay_pir_offset_next;
-
-    /* no data out */
-    *pi_nal = 0;
-    *pp_nal = NULL;
-
-    /* ------------------- Setup new frame from picture -------------------- */
-    if( pic_in != NULL )
-    {
-        if( h->lookahead->b_exit_thread )
-        {
-            x264_log( h, X264_LOG_ERROR, "lookahead thread is already stopped\n" );
-            return -1;
-        }
-
-        /* 1: Copy the picture to a frame and move it to a buffer */
-        x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
-        if( !fenc )
-            return -1;
-
-        if( x264_frame_copy_picture( h, fenc, pic_in ) < 0 )
-            return -1;
-
-        if( h->param.i_width != 16 * h->mb.i_mb_width ||
-            h->param.i_height != 16 * h->mb.i_mb_height )
-            x264_frame_expand_border_mod16( h, fenc );
-
-        fenc->i_frame = h->frames.i_input++;
-
-        if( fenc->i_frame == 0 )
-            h->frames.i_first_pts = fenc->i_pts;
-        if( h->frames.i_bframe_delay && fenc->i_frame == h->frames.i_bframe_delay )
-            h->frames.i_bframe_delay_time = fenc->i_pts - h->frames.i_first_pts;
-
-        if( h->param.b_vfr_input && fenc->i_pts <= h->frames.i_largest_pts )
-            x264_log( h, X264_LOG_WARNING, "non-strictly-monotonic PTS\n" );
-
-        h->frames.i_second_largest_pts = h->frames.i_largest_pts;
-        h->frames.i_largest_pts = fenc->i_pts;
-
-        if( (fenc->i_pic_struct < PIC_STRUCT_AUTO) || (fenc->i_pic_struct > PIC_STRUCT_TRIPLE) )
-            fenc->i_pic_struct = PIC_STRUCT_AUTO;
-
-        if( fenc->i_pic_struct == PIC_STRUCT_AUTO )
-        {
-#if HAVE_INTERLACED
-            int b_interlaced = fenc->param ? fenc->param->b_interlaced : h->param.b_interlaced;
-#else
-            int b_interlaced = 0;
-#endif
-            if( b_interlaced )
-            {
-                int b_tff = fenc->param ? fenc->param->b_tff : h->param.b_tff;
-                fenc->i_pic_struct = b_tff ? PIC_STRUCT_TOP_BOTTOM : PIC_STRUCT_BOTTOM_TOP;
-            }
-            else
-                fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE;
-        }
-
-        if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
-        {
-            if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) )
-                return -1;
-        }
-        else
-            x264_stack_align( x264_adaptive_quant_frame, h, fenc, pic_in->prop.quant_offsets );
-
-        if( pic_in->prop.quant_offsets_free )
-            pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets );
-
-        if( h->frames.b_have_lowres )
-            x264_frame_init_lowres( h, fenc );
-
-        /* 2: Place the frame into the queue for its slice type decision */
-        x264_lookahead_put_frame( h, fenc );
-
-        if( h->frames.i_input <= h->frames.i_delay + 1 - h->i_thread_frames )
-        {
-            /* Nothing yet to encode, waiting for filling of buffers */
-            pic_out->i_type = X264_TYPE_AUTO;
-            return 0;
-        }
-    }
-    else
-    {
-        /* signal kills for lookahead thread */
-        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
-        h->lookahead->b_exit_thread = 1;
-        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
-        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-    }
-
-    h->i_frame++;
-    /* 3: The picture is analyzed in the lookahead */
-    if( !h->frames.current[0] )
-        x264_lookahead_get_frames( h );
-
-    if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
-        return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
-
-    /* ------------------- Get frame to be encoded ------------------------- */
-    /* 4: get picture to encode */
-    h->fenc = x264_frame_shift( h->frames.current );
-
-    /* If applicable, wait for previous frame reconstruction to finish */
-    if( h->param.b_sliced_threads )
-        if( x264_threadpool_wait_all( h ) < 0 )
-            return -1;
-
-    if( h->i_frame == 0 )
-        h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
-    if( h->reconfig )
-    {
-        x264_encoder_reconfig_apply( h, &h->reconfig_h->param );
-        h->reconfig = 0;
-    }
-    if( h->fenc->param )
-    {
-        x264_encoder_reconfig_apply( h, h->fenc->param );
-        if( h->fenc->param->param_free )
-        {
-            h->fenc->param->param_free( h->fenc->param );
-            h->fenc->param = NULL;
-        }
-    }
-    x264_ratecontrol_zone_init( h );
-
-    // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
-    if( x264_reference_update( h ) )
-        return -1;
-    h->fdec->i_lines_completed = -1;
-
-    if( !IS_X264_TYPE_I( h->fenc->i_type ) )
-    {
-        int valid_refs_left = 0;
-        for( int i = 0; h->frames.reference[i]; i++ )
-            if( !h->frames.reference[i]->b_corrupt )
-                valid_refs_left++;
-        /* No valid reference frames left: force an IDR. */
-        if( !valid_refs_left )
-        {
-            h->fenc->b_keyframe = 1;
-            h->fenc->i_type = X264_TYPE_IDR;
-        }
-    }
-
-    if( h->fenc->b_keyframe )
-    {
-        h->frames.i_last_keyframe = h->fenc->i_frame;
-        if( h->fenc->i_type == X264_TYPE_IDR )
-        {
-            h->i_frame_num = 0;
-            h->frames.i_last_idr = h->fenc->i_frame;
-        }
-    }
-    h->sh.i_mmco_command_count =
-    h->sh.i_mmco_remove_from_end = 0;
-    h->b_ref_reorder[0] =
-    h->b_ref_reorder[1] = 0;
-    h->fdec->i_poc =
-    h->fenc->i_poc = 2 * ( h->fenc->i_frame - X264_MAX( h->frames.i_last_idr, 0 ) );
-
-    /* ------------------- Setup frame context ----------------------------- */
-    /* 5: Init data dependent of frame type */
-    if( h->fenc->i_type == X264_TYPE_IDR )
-    {
-        /* reset ref pictures */
-        i_nal_type    = NAL_SLICE_IDR;
-        i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
-        h->sh.i_type = SLICE_TYPE_I;
-        x264_reference_reset( h );
-        h->frames.i_poc_last_open_gop = -1;
-    }
-    else if( h->fenc->i_type == X264_TYPE_I )
-    {
-        i_nal_type    = NAL_SLICE;
-        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
-        h->sh.i_type = SLICE_TYPE_I;
-        x264_reference_hierarchy_reset( h );
-        if( h->param.b_open_gop )
-            h->frames.i_poc_last_open_gop = h->fenc->b_keyframe ? h->fenc->i_poc : -1;
-    }
-    else if( h->fenc->i_type == X264_TYPE_P )
-    {
-        i_nal_type    = NAL_SLICE;
-        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
-        h->sh.i_type = SLICE_TYPE_P;
-        x264_reference_hierarchy_reset( h );
-        h->frames.i_poc_last_open_gop = -1;
-    }
-    else if( h->fenc->i_type == X264_TYPE_BREF )
-    {
-        i_nal_type    = NAL_SLICE;
-        i_nal_ref_idc = h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT ? NAL_PRIORITY_LOW : NAL_PRIORITY_HIGH;
-        h->sh.i_type = SLICE_TYPE_B;
-        x264_reference_hierarchy_reset( h );
-    }
-    else    /* B frame */
-    {
-        i_nal_type    = NAL_SLICE;
-        i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE;
-        h->sh.i_type = SLICE_TYPE_B;
-    }
-
-    h->fdec->i_type = h->fenc->i_type;
-    h->fdec->i_frame = h->fenc->i_frame;
-    h->fenc->b_kept_as_ref =
-    h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1;
-
-    h->fdec->mb_info = h->fenc->mb_info;
-    h->fdec->mb_info_free = h->fenc->mb_info_free;
-    h->fenc->mb_info = NULL;
-    h->fenc->mb_info_free = NULL;
-
-    h->fdec->i_pts = h->fenc->i_pts;
-    if( h->frames.i_bframe_delay )
-    {
-        int64_t *prev_reordered_pts = thread_current->frames.i_prev_reordered_pts;
-        h->fdec->i_dts = h->i_frame > h->frames.i_bframe_delay
-                       ? prev_reordered_pts[ (h->i_frame - h->frames.i_bframe_delay) % h->frames.i_bframe_delay ]
-                       : h->fenc->i_reordered_pts - h->frames.i_bframe_delay_time;
-        prev_reordered_pts[ h->i_frame % h->frames.i_bframe_delay ] = h->fenc->i_reordered_pts;
-    }
-    else
-        h->fdec->i_dts = h->fenc->i_reordered_pts;
-    if( h->fenc->i_type == X264_TYPE_IDR )
-        h->i_last_idr_pts = h->fdec->i_pts;
-
-    /* ------------------- Init                ----------------------------- */
-    /* build ref list 0/1 */
-    x264_reference_build_list( h, h->fdec->i_poc );
-
-    /* ---------------------- Write the bitstream -------------------------- */
-    /* Init bitstream context */
-    if( h->param.b_sliced_threads )
-    {
-        for( int i = 0; i < h->param.i_threads; i++ )
-        {
-            bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
-            h->thread[i]->out.i_nal = 0;
-        }
-    }
-    else
-    {
-        bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
-        h->out.i_nal = 0;
-    }
-
-    if( h->param.b_aud )
-    {
-        int pic_type;
-
-        if( h->sh.i_type == SLICE_TYPE_I )
-            pic_type = 0;
-        else if( h->sh.i_type == SLICE_TYPE_P )
-            pic_type = 1;
-        else if( h->sh.i_type == SLICE_TYPE_B )
-            pic_type = 2;
-        else
-            pic_type = 7;
-
-        x264_nal_start( h, NAL_AUD, NAL_PRIORITY_DISPOSABLE );
-        bs_write( &h->out.bs, 3, pic_type );
-        bs_rbsp_trailing( &h->out.bs );
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
-    }
-
-    h->i_nal_type = i_nal_type;
-    h->i_nal_ref_idc = i_nal_ref_idc;
-
-    if( h->param.b_intra_refresh )
-    {
-        if( IS_X264_TYPE_I( h->fenc->i_type ) )
-        {
-            h->fdec->i_frames_since_pir = 0;
-            h->b_queued_intra_refresh = 0;
-            /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
-             * the whole frame and counts as an intra refresh. */
-            h->fdec->f_pir_position = h->mb.i_mb_width;
-        }
-        else if( h->fenc->i_type == X264_TYPE_P )
-        {
-            int pocdiff = (h->fdec->i_poc - h->fref[0][0]->i_poc)/2;
-            float increment = X264_MAX( ((float)h->mb.i_mb_width-1) / h->param.i_keyint_max, 1 );
-            h->fdec->f_pir_position = h->fref[0][0]->f_pir_position;
-            h->fdec->i_frames_since_pir = h->fref[0][0]->i_frames_since_pir + pocdiff;
-            if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max ||
-                (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->mb.i_mb_width) )
-            {
-                h->fdec->f_pir_position = 0;
-                h->fdec->i_frames_since_pir = 0;
-                h->b_queued_intra_refresh = 0;
-                h->fenc->b_keyframe = 1;
-            }
-            h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
-            h->fdec->f_pir_position += increment * pocdiff;
-            h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
-            /* If our intra refresh has reached the right side of the frame, we're done. */
-            if( h->fdec->i_pir_end_col >= h->mb.i_mb_width - 1 )
-            {
-                h->fdec->f_pir_position = h->mb.i_mb_width;
-                h->fdec->i_pir_end_col = h->mb.i_mb_width - 1;
-            }
-        }
-    }
-
-    if( h->fenc->b_keyframe )
-    {
-        /* Write SPS and PPS */
-        if( h->param.b_repeat_headers )
-        {
-            /* generate sequence parameters */
-            x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
-            x264_sps_write( &h->out.bs, h->sps );
-            if( x264_nal_end( h ) )
-                return -1;
-            /* Pad AUD/SPS to 256 bytes like Panasonic */
-            if( h->param.i_avcintra_class )
-                h->out.nal[h->out.i_nal-1].i_padding = 256 - bs_pos( &h->out.bs ) / 8 - 2*NALU_OVERHEAD;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + h->out.nal[h->out.i_nal-1].i_padding + NALU_OVERHEAD;
-
-            /* generate picture parameters */
-            x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-            x264_pps_write( &h->out.bs, h->sps, h->pps );
-            if( x264_nal_end( h ) )
-                return -1;
-            if( h->param.i_avcintra_class )
-                h->out.nal[h->out.i_nal-1].i_padding = 256 - h->out.nal[h->out.i_nal-1].i_payload - NALU_OVERHEAD;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + h->out.nal[h->out.i_nal-1].i_padding + NALU_OVERHEAD;
-        }
-
-        /* when frame threading is used, buffering period sei is written in x264_encoder_frame_end */
-        if( h->i_thread_frames == 1 && h->sps->vui.b_nal_hrd_parameters_present )
-        {
-            x264_hrd_fullness( h );
-            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            x264_sei_buffering_period_write( h, &h->out.bs );
-            if( x264_nal_end( h ) )
-               return -1;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-        }
-    }
-
-    /* write extra sei */
-    for( int i = 0; i < h->fenc->extra_sei.num_payloads; i++ )
-    {
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        x264_sei_write( &h->out.bs, h->fenc->extra_sei.payloads[i].payload, h->fenc->extra_sei.payloads[i].payload_size,
-                        h->fenc->extra_sei.payloads[i].payload_type );
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-        if( h->fenc->extra_sei.sei_free )
-        {
-            h->fenc->extra_sei.sei_free( h->fenc->extra_sei.payloads[i].payload );
-            h->fenc->extra_sei.payloads[i].payload = NULL;
-        }
-    }
-
-    if( h->fenc->extra_sei.sei_free )
-    {
-        h->fenc->extra_sei.sei_free( h->fenc->extra_sei.payloads );
-        h->fenc->extra_sei.payloads = NULL;
-        h->fenc->extra_sei.sei_free = NULL;
-    }
-
-    if( h->fenc->b_keyframe )
-    {
-        /* Avid's decoder strictly wants two SEIs for AVC-Intra so we can't insert the x264 SEI */
-        if( h->param.b_repeat_headers && h->fenc->i_frame == 0 && !h->param.i_avcintra_class )
-        {
-            /* identify ourself */
-            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            if( x264_sei_version_write( h, &h->out.bs ) )
-                return -1;
-            if( x264_nal_end( h ) )
-                return -1;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-        }
-
-        if( h->fenc->i_type != X264_TYPE_IDR )
-        {
-            int time_to_recovery = h->param.b_open_gop ? 0 : X264_MIN( h->mb.i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe - 1;
-            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
-            if( x264_nal_end( h ) )
-                return -1;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-        }
-    }
-
-    if( h->param.i_frame_packing >= 0 && (h->fenc->b_keyframe || h->param.i_frame_packing == 5) )
-    {
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        x264_sei_frame_packing_write( h, &h->out.bs );
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-    }
-
-    /* generate sei pic timing */
-    if( h->sps->vui.b_pic_struct_present || h->sps->vui.b_nal_hrd_parameters_present )
-    {
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        x264_sei_pic_timing_write( h, &h->out.bs );
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-    }
-
-    /* As required by Blu-ray. */
-    if( !IS_X264_TYPE_B( h->fenc->i_type ) && h->b_sh_backup )
-    {
-        h->b_sh_backup = 0;
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        x264_sei_dec_ref_pic_marking_write( h, &h->out.bs );
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-    }
-
-    if( h->fenc->b_keyframe && h->param.b_intra_refresh )
-        h->i_cpb_delay_pir_offset_next = h->fenc->i_cpb_delay;
-
-    /* Filler space: 10 or 18 SEIs' worth of space, depending on resolution */
-    if( h->param.i_avcintra_class )
-    {
-        /* Write an empty filler NAL to mimic the AUD in the P2 format*/
-        x264_nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE );
-        x264_filler_write( h, &h->out.bs, 0 );
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
-
-        /* All lengths are magic lengths that decoders expect to see */
-        /* "UMID" SEI */
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        if( x264_sei_avcintra_umid_write( h, &h->out.bs ) < 0 )
-            return -1;
-        if( x264_nal_end( h ) )
-            return -1;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-
-        int unpadded_len;
-        int total_len;
-        if( h->param.i_height == 1080 )
-        {
-            unpadded_len = 5780;
-            total_len = 17*512;
-        }
-        else
-        {
-            unpadded_len = 2900;
-            total_len = 9*512;
-        }
-        /* "VANC" SEI */
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        if( x264_sei_avcintra_vanc_write( h, &h->out.bs, unpadded_len ) < 0 )
-            return -1;
-        if( x264_nal_end( h ) )
-            return -1;
-
-        h->out.nal[h->out.i_nal-1].i_padding = total_len - h->out.nal[h->out.i_nal-1].i_payload - SEI_OVERHEAD;
-        overhead += h->out.nal[h->out.i_nal-1].i_payload + h->out.nal[h->out.i_nal-1].i_padding + SEI_OVERHEAD;
-    }
-
-    /* Init the rate control */
-    /* FIXME: Include slice header bit cost. */
-    x264_ratecontrol_start( h, h->fenc->i_qpplus1, overhead*8 );
-    i_global_qp = x264_ratecontrol_qp( h );
-
-    pic_out->i_qpplus1 =
-    h->fdec->i_qpplus1 = i_global_qp + 1;
-
-    if( h->param.rc.b_stat_read && h->sh.i_type != SLICE_TYPE_I )
-    {
-        x264_reference_build_list_optimal( h );
-        x264_reference_check_reorder( h );
-    }
-
-    if( h->i_ref[0] )
-        h->fdec->i_poc_l0ref0 = h->fref[0][0]->i_poc;
-
-    /* ------------------------ Create slice header  ----------------------- */
-    x264_slice_init( h, i_nal_type, i_global_qp );
-
-    /*------------------------- Weights -------------------------------------*/
-    if( h->sh.i_type == SLICE_TYPE_B )
-        x264_macroblock_bipred_init( h );
-
-    x264_weighted_pred_init( h );
-
-    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
-        h->i_frame_num++;
-
-    /* Write frame */
-    h->i_threadslice_start = 0;
-    h->i_threadslice_end = h->mb.i_mb_height;
-    if( h->i_thread_frames > 1 )
-    {
-        x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h );
-        h->b_thread_active = 1;
-    }
-    else if( h->param.b_sliced_threads )
-    {
-        if( x264_threaded_slices_write( h ) )
-            return -1;
-    }
-    else
-        if( (intptr_t)x264_slices_write( h ) )
-            return -1;
-
-    return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
-}
-
-/****************************************************************************
- * x264_encoder_frame_end:
- * Completes the frame held by context h: waits for its worker if one is
- * still active, encapsulates the NALs, fills in pic_out, runs the
- * end-of-frame rate control step, appends any requested filler, hands the
- * NAL array back through pp_nal/pi_nal and accumulates statistics.
- *
- * h:              context whose frame is being finished
- * thread_current: context that h->fenc is handed to via
- *                 x264_frame_push_unused and that receives a copy of the
- *                 updated stats for the next frame
- * pp_nal, pi_nal: outputs, set to h->out.nal and h->out.i_nal
- * pic_out:        output picture properties (type, timestamps, planes, ...)
- *
- * Returns the frame size in bytes (including filler), 0 when h has no NALs
- * to output, or -1 on error.
- ****************************************************************************/
-static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
-                                   x264_nal_t **pp_nal, int *pi_nal,
-                                   x264_picture_t *pic_out )
-{
-    char psz_message[80];
-
-    /* Without sliced threads, an active context means its slices were handed
-     * to the threadpool: wait for that job and propagate its failure. */
-    if( !h->param.b_sliced_threads && h->b_thread_active )
-    {
-        h->b_thread_active = 0;
-        if( (intptr_t)x264_threadpool_wait( h->threadpool, h ) )
-            return -1;
-    }
-    /* No NALs in this context: report an AUTO-typed picture and a size of 0. */
-    if( !h->out.i_nal )
-    {
-        pic_out->i_type = X264_TYPE_AUTO;
-        return 0;
-    }
-
-    x264_emms();
-
-    /* generate buffering period sei and insert it into place */
-    if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present )
-    {
-        x264_hrd_fullness( h );
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        x264_sei_buffering_period_write( h, &h->out.bs );
-        if( x264_nal_end( h ) )
-           return -1;
-        /* buffering period sei must follow AUD, SPS and PPS and precede all other SEIs */
-        int idx = 0;
-        while( h->out.nal[idx].i_type == NAL_AUD ||
-               h->out.nal[idx].i_type == NAL_SPS ||
-               h->out.nal[idx].i_type == NAL_PPS )
-            idx++;
-        /* The SEI was just appended as the last NAL: save it, shift
-         * nal[idx..i_nal-2] up by one slot and store the saved copy at idx. */
-        x264_nal_t nal_tmp = h->out.nal[h->out.i_nal-1];
-        memmove( &h->out.nal[idx+1], &h->out.nal[idx], (h->out.i_nal-idx-1)*sizeof(x264_nal_t) );
-        h->out.nal[idx] = nal_tmp;
-    }
-
-    /* NOTE(review): the second argument looks like the index of the first NAL
-     * to encapsulate (0 here, the last NAL in the filler loop below) -- confirm.
-     * A negative return is treated as an error. */
-    int frame_size = x264_encoder_encapsulate_nals( h, 0 );
-    if( frame_size < 0 )
-        return -1;
-
-    /* Set output picture properties */
-    pic_out->i_type = h->fenc->i_type;
-
-    pic_out->b_keyframe = h->fenc->b_keyframe;
-    pic_out->i_pic_struct = h->fenc->i_pic_struct;
-
-    pic_out->i_pts = h->fdec->i_pts;
-    pic_out->i_dts = h->fdec->i_dts;
-
-    if( pic_out->i_pts < pic_out->i_dts )
-        x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" );
-
-    pic_out->opaque = h->fenc->opaque;
-
-    pic_out->img.i_csp = h->fdec->i_csp;
-#if HIGH_BIT_DEPTH
-    pic_out->img.i_csp |= X264_CSP_HIGH_DEPTH;
-#endif
-    pic_out->img.i_plane = h->fdec->i_plane;
-    for( int i = 0; i < pic_out->img.i_plane; i++ )
-    {
-        /* presumably fdec strides are counted in pixels, hence the scaling
-         * by sizeof(pixel) for the exported stride -- TODO confirm */
-        pic_out->img.i_stride[i] = h->fdec->i_stride[i] * sizeof(pixel);
-        pic_out->img.plane[i] = (uint8_t*)h->fdec->plane[i];
-    }
-
-    x264_frame_push_unused( thread_current, h->fenc );
-
-    /* ---------------------- Update encoder state ------------------------- */
-
-    /* update rc */
-    /* Rate control is given the frame size in bits and may request filler
-     * through the out-parameter. */
-    int filler = 0;
-    if( x264_ratecontrol_end( h, frame_size * 8, &filler ) < 0 )
-        return -1;
-
-    pic_out->hrd_timing = h->fenc->hrd_timing;
-    pic_out->prop.f_crf_avg = h->fdec->f_crf_avg;
-
-    /* Filler in AVC-Intra mode is written as zero bytes to the last slice
-     * We don't know the size of the last slice until encapsulation so we add filler to the encapsulated NAL */
-    if( h->param.i_avcintra_class )
-    {
-        if( x264_check_encapsulated_buffer( h, h->thread[0], h->out.i_nal, frame_size, frame_size + filler ) < 0 )
-            return -1;
-
-        /* Append `filler` zero bytes directly after the last NAL's payload. */
-        x264_nal_t *nal = &h->out.nal[h->out.i_nal-1];
-        memset( nal->p_payload + nal->i_payload, 0, filler );
-        nal->i_payload += filler;
-        nal->i_padding = filler;
-        frame_size += filler;
-
-        /* Fix up the size header for mp4/etc */
-        if( !h->param.b_annexb )
-        {
-            /* Size doesn't include the size of the header we're writing now. */
-            /* chunk_size is stored big-endian in the first four payload bytes. */
-            uint8_t *nal_data = nal->p_payload;
-            int chunk_size = nal->i_payload - 4;
-            nal_data[0] = chunk_size >> 24;
-            nal_data[1] = chunk_size >> 16;
-            nal_data[2] = chunk_size >> 8;
-            nal_data[3] = chunk_size >> 0;
-        }
-    }
-    else
-    {
-        /* Emit filler NALs until the requested amount has been consumed;
-         * each iteration subtracts the encapsulated size of the NAL it wrote. */
-        while( filler > 0 )
-        {
-            int f, overhead = FILLER_OVERHEAD - h->param.b_annexb;
-            if( h->param.i_slice_max_size && filler > h->param.i_slice_max_size )
-            {
-                /* More filler remains than one size-limited NAL may hold.
-                 * If what would be left afterwards (next_size) is smaller
-                 * than `overhead`, shrink this NAL by the shortfall. */
-                int next_size = filler - h->param.i_slice_max_size;
-                int overflow = X264_MAX( overhead - next_size, 0 );
-                f = h->param.i_slice_max_size - overhead - overflow;
-            }
-            else
-                f = X264_MAX( 0, filler - overhead );
-
-            if( x264_bitstream_check_buffer_filler( h, f ) )
-                return -1;
-            x264_nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE );
-            x264_filler_write( h, &h->out.bs, f );
-            if( x264_nal_end( h ) )
-                return -1;
-            int total_size = x264_encoder_encapsulate_nals( h, h->out.i_nal-1 );
-            if( total_size < 0 )
-                return -1;
-            frame_size += total_size;
-            filler -= total_size;
-        }
-    }
-
-    /* End bitstream, set output  */
-    *pi_nal = h->out.i_nal;
-    *pp_nal = h->out.nal;
-
-    /* Reset the NAL count for this context; the array itself stays in
-     * h->out.nal, which the caller now points at. */
-    h->out.i_nal = 0;
-
-    x264_noise_reduction_update( h );
-
-    /* ---------------------- Compute/Print statistics --------------------- */
-    x264_thread_sync_stat( h, h->thread[0] );
-
-    /* Slice stat */
-    h->stat.i_frame_count[h->sh.i_type]++;
-    h->stat.i_frame_size[h->sh.i_type] += frame_size;
-    h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
-
-    /* Fold this frame's counters (h->stat.frame.*) into the running totals. */
-    for( int i = 0; i < X264_MBTYPE_MAX; i++ )
-        h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
-    for( int i = 0; i < X264_PARTTYPE_MAX; i++ )
-        h->stat.i_mb_partition[h->sh.i_type][i] += h->stat.frame.i_mb_partition[i];
-    for( int i = 0; i < 2; i++ )
-        h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i];
-    for( int i = 0; i < 6; i++ )
-        h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i];
-    for( int i = 0; i < 4; i++ )
-        for( int j = 0; j < 13; j++ )
-            h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];
-    if( h->sh.i_type != SLICE_TYPE_I )
-        for( int i_list = 0; i_list < 2; i_list++ )
-            for( int i = 0; i < X264_REF_MAX*2; i++ )
-                h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
-    for( int i = 0; i < 3; i++ )
-        h->stat.i_mb_field[i] += h->stat.frame.i_mb_field[i];
-    if( h->sh.i_type == SLICE_TYPE_P && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
-    {
-        /* i_wpred[0] counts P-frames with a weight function on plane 0,
-         * i_wpred[1] those with one on plane 1 or plane 2. */
-        h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn;
-        h->stat.i_wpred[1] += !!h->sh.weight[0][1].weightfn || !!h->sh.weight[0][2].weightfn;
-    }
-    if( h->sh.i_type == SLICE_TYPE_B )
-    {
-        h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
-        if( h->mb.b_direct_auto_write )
-        {
-            //FIXME somewhat arbitrary time constants
-            /* Scale the accumulated scores by 9/10 once their sum exceeds
-             * the MB count, then add this frame's scores. */
-            if( h->stat.i_direct_score[0] + h->stat.i_direct_score[1] > h->mb.i_mb_count )
-                for( int i = 0; i < 2; i++ )
-                    h->stat.i_direct_score[i] = h->stat.i_direct_score[i] * 9/10;
-            for( int i = 0; i < 2; i++ )
-                h->stat.i_direct_score[i] += h->stat.frame.i_direct_score[i];
-        }
-    }
-    else
-        h->stat.i_consecutive_bframes[h->fenc->i_bframes]++;
-
-    /* psz_message collects the optional PSNR/SSIM suffix of the debug log line. */
-    psz_message[0] = '\0';
-    double dur = h->fenc->f_duration;
-    h->stat.f_frame_duration[h->sh.i_type] += dur;
-    if( h->param.analyse.b_psnr )
-    {
-        int64_t ssd[3] =
-        {
-            h->stat.frame.i_ssd[0],
-            h->stat.frame.i_ssd[1],
-            h->stat.frame.i_ssd[2],
-        };
-        int luma_size = h->param.i_width * h->param.i_height;
-        int chroma_size = CHROMA_SIZE( luma_size );
-        pic_out->prop.f_psnr[0] = x264_psnr( ssd[0], luma_size );
-        pic_out->prop.f_psnr[1] = x264_psnr( ssd[1], chroma_size );
-        pic_out->prop.f_psnr[2] = x264_psnr( ssd[2], chroma_size );
-        pic_out->prop.f_psnr_avg = x264_psnr( ssd[0] + ssd[1] + ssd[2], luma_size + chroma_size*2 );
-
-        /* Running sums are weighted by the frame duration. */
-        h->stat.f_ssd_global[h->sh.i_type]   += dur * (ssd[0] + ssd[1] + ssd[2]);
-        h->stat.f_psnr_average[h->sh.i_type] += dur * pic_out->prop.f_psnr_avg;
-        h->stat.f_psnr_mean_y[h->sh.i_type]  += dur * pic_out->prop.f_psnr[0];
-        h->stat.f_psnr_mean_u[h->sh.i_type]  += dur * pic_out->prop.f_psnr[1];
-        h->stat.f_psnr_mean_v[h->sh.i_type]  += dur * pic_out->prop.f_psnr[2];
-
-        snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f", pic_out->prop.f_psnr[0],
-                                                                    pic_out->prop.f_psnr[1],
-                                                                    pic_out->prop.f_psnr[2] );
-    }
-
-    if( h->param.analyse.b_ssim )
-    {
-        pic_out->prop.f_ssim = h->stat.frame.f_ssim / h->stat.frame.i_ssim_cnt;
-        h->stat.f_ssim_mean_y[h->sh.i_type] += pic_out->prop.f_ssim * dur;
-        /* Append after whatever the PSNR branch wrote, within the 80-byte buffer. */
-        int msg_len = strlen(psz_message);
-        snprintf( psz_message + msg_len, 80 - msg_len, " SSIM Y:%.5f", pic_out->prop.f_ssim );
-    }
-    /* Force termination in the last byte regardless of what snprintf did. */
-    psz_message[79] = '\0';
-
-    x264_log( h, X264_LOG_DEBUG,
-              "frame=%4d QP=%.2f NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
-              h->i_frame,
-              h->fdec->f_qp_avg_aq,
-              h->i_nal_ref_idc,
-              h->sh.i_type == SLICE_TYPE_I ? 'I' : (h->sh.i_type == SLICE_TYPE_P ? 'P' : 'B' ),
-              h->fdec->i_poc,
-              h->stat.frame.i_mb_count_i,
-              h->stat.frame.i_mb_count_p,
-              h->stat.frame.i_mb_count_skip,
-              frame_size,
-              psz_message );
-
-    // keep stats all in one place
-    x264_thread_sync_stat( h->thread[0], h );
-    // for the use of the next frame
-    x264_thread_sync_stat( thread_current, h );
-
-#ifdef DEBUG_MB_TYPE
-{
-    static const char mb_chars[] = { 'i', 'i', 'I', 'C', 'P', '8', 'S',
-        'D', '<', 'X', 'B', 'X', '>', 'B', 'B', 'B', 'B', '8', 'S' };
-    for( int mb_xy = 0; mb_xy < h->mb.i_mb_width * h->mb.i_mb_height; mb_xy++ )
-    {
-        if( h->mb.type[mb_xy] < X264_MBTYPE_MAX && h->mb.type[mb_xy] >= 0 )
-            fprintf( stderr, "%c ", mb_chars[ h->mb.type[mb_xy] ] );
-        else
-            fprintf( stderr, "? " );
-
-        if( (mb_xy+1) % h->mb.i_mb_width == 0 )
-            fprintf( stderr, "\n" );
-    }
-}
-#endif
-
-    /* Remove duplicates, must be done near the end as breaks h->fref0 array
-     * by freeing some of its pointers. */
-    for( int i = 0; i < h->i_ref[0]; i++ )
-        if( h->fref[0][i] && h->fref[0][i]->b_duplicate )
-        {
-            x264_frame_push_blank_unused( h, h->fref[0][i] );
-            h->fref[0][i] = 0;
-        }
-
-    if( h->param.psz_dump_yuv )
-        x264_frame_dump( h );
-    x264_emms();
-
-    return frame_size;
-}
-
-/* Format the intra macroblock usage summary into `intra`.
- *
- * i_mb_count:  per-MB-type counters, indexed by I_16x16, I_8x8, I_4x4, I_PCM
- * i_count:     divisor applied to every counter before printing
- * b_print_pcm: when non-zero, the label gains "..PCM" and a fourth value
- *              for I_PCM is appended
- * intra:       destination buffer; no size is passed, so the caller must
- *              provide one large enough for the whole string */
-static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_pcm, char *intra )
-{
-    const char *pcm_label = "";
-    if( b_print_pcm )
-        pcm_label = "..PCM";
-
-    double share_i16 = i_mb_count[I_16x16] / i_count;
-    double share_i8  = i_mb_count[I_8x8]   / i_count;
-    double share_i4  = i_mb_count[I_4x4]   / i_count;
-
-    /* sprintf returns the number of characters written, i.e. the offset at
-     * which the optional PCM field has to be appended. */
-    int written = sprintf( intra, "I16..4%s: %4.1f%% %4.1f%% %4.1f%%",
-                           pcm_label, share_i16, share_i8, share_i4 );
-
-    if( !b_print_pcm )
-        return;
-
-    sprintf( intra + written, " %4.1f%%", i_mb_count[I_PCM]  / i_count );
-}
-
-/****************************************************************************
- * x264_encoder_close:
- ****************************************************************************/
-void    x264_encoder_close  ( x264_t *h )
-{
-    int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height );
-    int64_t i_mb_count_size[2][7] = {{0}};
-    char buf[200];
-    int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM]
-                   || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
-                   || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
-
-    x264_lookahead_delete( h );
-
-#if HAVE_OPENCL
-    x264_opencl_lookahead_delete( h );
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-#endif
-
-    if( h->param.b_sliced_threads )
-        x264_threadpool_wait_all( h );
-    if( h->param.i_threads > 1 )
-        x264_threadpool_delete( h->threadpool );
-    if( h->param.i_lookahead_threads > 1 )
-        x264_threadpool_delete( h->lookaheadpool );
-    if( h->i_thread_frames > 1 )
-    {
-        for( int i = 0; i < h->i_thread_frames; i++ )
-            if( h->thread[i]->b_thread_active )
-            {
-                assert( h->thread[i]->fenc->i_reference_count == 1 );
-                x264_frame_delete( h->thread[i]->fenc );
-            }
-
-        x264_t *thread_prev = h->thread[h->i_thread_phase];
-        x264_thread_sync_ratecontrol( h, thread_prev, h );
-        x264_thread_sync_ratecontrol( thread_prev, thread_prev, h );
-        h->i_frame = thread_prev->i_frame + 1 - h->i_thread_frames;
-    }
-    h->i_frame++;
-
-    /* Slices used and PSNR */
-    for( int i = 0; i < 3; i++ )
-    {
-        static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_P, SLICE_TYPE_B };
-        int i_slice = slice_order[i];
-
-        if( h->stat.i_frame_count[i_slice] > 0 )
-        {
-            int i_count = h->stat.i_frame_count[i_slice];
-            double dur =  h->stat.f_frame_duration[i_slice];
-            if( h->param.analyse.b_psnr )
-            {
-                x264_log( h, X264_LOG_INFO,
-                          "frame %c:%-5d Avg QP:%5.2f  size:%6.0f  PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
-                          slice_type_to_char[i_slice],
-                          i_count,
-                          h->stat.f_frame_qp[i_slice] / i_count,
-                          (double)h->stat.i_frame_size[i_slice] / i_count,
-                          h->stat.f_psnr_mean_y[i_slice] / dur, h->stat.f_psnr_mean_u[i_slice] / dur, h->stat.f_psnr_mean_v[i_slice] / dur,
-                          h->stat.f_psnr_average[i_slice] / dur,
-                          x264_psnr( h->stat.f_ssd_global[i_slice], dur * i_yuv_size ) );
-            }
-            else
-            {
-                x264_log( h, X264_LOG_INFO,
-                          "frame %c:%-5d Avg QP:%5.2f  size:%6.0f\n",
-                          slice_type_to_char[i_slice],
-                          i_count,
-                          h->stat.f_frame_qp[i_slice] / i_count,
-                          (double)h->stat.i_frame_size[i_slice] / i_count );
-            }
-        }
-    }
-    if( h->param.i_bframe && h->stat.i_frame_count[SLICE_TYPE_B] )
-    {
-        char *p = buf;
-        int den = 0;
-        // weight by number of frames (including the I/P-frames) that are in a sequence of N B-frames
-        for( int i = 0; i <= h->param.i_bframe; i++ )
-            den += (i+1) * h->stat.i_consecutive_bframes[i];
-        for( int i = 0; i <= h->param.i_bframe; i++ )
-            p += sprintf( p, " %4.1f%%", 100. * (i+1) * h->stat.i_consecutive_bframes[i] / den );
-        x264_log( h, X264_LOG_INFO, "consecutive B-frames:%s\n", buf );
-    }
-
-    for( int i_type = 0; i_type < 2; i_type++ )
-        for( int i = 0; i < X264_PARTTYPE_MAX; i++ )
-        {
-            if( i == D_DIRECT_8x8 ) continue; /* direct is counted as its own type */
-            i_mb_count_size[i_type][x264_mb_partition_pixel_table[i]] += h->stat.i_mb_partition[i_type][i];
-        }
-
-    /* MB types used */
-    if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 )
-    {
-        int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I];
-        double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
-        x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
-        x264_log( h, X264_LOG_INFO, "mb I  %s\n", buf );
-    }
-    if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
-    {
-        int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P];
-        double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
-        int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P];
-        x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
-        x264_log( h, X264_LOG_INFO,
-                  "mb P  %s  P16..4: %4.1f%% %4.1f%% %4.1f%% %4.1f%% %4.1f%%    skip:%4.1f%%\n",
-                  buf,
-                  i_mb_size[PIXEL_16x16] / (i_count*4),
-                  (i_mb_size[PIXEL_16x8] + i_mb_size[PIXEL_8x16]) / (i_count*4),
-                  i_mb_size[PIXEL_8x8] / (i_count*4),
-                  (i_mb_size[PIXEL_8x4] + i_mb_size[PIXEL_4x8]) / (i_count*4),
-                  i_mb_size[PIXEL_4x4] / (i_count*4),
-                  i_mb_count[P_SKIP] / i_count );
-    }
-    if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
-    {
-        int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B];
-        double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
-        double i_mb_list_count;
-        int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B];
-        int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
-        x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
-        for( int i = 0; i < X264_PARTTYPE_MAX; i++ )
-            for( int j = 0; j < 2; j++ )
-            {
-                int l0 = x264_mb_type_list_table[i][0][j];
-                int l1 = x264_mb_type_list_table[i][1][j];
-                if( l0 || l1 )
-                    list_count[l1+l0*l1] += h->stat.i_mb_count[SLICE_TYPE_B][i] * 2;
-            }
-        list_count[0] += h->stat.i_mb_partition[SLICE_TYPE_B][D_L0_8x8];
-        list_count[1] += h->stat.i_mb_partition[SLICE_TYPE_B][D_L1_8x8];
-        list_count[2] += h->stat.i_mb_partition[SLICE_TYPE_B][D_BI_8x8];
-        i_mb_count[B_DIRECT] += (h->stat.i_mb_partition[SLICE_TYPE_B][D_DIRECT_8x8]+2)/4;
-        i_mb_list_count = (list_count[0] + list_count[1] + list_count[2]) / 100.0;
-        sprintf( buf + strlen(buf), "  B16..8: %4.1f%% %4.1f%% %4.1f%%  direct:%4.1f%%  skip:%4.1f%%",
-                 i_mb_size[PIXEL_16x16] / (i_count*4),
-                 (i_mb_size[PIXEL_16x8] + i_mb_size[PIXEL_8x16]) / (i_count*4),
-                 i_mb_size[PIXEL_8x8] / (i_count*4),
-                 i_mb_count[B_DIRECT] / i_count,
-                 i_mb_count[B_SKIP]   / i_count );
-        if( i_mb_list_count != 0 )
-            sprintf( buf + strlen(buf), "  L0:%4.1f%% L1:%4.1f%% BI:%4.1f%%",
-                     list_count[0] / i_mb_list_count,
-                     list_count[1] / i_mb_list_count,
-                     list_count[2] / i_mb_list_count );
-        x264_log( h, X264_LOG_INFO, "mb B  %s\n", buf );
-    }
-
-    x264_ratecontrol_summary( h );
-
-    if( h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
-    {
-#define SUM3(p) (p[SLICE_TYPE_I] + p[SLICE_TYPE_P] + p[SLICE_TYPE_B])
-#define SUM3b(p,o) (p[SLICE_TYPE_I][o] + p[SLICE_TYPE_P][o] + p[SLICE_TYPE_B][o])
-        int64_t i_i8x8 = SUM3b( h->stat.i_mb_count, I_8x8 );
-        int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 )
-                                 + SUM3b( h->stat.i_mb_count, I_16x16 );
-        int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM);
-        int64_t i_skip = SUM3b( h->stat.i_mb_count, P_SKIP )
-                       + SUM3b( h->stat.i_mb_count, B_SKIP );
-        const int i_count = h->stat.i_frame_count[SLICE_TYPE_I] +
-                            h->stat.i_frame_count[SLICE_TYPE_P] +
-                            h->stat.i_frame_count[SLICE_TYPE_B];
-        int64_t i_mb_count = (int64_t)i_count * h->mb.i_mb_count;
-        int64_t i_inter = i_mb_count - i_skip - i_intra;
-        const double duration = h->stat.f_frame_duration[SLICE_TYPE_I] +
-                                h->stat.f_frame_duration[SLICE_TYPE_P] +
-                                h->stat.f_frame_duration[SLICE_TYPE_B];
-        float f_bitrate = SUM3(h->stat.i_frame_size) / duration / 125;
-
-        if( PARAM_INTERLACED )
-        {
-            char *fieldstats = buf;
-            fieldstats[0] = 0;
-            if( i_inter )
-                fieldstats += sprintf( fieldstats, " inter:%.1f%%", h->stat.i_mb_field[1] * 100.0 / i_inter );
-            if( i_skip )
-                fieldstats += sprintf( fieldstats, " skip:%.1f%%", h->stat.i_mb_field[2] * 100.0 / i_skip );
-            x264_log( h, X264_LOG_INFO, "field mbs: intra: %.1f%%%s\n",
-                      h->stat.i_mb_field[0] * 100.0 / i_intra, buf );
-        }
-
-        if( h->pps->b_transform_8x8_mode )
-        {
-            buf[0] = 0;
-            if( h->stat.i_mb_count_8x8dct[0] )
-                sprintf( buf, " inter:%.1f%%", 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
-            x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
-        }
-
-        if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ||
-            (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1]))
-            && h->stat.i_frame_count[SLICE_TYPE_B] )
-        {
-            x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%% temporal:%.1f%%\n",
-                      h->stat.i_direct_frames[1] * 100. / h->stat.i_frame_count[SLICE_TYPE_B],
-                      h->stat.i_direct_frames[0] * 100. / h->stat.i_frame_count[SLICE_TYPE_B] );
-        }
-
-        buf[0] = 0;
-        int csize = CHROMA444 ? 4 : 1;
-        if( i_mb_count != i_all_intra )
-            sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%",
-                     h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
-                     h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)*csize),
-                     h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)*csize) );
-        x264_log( h, X264_LOG_INFO, "coded y,%s,%s intra: %.1f%% %.1f%% %.1f%%%s\n",
-                  CHROMA444?"u":"uvDC", CHROMA444?"v":"uvAC",
-                  h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4),
-                  h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra*csize),
-                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra*csize), buf );
-
-        int64_t fixed_pred_modes[4][9] = {{0}};
-        int64_t sum_pred_modes[4] = {0};
-        for( int i = 0; i <= I_PRED_16x16_DC_128; i++ )
-        {
-            fixed_pred_modes[0][x264_mb_pred_mode16x16_fix[i]] += h->stat.i_mb_pred_mode[0][i];
-            sum_pred_modes[0] += h->stat.i_mb_pred_mode[0][i];
-        }
-        if( sum_pred_modes[0] )
-            x264_log( h, X264_LOG_INFO, "i16 v,h,dc,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
-                      fixed_pred_modes[0][0] * 100.0 / sum_pred_modes[0],
-                      fixed_pred_modes[0][1] * 100.0 / sum_pred_modes[0],
-                      fixed_pred_modes[0][2] * 100.0 / sum_pred_modes[0],
-                      fixed_pred_modes[0][3] * 100.0 / sum_pred_modes[0] );
-        for( int i = 1; i <= 2; i++ )
-        {
-            for( int j = 0; j <= I_PRED_8x8_DC_128; j++ )
-            {
-                fixed_pred_modes[i][x264_mb_pred_mode4x4_fix(j)] += h->stat.i_mb_pred_mode[i][j];
-                sum_pred_modes[i] += h->stat.i_mb_pred_mode[i][j];
-            }
-            if( sum_pred_modes[i] )
-                x264_log( h, X264_LOG_INFO, "i%d v,h,dc,ddl,ddr,vr,hd,vl,hu: %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", (3-i)*4,
-                          fixed_pred_modes[i][0] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][1] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][2] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][3] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][4] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][5] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][6] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][7] * 100.0 / sum_pred_modes[i],
-                          fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] );
-        }
-        for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ )
-        {
-            fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += h->stat.i_mb_pred_mode[3][i];
-            sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i];
-        }
-        if( sum_pred_modes[3] && !CHROMA444 )
-            x264_log( h, X264_LOG_INFO, "i8c dc,h,v,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
-                      fixed_pred_modes[3][0] * 100.0 / sum_pred_modes[3],
-                      fixed_pred_modes[3][1] * 100.0 / sum_pred_modes[3],
-                      fixed_pred_modes[3][2] * 100.0 / sum_pred_modes[3],
-                      fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] );
-
-        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
-            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%% UV:%.1f%%\n",
-                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P],
-                      h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
-
-        for( int i_list = 0; i_list < 2; i_list++ )
-            for( int i_slice = 0; i_slice < 2; i_slice++ )
-            {
-                char *p = buf;
-                int64_t i_den = 0;
-                int i_max = 0;
-                for( int i = 0; i < X264_REF_MAX*2; i++ )
-                    if( h->stat.i_mb_count_ref[i_slice][i_list][i] )
-                    {
-                        i_den += h->stat.i_mb_count_ref[i_slice][i_list][i];
-                        i_max = i;
-                    }
-                if( i_max == 0 )
-                    continue;
-                for( int i = 0; i <= i_max; i++ )
-                    p += sprintf( p, " %4.1f%%", 100. * h->stat.i_mb_count_ref[i_slice][i_list][i] / i_den );
-                x264_log( h, X264_LOG_INFO, "ref %c L%d:%s\n", "PB"[i_slice], i_list, buf );
-            }
-
-        if( h->param.analyse.b_ssim )
-        {
-            float ssim = SUM3( h->stat.f_ssim_mean_y ) / duration;
-            x264_log( h, X264_LOG_INFO, "SSIM Mean Y:%.7f (%6.3fdb)\n", ssim, x264_ssim( ssim ) );
-        }
-        if( h->param.analyse.b_psnr )
-        {
-            x264_log( h, X264_LOG_INFO,
-                      "PSNR Mean Y:%6.3f U:%6.3f V:%6.3f Avg:%6.3f Global:%6.3f kb/s:%.2f\n",
-                      SUM3( h->stat.f_psnr_mean_y ) / duration,
-                      SUM3( h->stat.f_psnr_mean_u ) / duration,
-                      SUM3( h->stat.f_psnr_mean_v ) / duration,
-                      SUM3( h->stat.f_psnr_average ) / duration,
-                      x264_psnr( SUM3( h->stat.f_ssd_global ), duration * i_yuv_size ),
-                      f_bitrate );
-        }
-        else
-            x264_log( h, X264_LOG_INFO, "kb/s:%.2f\n", f_bitrate );
-    }
-
-    /* rc */
-    x264_ratecontrol_delete( h );
-
-    /* param */
-    if( h->param.rc.psz_stat_out )
-        free( h->param.rc.psz_stat_out );
-    if( h->param.rc.psz_stat_in )
-        free( h->param.rc.psz_stat_in );
-
-    x264_cqm_delete( h );
-    x264_free( h->nal_buffer );
-    x264_free( h->reconfig_h );
-    x264_analyse_free_costs( h );
-
-    if( h->i_thread_frames > 1 )
-        h = h->thread[h->i_thread_phase];
-
-    /* frames */
-    x264_frame_delete_list( h->frames.unused[0] );
-    x264_frame_delete_list( h->frames.unused[1] );
-    x264_frame_delete_list( h->frames.current );
-    x264_frame_delete_list( h->frames.blank_unused );
-
-    h = h->thread[0];
-
-    for( int i = 0; i < h->i_thread_frames; i++ )
-        if( h->thread[i]->b_thread_active )
-            for( int j = 0; j < h->thread[i]->i_ref[0]; j++ )
-                if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate )
-                    x264_frame_delete( h->thread[i]->fref[0][j] );
-
-    if( h->param.i_lookahead_threads > 1 )
-        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-            x264_free( h->lookahead_thread[i] );
-
-    for( int i = h->param.i_threads - 1; i >= 0; i-- )
-    {
-        x264_frame_t **frame;
-
-        if( !h->param.b_sliced_threads || i == 0 )
-        {
-            for( frame = h->thread[i]->frames.reference; *frame; frame++ )
-            {
-                assert( (*frame)->i_reference_count > 0 );
-                (*frame)->i_reference_count--;
-                if( (*frame)->i_reference_count == 0 )
-                    x264_frame_delete( *frame );
-            }
-            frame = &h->thread[i]->fdec;
-            if( *frame )
-            {
-                assert( (*frame)->i_reference_count > 0 );
-                (*frame)->i_reference_count--;
-                if( (*frame)->i_reference_count == 0 )
-                    x264_frame_delete( *frame );
-            }
-            x264_macroblock_cache_free( h->thread[i] );
-        }
-        x264_macroblock_thread_free( h->thread[i], 0 );
-        x264_free( h->thread[i]->out.p_bitstream );
-        x264_free( h->thread[i]->out.nal );
-        x264_pthread_mutex_destroy( &h->thread[i]->mutex );
-        x264_pthread_cond_destroy( &h->thread[i]->cv );
-        x264_free( h->thread[i] );
-    }
-#if HAVE_OPENCL
-    x264_opencl_close_library( ocl );
-#endif
-}
-
-int x264_encoder_delayed_frames( x264_t *h )
-{
-    int delayed_frames = 0;
-    if( h->i_thread_frames > 1 )
-    {
-        for( int i = 0; i < h->i_thread_frames; i++ )
-            delayed_frames += h->thread[i]->b_thread_active;
-        h = h->thread[h->i_thread_phase];
-    }
-    for( int i = 0; h->frames.current[i]; i++ )
-        delayed_frames++;
-    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
-    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
-    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
-    delayed_frames += h->lookahead->ifbuf.i_size + h->lookahead->next.i_size + h->lookahead->ofbuf.i_size;
-    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
-    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
-    return delayed_frames;
-}
-
-int x264_encoder_maximum_delayed_frames( x264_t *h )
-{
-    return h->frames.i_delay;
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/lookahead.c b/android/src/main/libenc/jni/libx264/encoder/lookahead.c
deleted file mode 100755
index 42e31bd..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/lookahead.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*****************************************************************************
- * lookahead.c: high-level lookahead functions
- *****************************************************************************
- * Copyright (C) 2010-2016 Avail Media and x264 project
- *
- * Authors: Michael Kazmier <mkazmier@availmedia.com>
- *          Alex Giladi <agiladi@availmedia.com>
- *          Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-/* LOOKAHEAD (threaded and non-threaded mode)
- *
- * Lookahead types:
- *     [1] Slice type / scene cut;
- *
- * In non-threaded mode, we run the existing slicetype decision code as it was.
- * In threaded mode, we run in a separate thread, that lives between the calls
- * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
- * the number of frames specified in rc_lookahead.  Recommended setting is
- * # of bframes + # of threads.
- */
-#include "common/common.h"
-#include "analyse.h"
-
-static void x264_lookahead_shift( x264_sync_frame_list_t *dst, x264_sync_frame_list_t *src, int count )
-{
-    int i = count;
-    while( i-- )
-    {
-        assert( dst->i_size < dst->i_max_size );
-        assert( src->i_size );
-        dst->list[ dst->i_size++ ] = x264_frame_shift( src->list );
-        src->i_size--;
-    }
-    if( count )
-    {
-        x264_pthread_cond_broadcast( &dst->cv_fill );
-        x264_pthread_cond_broadcast( &src->cv_empty );
-    }
-}
-
-static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
-{
-    if( h->lookahead->last_nonb )
-        x264_frame_push_unused( h, h->lookahead->last_nonb );
-    h->lookahead->last_nonb = new_nonb;
-    new_nonb->i_reference_count++;
-}
-
-#if HAVE_THREAD
-static void x264_lookahead_slicetype_decide( x264_t *h )
-{
-    x264_stack_align( x264_slicetype_decide, h );
-
-    x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
-    int shift_frames = h->lookahead->next.list[0]->i_bframes + 1;
-
-    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
-    while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size )
-        x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex );
-
-    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
-    x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames );
-    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
-
-    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
-    if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
-        x264_stack_align( x264_slicetype_analyse, h, shift_frames );
-
-    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
-}
-
-static void *x264_lookahead_thread( x264_t *h )
-{
-    while( !h->lookahead->b_exit_thread )
-    {
-        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
-        x264_pthread_mutex_lock( &h->lookahead->next.mutex );
-        int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
-        x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
-        x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
-        if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input )
-        {
-            while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
-                x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
-            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-        }
-        else
-        {
-            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-            x264_lookahead_slicetype_decide( h );
-        }
-    }   /* end of input frames */
-    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
-    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
-    x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
-    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
-    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-    while( h->lookahead->next.i_size )
-        x264_lookahead_slicetype_decide( h );
-    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
-    h->lookahead->b_thread_active = 0;
-    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill );
-    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
-    return NULL;
-}
-#endif
-
-int x264_lookahead_init( x264_t *h, int i_slicetype_length )
-{
-    x264_lookahead_t *look;
-    CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) );
-    for( int i = 0; i < h->param.i_threads; i++ )
-        h->thread[i]->lookahead = look;
-
-    look->i_last_keyframe = - h->param.i_keyint_max;
-    look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
-                               && !h->param.rc.b_stat_read;
-    look->i_slicetype_length = i_slicetype_length;
-
-    /* init frame lists */
-    if( x264_sync_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
-        x264_sync_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
-        x264_sync_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
-        goto fail;
-
-    if( !h->param.i_sync_lookahead )
-        return 0;
-
-    x264_t *look_h = h->thread[h->param.i_threads];
-    *look_h = *h;
-    if( x264_macroblock_cache_allocate( look_h ) )
-        goto fail;
-
-    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
-        goto fail;
-
-    if( x264_pthread_create( &look->thread_handle, NULL, (void*)x264_lookahead_thread, look_h ) )
-        goto fail;
-    look->b_thread_active = 1;
-
-    return 0;
-fail:
-    x264_free( look );
-    return -1;
-}
-
-void x264_lookahead_delete( x264_t *h )
-{
-    if( h->param.i_sync_lookahead )
-    {
-        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
-        h->lookahead->b_exit_thread = 1;
-        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
-        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
-        x264_pthread_join( h->lookahead->thread_handle, NULL );
-        x264_macroblock_cache_free( h->thread[h->param.i_threads] );
-        x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
-        x264_free( h->thread[h->param.i_threads] );
-    }
-    x264_sync_frame_list_delete( &h->lookahead->ifbuf );
-    x264_sync_frame_list_delete( &h->lookahead->next );
-    if( h->lookahead->last_nonb )
-        x264_frame_push_unused( h, h->lookahead->last_nonb );
-    x264_sync_frame_list_delete( &h->lookahead->ofbuf );
-    x264_free( h->lookahead );
-}
-
-void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
-{
-    if( h->param.i_sync_lookahead )
-        x264_sync_frame_list_push( &h->lookahead->ifbuf, frame );
-    else
-        x264_sync_frame_list_push( &h->lookahead->next, frame );
-}
-
-int x264_lookahead_is_empty( x264_t *h )
-{
-    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
-    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
-    int b_empty = !h->lookahead->next.i_size && !h->lookahead->ofbuf.i_size;
-    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
-    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
-    return b_empty;
-}
-
-static void x264_lookahead_encoder_shift( x264_t *h )
-{
-    if( !h->lookahead->ofbuf.i_size )
-        return;
-    int i_frames = h->lookahead->ofbuf.list[0]->i_bframes + 1;
-    while( i_frames-- )
-    {
-        x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) );
-        h->lookahead->ofbuf.i_size--;
-    }
-    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty );
-}
-
-void x264_lookahead_get_frames( x264_t *h )
-{
-    if( h->param.i_sync_lookahead )
-    {   /* We have a lookahead thread, so get frames from there */
-        x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
-        while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active )
-            x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
-        x264_lookahead_encoder_shift( h );
-        x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
-    }
-    else
-    {   /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */
-
-        if( h->frames.current[0] || !h->lookahead->next.i_size )
-            return;
-
-        x264_stack_align( x264_slicetype_decide, h );
-        x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
-        int shift_frames = h->lookahead->next.list[0]->i_bframes + 1;
-        x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames );
-
-        /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
-        if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
-            x264_stack_align( x264_slicetype_analyse, h, shift_frames );
-
-        x264_lookahead_encoder_shift( h );
-    }
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/macroblock.c b/android/src/main/libenc/jni/libx264/encoder/macroblock.c
deleted file mode 100755
index 81814f7..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/macroblock.c
+++ /dev/null
@@ -1,1399 +0,0 @@
-/*****************************************************************************
- * macroblock.c: macroblock encoding
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-
-/* These chroma DC functions don't have assembly versions and are only used here. */
-
-#define ZIG(i,y,x) level[i] = dct[x*2+y];
-static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
-{
-    ZIG(0,0,0)
-    ZIG(1,0,1)
-    ZIG(2,1,0)
-    ZIG(3,1,1)
-}
-#undef ZIG
-
-static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
-{
-    level[0] = dct[0];
-    level[1] = dct[2];
-    level[2] = dct[1];
-    level[3] = dct[4];
-    level[4] = dct[6];
-    level[5] = dct[3];
-    level[6] = dct[5];
-    level[7] = dct[7];
-}
-
-#define IDCT_DEQUANT_2X2_START \
-    int d0 = dct[0] + dct[1]; \
-    int d1 = dct[2] + dct[3]; \
-    int d2 = dct[0] - dct[1]; \
-    int d3 = dct[2] - dct[3]; \
-    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
-
-static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
-{
-    IDCT_DEQUANT_2X2_START
-    dct4x4[0][0] = (d0 + d1) * dmf >> 5;
-    dct4x4[1][0] = (d0 - d1) * dmf >> 5;
-    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
-    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
-}
-
-static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
-{
-    IDCT_DEQUANT_2X2_START
-    dct[0] = (d0 + d1) * dmf >> 5;
-    dct[1] = (d0 - d1) * dmf >> 5;
-    dct[2] = (d2 + d3) * dmf >> 5;
-    dct[3] = (d2 - d3) * dmf >> 5;
-}
-#undef IDCT_2X2_DEQUANT_START
-
-static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
-{
-    int d0 = dct4x4[0][0] + dct4x4[1][0];
-    int d1 = dct4x4[2][0] + dct4x4[3][0];
-    int d2 = dct4x4[0][0] - dct4x4[1][0];
-    int d3 = dct4x4[2][0] - dct4x4[3][0];
-    d[0] = d0 + d1;
-    d[2] = d2 + d3;
-    d[1] = d0 - d1;
-    d[3] = d2 - d3;
-    dct4x4[0][0] = 0;
-    dct4x4[1][0] = 0;
-    dct4x4[2][0] = 0;
-    dct4x4[3][0] = 0;
-}
-
-static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
-{
-    if( WORD_SIZE == 8 )
-    {
-        for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
-            if( M64( &v[i] ) )
-                return 1;
-    }
-    else
-    {
-        for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
-            if( M32( &v[i] ) )
-                return 1;
-    }
-    return 0;
-}
-
-/* All encoding functions must output the correct CBP and NNZ values.
- * The entropy coding functions will check CBP first, then NNZ, before
- * actually reading the DCT coefficients.  NNZ still must be correct even
- * if CBP is zero because of the use of NNZ values for context selection.
- * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
- * that is only needed in CAVLC, and will be calculated by CAVLC's residual
- * coding and stored as necessary. */
-
-/* This means that decimation can be done merely by adjusting the CBP and NNZ
- * rather than memsetting the coefficients. */
-
-static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
-{
-    pixel *p_src = h->mb.pic.p_fenc[p];
-    pixel *p_dst = h->mb.pic.p_fdec[p];
-
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
-
-    int nz, block_cbp = 0;
-    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
-    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
-    int i_mode = h->mb.i_intra16x16_pred_mode;
-
-    if( h->mb.b_lossless )
-        x264_predict_lossless_16x16( h, p, i_mode );
-    else
-        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
-
-    if( h->mb.b_lossless )
-    {
-        for( int i = 0; i < 16; i++ )
-        {
-            int oe = block_idx_xy_fenc[i];
-            int od = block_idx_xy_fdec[i];
-            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
-            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
-            block_cbp |= nz;
-        }
-        h->mb.i_cbp_luma |= block_cbp * 0xf;
-        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
-        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
-        return;
-    }
-
-    CLEAR_16x16_NNZ( p );
-
-    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
-
-    if( h->mb.b_noise_reduction )
-        for( int idx = 0; idx < 16; idx++ )
-            h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
-
-    for( int idx = 0; idx < 16; idx++ )
-    {
-        dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
-        dct4x4[idx][0] = 0;
-    }
-
-    if( h->mb.b_trellis )
-    {
-        for( int idx = 0; idx < 16; idx++ )
-            if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
-            {
-                block_cbp = 0xf;
-                h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
-                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
-                if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
-                h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
-            }
-    }
-    else
-    {
-        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-        {
-            nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
-            if( nz )
-            {
-                block_cbp = 0xf;
-                FOREACH_BIT( idx, i8x8*4, nz )
-                {
-                    h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
-                    h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
-                    if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
-                    h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
-                }
-            }
-        }
-    }
-
-    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
-    /* More useful with CAVLC, but still useful with CABAC. */
-    if( decimate_score < 6 )
-    {
-        CLEAR_16x16_NNZ( p );
-        block_cbp = 0;
-    }
-    else
-        h->mb.i_cbp_luma |= block_cbp;
-
-    h->dctf.dct4x4dc( dct_dc4x4 );
-    if( h->mb.b_trellis )
-        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
-    else
-        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
-
-    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
-    if( nz )
-    {
-        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
-
-        /* output samples to fdec */
-        h->dctf.idct4x4dc( dct_dc4x4 );
-        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inversed */
-        if( block_cbp )
-            for( int i = 0; i < 16; i++ )
-                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
-    }
-
-    /* put pixels to fdec */
-    if( block_cbp )
-        h->dctf.add16x16_idct( p_dst, dct4x4 );
-    else if( nz )
-        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
-}
-
-/* Round down coefficients losslessly in DC-only chroma blocks.
- * Unlike luma blocks, this can't be done with a lookup table or
- * other shortcut technique because of the interdependencies
- * between the coefficients due to the chroma DC transform. */
-static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
-{
-    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
-
-    /* If the QP is too high, there's no benefit to rounding optimization. */
-    if( dmf > 32*64 )
-        return 1;
-
-    if( chroma422 )
-        return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
-    else
-        return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
-}
-
-static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
-{
-    int nz, nz_dc;
-    int b_decimate = b_inter && h->mb.b_dct_decimate;
-    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
-    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
-    h->mb.i_cbp_chroma = 0;
-    h->nr_count[2] += h->mb.b_noise_reduction * 4;
-
-    M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
-    M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
-    M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
-    M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
-    if( chroma422 )
-    {
-        M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
-        M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
-        M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
-        M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
-    }
-
-    /* Early termination: check variance of chroma residual before encoding.
-     * Don't bother trying early termination at low QPs.
-     * Values are experimentally derived. */
-    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
-    {
-        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
-        int ssd[2];
-        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
-
-        int score  = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
-        if( score < thresh*4 )
-            score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
-        if( score < thresh*4 )
-        {
-            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
-
-            for( int ch = 0; ch < 2; ch++ )
-            {
-                if( ssd[ch] > thresh )
-                {
-                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
-                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];
-
-                    if( chroma422 )
-                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
-                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
-                    else
-                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
-
-                    if( h->mb.b_trellis )
-                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
-                    else
-                    {
-                        nz_dc = 0;
-                        for( int i = 0; i <= chroma422; i++ )
-                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
-                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
-                    }
-
-                    if( nz_dc )
-                    {
-                        if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
-                            continue;
-                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
-                        if( chroma422 )
-                        {
-                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
-                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
-                        }
-                        else
-                        {
-                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
-                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
-                        }
-
-                        for( int i = 0; i <= chroma422; i++ )
-                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
-                        h->mb.i_cbp_chroma = 1;
-                    }
-                }
-            }
-            return;
-        }
-    }
-
-    for( int ch = 0; ch < 2; ch++ )
-    {
-        pixel *p_src = h->mb.pic.p_fenc[1+ch];
-        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
-        int i_decimate_score = b_decimate ? 0 : 7;
-        int nz_ac = 0;
-
-        ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
-
-        if( h->mb.b_lossless )
-        {
-            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };
-
-            for( int i = 0; i < (chroma422?8:4); i++ )
-            {
-                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
-                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
-                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
-                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
-                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
-                h->mb.i_cbp_chroma |= nz;
-            }
-            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
-            continue;
-        }
-
-        for( int i = 0; i <= chroma422; i++ )
-            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
-
-        if( h->mb.b_noise_reduction )
-            for( int i = 0; i < (chroma422?8:4); i++ )
-                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-
-        if( chroma422 )
-            h->dctf.dct2x4dc( dct_dc, dct4x4 );
-        else
-            dct2x2dc( dct_dc, dct4x4 );
-
-        /* calculate dct coeffs */
-        for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
-        {
-            if( h->mb.b_trellis )
-            {
-                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
-                {
-                    if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
-                    {
-                        int idx = 16+ch*16+i8x8*8+i4x4;
-                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
-                        h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
-                        if( i_decimate_score < 7 )
-                            i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
-                        h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
-                        nz_ac = 1;
-                    }
-                }
-            }
-            else
-            {
-                nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
-                                            h->quant4_bias[CQM_4IC+b_inter][i_qp] );
-                nz_ac |= nz;
-
-                FOREACH_BIT( i4x4, 0, nz )
-                {
-                    int idx = 16+ch*16+i8x8*8+i4x4;
-
-                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
-                    h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
-                    if( i_decimate_score < 7 )
-                        i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
-                    h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
-                }
-            }
-        }
-
-        if( h->mb.b_trellis )
-            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
-        else
-        {
-            nz_dc = 0;
-            for( int i = 0; i <= chroma422; i++ )
-                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
-                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
-        }
-
-        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
-
-        if( i_decimate_score < 7 || !nz_ac )
-        {
-            /* Decimate the block */
-            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
-            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
-            if( chroma422 )
-            {
-                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
-                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
-            }
-
-            if( !nz_dc ) /* Whole block is empty */
-                continue;
-            if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
-                continue;
-            }
-            /* DC-only */
-            if( chroma422 )
-            {
-                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
-                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
-            }
-            else
-            {
-                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
-                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
-            }
-
-            for( int i = 0; i <= chroma422; i++ )
-                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
-        }
-        else
-        {
-            h->mb.i_cbp_chroma = 1;
-
-            if( nz_dc )
-            {
-                if( chroma422 )
-                {
-                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
-                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
-                }
-                else
-                {
-                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
-                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
-                }
-            }
-
-            for( int i = 0; i <= chroma422; i++ )
-                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
-        }
-    }
-
-    /* 0 = none, 1 = DC only, 2 = DC+AC */
-    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
-                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
-}
-
-void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
-{
-    if( CHROMA_FORMAT == CHROMA_420 )
-        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
-    else
-        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
-}
-
-static void x264_macroblock_encode_skip( x264_t *h )
-{
-    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
-    if( CHROMA_FORMAT >= CHROMA_422 )
-    {
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
-    }
-    h->mb.i_cbp_luma = 0;
-    h->mb.i_cbp_chroma = 0;
-    h->mb.cbp[h->mb.i_mb_xy] = 0;
-}
-
-/*****************************************************************************
- * Intra prediction for predictive lossless mode.
- *****************************************************************************/
-
-void x264_predict_lossless_chroma( x264_t *h, int i_mode )
-{
-    int height = 16 >> CHROMA_V_SHIFT;
-    if( i_mode == I_PRED_CHROMA_V )
-    {
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
-        memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
-        memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
-    }
-    else if( i_mode == I_PRED_CHROMA_H )
-    {
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
-        x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
-        x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
-        if( CHROMA_FORMAT == CHROMA_422 )
-        {
-            x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
-            x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
-        }
-    }
-    else
-    {
-        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
-        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
-    }
-}
-
-void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
-{
-    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
-    pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
-
-    if( i_mode == I_PRED_4x4_V )
-        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
-    else if( i_mode == I_PRED_4x4_H )
-        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
-    else
-        h->predict_4x4[i_mode]( p_dst );
-}
-
-void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
-{
-    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
-    pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
-
-    if( i_mode == I_PRED_8x8_V )
-        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
-    else if( i_mode == I_PRED_8x8_H )
-        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
-    else
-        h->predict_8x8[i_mode]( p_dst, edge );
-}
-
-void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
-{
-    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
-    if( i_mode == I_PRED_16x16_V )
-        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
-    else if( i_mode == I_PRED_16x16_H )
-        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
-    else
-        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
-}
-
-/*****************************************************************************
- * x264_macroblock_encode:
- *****************************************************************************/
-static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
-{
-    int i_qp = h->mb.i_qp;
-    int b_decimate = h->mb.b_dct_decimate;
-    int b_force_no_skip = 0;
-    int nz;
-    h->mb.i_cbp_luma = 0;
-    for( int p = 0; p < plane_count; p++ )
-        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
-
-    if( h->mb.i_type == I_PCM )
-    {
-        /* if PCM is chosen, we need to store reconstructed frame data */
-        for( int p = 0; p < plane_count; p++ )
-            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
-        if( chroma )
-        {
-            int height = 16 >> CHROMA_V_SHIFT;
-            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
-            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
-        }
-        return;
-    }
-
-    if( !h->mb.b_allow_skip )
-    {
-        b_force_no_skip = 1;
-        if( IS_SKIP(h->mb.i_type) )
-        {
-            if( h->mb.i_type == P_SKIP )
-                h->mb.i_type = P_L0;
-            else if( h->mb.i_type == B_SKIP )
-                h->mb.i_type = B_DIRECT;
-        }
-    }
-
-    if( h->mb.i_type == P_SKIP )
-    {
-        /* don't do pskip motion compensation if it was already done in macroblock_analyse */
-        if( !h->mb.b_skip_mc )
-        {
-            int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
-                                  h->mb.mv_min[0], h->mb.mv_max[0] );
-            int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
-                                  h->mb.mv_min[1], h->mb.mv_max[1] );
-
-            for( int p = 0; p < plane_count; p++ )
-                h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
-                               &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
-                               mvx, mvy, 16, 16, &h->sh.weight[0][p] );
-
-            if( chroma )
-            {
-                int v_shift = CHROMA_V_SHIFT;
-                int height = 16 >> v_shift;
-
-                /* Special case for mv0, which is (of course) very common in P-skip mode. */
-                if( mvx | mvy )
-                    h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                     h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                                     mvx, 2*mvy>>v_shift, 8, height );
-                else
-                    h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
-                                                         h->mb.pic.i_stride[1], height );
-
-                if( h->sh.weight[0][1].weightfn )
-                    h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
-                                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
-                                                       &h->sh.weight[0][1], height );
-                if( h->sh.weight[0][2].weightfn )
-                    h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                                       &h->sh.weight[0][2], height );
-            }
-        }
-
-        x264_macroblock_encode_skip( h );
-        return;
-    }
-    if( h->mb.i_type == B_SKIP )
-    {
-        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
-        if( !h->mb.b_skip_mc )
-            x264_mb_mc( h );
-        x264_macroblock_encode_skip( h );
-        return;
-    }
-
-    if( h->mb.i_type == I_16x16 )
-    {
-        h->mb.b_transform_8x8 = 0;
-
-        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-            x264_mb_encode_i16x16( h, p, i_qp );
-    }
-    else if( h->mb.i_type == I_8x8 )
-    {
-        h->mb.b_transform_8x8 = 1;
-        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
-        if( h->mb.i_skip_intra )
-        {
-            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
-            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
-            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
-            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
-            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
-            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
-            /* In RD mode, restore the now-overwritten DCT data. */
-            if( h->mb.i_skip_intra == 2 )
-                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
-        }
-        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-        {
-            for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0; i < 4; i++ )
-            {
-                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
-                x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
-            }
-        }
-    }
-    else if( h->mb.i_type == I_4x4 )
-    {
-        h->mb.b_transform_8x8 = 0;
-        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
-        if( h->mb.i_skip_intra )
-        {
-            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
-            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
-            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
-            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
-            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
-            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
-            /* In RD mode, restore the now-overwritten DCT data. */
-            if( h->mb.i_skip_intra == 2 )
-                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
-        }
-        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-        {
-            for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0; i < 16; i++ )
-            {
-                pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
-                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
-
-                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
-                    /* emulate missing topright samples */
-                    MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
-
-                x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
-            }
-        }
-    }
-    else    /* Inter MB */
-    {
-        int i_decimate_mb = 0;
-
-        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
-        if( !h->mb.b_skip_mc )
-            x264_mb_mc( h );
-
-        if( h->mb.b_lossless )
-        {
-            if( h->mb.b_transform_8x8 )
-                for( int p = 0; p < plane_count; p++ )
-                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                    {
-                        int x = i8x8&1;
-                        int y = i8x8>>1;
-                        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
-                                                                           h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
-                        STORE_8x8_NNZ( p, i8x8, nz );
-                        h->mb.i_cbp_luma |= nz << i8x8;
-                    }
-            else
-                for( int p = 0; p < plane_count; p++ )
-                    for( int i4x4 = 0; i4x4 < 16; i4x4++ )
-                    {
-                        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
-                                                 h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
-                                                 h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
-                        h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
-                        h->mb.i_cbp_luma |= nz << (i4x4>>2);
-                    }
-        }
-        else if( h->mb.b_transform_8x8 )
-        {
-            ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
-            b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
-
-            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-            {
-                int quant_cat = p ? CQM_8PC : CQM_8PY;
-                CLEAR_16x16_NNZ( p );
-                h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
-                h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
-
-                int plane_cbp = 0;
-                for( int idx = 0; idx < 4; idx++ )
-                {
-                    nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );
-
-                    if( nz )
-                    {
-                        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
-                        if( b_decimate )
-                        {
-                            int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
-                            i_decimate_mb += i_decimate_8x8;
-                            if( i_decimate_8x8 >= 4 )
-                                plane_cbp |= 1<<idx;
-                        }
-                        else
-                            plane_cbp |= 1<<idx;
-                    }
-                }
-
-                if( i_decimate_mb >= 6 || !b_decimate )
-                {
-                    h->mb.i_cbp_luma |= plane_cbp;
-                    FOREACH_BIT( idx, 0, plane_cbp )
-                    {
-                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[quant_cat], i_qp );
-                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
-                        STORE_8x8_NNZ( p, idx, 1 );
-                    }
-                }
-            }
-        }
-        else
-        {
-            ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
-            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-            {
-                int quant_cat = p ? CQM_4PC : CQM_4PY;
-                CLEAR_16x16_NNZ( p );
-                h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
-
-                if( h->mb.b_noise_reduction )
-                {
-                    h->nr_count[0+!!p*2] += 16;
-                    for( int idx = 0; idx < 16; idx++ )
-                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
-                }
-
-                int plane_cbp = 0;
-                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                {
-                    int i_decimate_8x8 = b_decimate ? 0 : 6;
-                    int nnz8x8 = 0;
-                    if( h->mb.b_trellis )
-                    {
-                        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
-                        {
-                            int idx = i8x8*4+i4x4;
-                            if( x264_quant_4x4_trellis( h, dct4x4[idx], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
-                            {
-                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
-                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp );
-                                if( i_decimate_8x8 < 6 )
-                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
-                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
-                                nnz8x8 = 1;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
-                        if( nz )
-                        {
-                            FOREACH_BIT( idx, i8x8*4, nz )
-                            {
-                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
-                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp );
-                                if( i_decimate_8x8 < 6 )
-                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
-                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
-                            }
-                        }
-                    }
-                    if( nnz8x8 )
-                    {
-                        i_decimate_mb += i_decimate_8x8;
-                        if( i_decimate_8x8 < 4 )
-                            STORE_8x8_NNZ( p, i8x8, 0 );
-                        else
-                            plane_cbp |= 1<<i8x8;
-                    }
-                }
-
-                if( i_decimate_mb < 6 )
-                {
-                    plane_cbp = 0;
-                    CLEAR_16x16_NNZ( p );
-                }
-                else
-                {
-                    h->mb.i_cbp_luma |= plane_cbp;
-                    FOREACH_BIT( i8x8, 0, plane_cbp )
-                    {
-                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
-                    }
-                }
-            }
-        }
-    }
-
-    /* encode chroma */
-    if( chroma )
-    {
-        if( IS_INTRA( h->mb.i_type ) )
-        {
-            int i_mode = h->mb.i_chroma_pred_mode;
-            if( h->mb.b_lossless )
-                x264_predict_lossless_chroma( h, i_mode );
-            else
-            {
-                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
-                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
-            }
-        }
-
-        /* encode the 8x8 blocks */
-        x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
-    }
-    else
-        h->mb.i_cbp_chroma = 0;
-
-    /* store cbp */
-    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
-    if( h->param.b_cabac )
-        cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
-            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
-            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
-    h->mb.cbp[h->mb.i_mb_xy] = cbp;
-
-    /* Check for P_SKIP
-     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
-     *      (if multiple mv give same result)*/
-    if( !b_force_no_skip )
-    {
-        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
-            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
-            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
-            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
-        {
-            h->mb.i_type = P_SKIP;
-        }
-
-        /* Check for B_SKIP */
-        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
-        {
-            h->mb.i_type = B_SKIP;
-        }
-    }
-}
-
-void x264_macroblock_encode( x264_t *h )
-{
-    if( CHROMA444 )
-        x264_macroblock_encode_internal( h, 3, 0 );
-    else
-        x264_macroblock_encode_internal( h, 1, 1 );
-}
-
-/*****************************************************************************
- * x264_macroblock_probe_skip:
- *  Check if the current MB could be encoded as a [PB]_SKIP
- *****************************************************************************/
-static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
-{
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
-    ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
-    ALIGNED_4( int16_t mvp[2] );
-    int i_qp = h->mb.i_qp;
-
-    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-    {
-        int quant_cat = p ? CQM_4PC : CQM_4PY;
-        if( !b_bidir )
-        {
-            /* Get the MV */
-            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
-            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
-
-            /* Motion compensation */
-            h->mc.mc_luma( h->mb.pic.p_fdec[p],    FDEC_STRIDE,
-                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
-                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
-        }
-
-        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
-        {
-            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
-            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
-
-            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
-                                        h->mb.pic.p_fdec[p] + fdec_offset );
-
-            if( h->mb.b_noise_reduction )
-                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
-                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
-
-            int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
-            FOREACH_BIT( idx, 0, nz )
-            {
-                h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
-                i_decimate_mb += h->quantf.decimate_score16( dctscan );
-                if( i_decimate_mb >= 6 )
-                    return 0;
-            }
-        }
-    }
-
-    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
-    {
-        i_qp = h->mb.i_chroma_qp;
-        int chroma422 = chroma == CHROMA_422;
-        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
-        int ssd;
-        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
-
-        if( !b_bidir )
-        {
-            /* Special case for mv0, which is (of course) very common in P-skip mode. */
-            if( M32( mvp ) )
-                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
-            else
-                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
-                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
-        }
-
-        for( int ch = 0; ch < 2; ch++ )
-        {
-            pixel *p_src = h->mb.pic.p_fenc[1+ch];
-            pixel *p_dst = h->mb.pic.p_fdec[1+ch];
-
-            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
-                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );
-
-            /* there is almost never a termination during chroma, but we can't avoid the check entirely */
-            /* so instead we check SSD and skip the actual check if the score is low enough. */
-            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
-            if( ssd < thresh )
-                continue;
-
-            /* The vast majority of chroma checks will terminate during the DC check or the higher
-             * threshold check, so we can save time by doing a DC-only DCT. */
-            if( h->mb.b_noise_reduction )
-            {
-                for( int i = 0; i <= chroma422; i++ )
-                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
-
-                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
-                {
-                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                    dct_dc[i4x4] = dct4x4[i4x4][0];
-                    dct4x4[i4x4][0] = 0;
-                }
-            }
-            else
-            {
-                if( chroma422 )
-                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
-                else
-                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
-            }
-
-            for( int i = 0; i <= chroma422; i++ )
-                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
-                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
-                    return 0;
-
-            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
-            if( ssd < thresh*4 )
-                continue;
-
-            if( !h->mb.b_noise_reduction )
-                for( int i = 0; i <= chroma422; i++ )
-                {
-                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
-                    dct4x4[i*4+0][0] = 0;
-                    dct4x4[i*4+1][0] = 0;
-                    dct4x4[i*4+2][0] = 0;
-                    dct4x4[i*4+3][0] = 0;
-                }
-
-            /* calculate dct coeffs */
-            for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ )
-            {
-                int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-                FOREACH_BIT( idx, i8x8*4, nz )
-                {
-                    h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
-                    i_decimate_mb += h->quantf.decimate_score15( dctscan );
-                    if( i_decimate_mb >= 7 )
-                        return 0;
-                }
-            }
-        }
-    }
-
-    h->mb.b_skip_mc = 1;
-    return 1;
-}
-
-int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
-{
-    if( CHROMA_FORMAT == CHROMA_444 )
-        return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
-    else if( CHROMA_FORMAT == CHROMA_422 )
-        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
-    else
-        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
-}
-
-/****************************************************************************
- * DCT-domain noise reduction / adaptive deadzone
- * from libavcodec
- ****************************************************************************/
-
-void x264_noise_reduction_update( x264_t *h )
-{
-    h->nr_offset = h->nr_offset_denoise;
-    h->nr_residual_sum = h->nr_residual_sum_buf[0];
-    h->nr_count = h->nr_count_buf[0];
-    for( int cat = 0; cat < 3 + CHROMA444; cat++ )
-    {
-        int dct8x8 = cat&1;
-        int size = dct8x8 ? 64 : 16;
-        const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
-
-        if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
-        {
-            for( int i = 0; i < size; i++ )
-                h->nr_residual_sum[cat][i] >>= 1;
-            h->nr_count[cat] >>= 1;
-        }
-
-        for( int i = 0; i < size; i++ )
-            h->nr_offset[cat][i] =
-                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
-                 + h->nr_residual_sum[cat][i]/2)
-              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
-
-        /* Don't denoise DC coefficients */
-        h->nr_offset[cat][0] = 0;
-    }
-}
-
-/*****************************************************************************
- * RD only; 4 calls to this do not make up for one macroblock_encode.
- * doesn't transform chroma dc.
- *****************************************************************************/
-static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
-{
-    int b_decimate = h->mb.b_dct_decimate;
-    int i_qp = h->mb.i_qp;
-    int x = i8&1;
-    int y = i8>>1;
-    int nz;
-    int chroma422 = chroma == CHROMA_422;
-
-    h->mb.i_cbp_chroma = 0;
-    h->mb.i_cbp_luma &= ~(1 << i8);
-
-    if( !h->mb.b_skip_mc )
-        x264_mb_mc_8x8( h, i8 );
-
-    if( h->mb.b_lossless )
-    {
-        for( int p = 0; p < plane_count; p++ )
-        {
-            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
-            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-            int nnz8x8 = 0;
-            if( h->mb.b_transform_8x8 )
-            {
-                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
-                STORE_8x8_NNZ( p, i8, nnz8x8 );
-            }
-            else
-            {
-                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
-                {
-                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
-                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
-                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
-                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
-                    nnz8x8 |= nz;
-                }
-            }
-            h->mb.i_cbp_luma |= nnz8x8 << i8;
-        }
-        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
-        {
-            for( int ch = 0; ch < 2; ch++ )
-            {
-                dctcoef dc;
-                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
-                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
-
-                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
-                {
-                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
-                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
-                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
-                }
-            }
-            h->mb.i_cbp_chroma = 0x02;
-        }
-    }
-    else
-    {
-        if( h->mb.b_transform_8x8 )
-        {
-            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-            {
-                int quant_cat = p ? CQM_8PC : CQM_8PY;
-                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
-                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-                ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
-
-                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
-                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
-                if( nnz8x8 )
-                {
-                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );
-
-                    if( b_decimate && !h->mb.b_trellis )
-                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );
-
-                    if( nnz8x8 )
-                    {
-                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
-                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
-                        STORE_8x8_NNZ( p, i8, 1 );
-                        h->mb.i_cbp_luma |= 1 << i8;
-                    }
-                    else
-                        STORE_8x8_NNZ( p, i8, 0 );
-                }
-                else
-                    STORE_8x8_NNZ( p, i8, 0 );
-            }
-        }
-        else
-        {
-            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-            {
-                int quant_cat = p ? CQM_4PC : CQM_4PY;
-                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
-                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-                int i_decimate_8x8 = b_decimate ? 0 : 4;
-                ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
-                int nnz8x8 = 0;
-
-                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
-                STORE_8x8_NNZ( p, i8, 0 );
-
-                if( h->mb.b_noise_reduction )
-                    for( int idx = 0; idx < 4; idx++ )
-                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
-
-                if( h->mb.b_trellis )
-                {
-                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
-                    {
-                        if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) )
-                        {
-                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
-                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
-                            if( i_decimate_8x8 < 4 )
-                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
-                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
-                            nnz8x8 = 1;
-                        }
-                    }
-                }
-                else
-                {
-                    nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
-                    if( nz )
-                    {
-                        FOREACH_BIT( i4x4, 0, nz )
-                        {
-                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
-                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
-                            if( i_decimate_8x8 < 4 )
-                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
-                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
-                        }
-                    }
-                }
-                if( nnz8x8 )
-                {
-                    /* decimate this 8x8 block */
-                    if( i_decimate_8x8 < 4 )
-                        STORE_8x8_NNZ( p, i8, 0 );
-                    else
-                    {
-                        h->dctf.add8x8_idct( p_fdec, dct4x4 );
-                        h->mb.i_cbp_luma |= 1 << i8;
-                    }
-                }
-            }
-        }
-
-        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
-        {
-            i_qp = h->mb.i_chroma_qp;
-            for( int ch = 0; ch < 2; ch++ )
-            {
-                ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
-                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
-                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
-
-                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
-                {
-                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );
-
-                    if( h->mb.b_noise_reduction )
-                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                    dct4x4[i4x4][0] = 0;
-
-                    if( h->mb.b_trellis )
-                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
-                    else
-                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-
-                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
-                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
-                    if( nz )
-                    {
-                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
-                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
-                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
-                    }
-                }
-            }
-            h->mb.i_cbp_chroma = 0x02;
-        }
-    }
-}
-
-void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
-{
-    if( CHROMA444 )
-        x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
-    else if( CHROMA_FORMAT == CHROMA_422 )
-        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
-    else
-        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
-}
-
-/*****************************************************************************
- * RD only, luma only (for 4:2:0)
- *****************************************************************************/
-static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
-{
-    int i_qp = h->mb.i_qp;
-
-    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
-    {
-        int quant_cat = p ? CQM_4PC : CQM_4PY;
-        pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
-        pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
-        int nz;
-
-        /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
-
-        if( h->mb.b_lossless )
-        {
-            nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
-            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
-        }
-        else
-        {
-            ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
-            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-            nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
-            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
-            if( nz )
-            {
-                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
-                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
-                h->dctf.add4x4_idct( p_fdec, dct4x4 );
-            }
-        }
-    }
-}
-
-void x264_macroblock_encode_p4x4( x264_t *h, int i8 )
-{
-    if( CHROMA444 )
-        x264_macroblock_encode_p4x4_internal( h, i8, 3 );
-    else
-        x264_macroblock_encode_p4x4_internal( h, i8, 1 );
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/macroblock.h b/android/src/main/libenc/jni/libx264/encoder/macroblock.h
deleted file mode 100755
index 585d585..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/macroblock.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*****************************************************************************
- * macroblock.h: macroblock encoding
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ENCODER_MACROBLOCK_H
-#define X264_ENCODER_MACROBLOCK_H
-
-#include "common/macroblock.h"
-
-extern const int x264_lambda2_tab[QP_MAX_MAX+1];
-extern const uint16_t x264_lambda_tab[QP_MAX_MAX+1];
-
-void x264_rdo_init( void );
-
-int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
-
-#define x264_macroblock_probe_pskip( h )\
-    x264_macroblock_probe_skip( h, 0 )
-#define x264_macroblock_probe_bskip( h )\
-    x264_macroblock_probe_skip( h, 1 )
-
-void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
-void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
-void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
-void x264_predict_lossless_chroma( x264_t *h, int i_mode );
-
-void x264_macroblock_encode      ( x264_t *h );
-void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
-void x264_macroblock_write_cavlc ( x264_t *h );
-
-void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
-void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
-void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
-
-void x264_cabac_mb_skip( x264_t *h, int b_skip );
-
-int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
-                                int ctx_block_cat, int b_intra, int idx );
-int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx );
-int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
-int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
-
-void x264_noise_reduction_update( x264_t *h );
-
-static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
-{
-    int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
-    if( h->mb.b_noise_reduction )
-        h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
-    if( h->mb.b_trellis )
-        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
-    else
-        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
-}
-
-static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
-{
-    int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY);
-    if( h->mb.b_noise_reduction )
-        h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );
-    if( h->mb.b_trellis )
-        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
-    else
-        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
-}
-
-#define STORE_8x8_NNZ( p, idx, nz )\
-do\
-{\
-    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
-    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
-} while(0)
-
-#define CLEAR_16x16_NNZ( p ) \
-do\
-{\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 0*8] ) = 0;\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 1*8] ) = 0;\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 2*8] ) = 0;\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 3*8] ) = 0;\
-} while(0)
-
-/* A special for loop that iterates branchlessly over each set
- * bit in a 4-bit input. */
-#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ )
-
-static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict )
-{
-    int nz;
-    pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
-    pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
-
-    if( b_predict )
-    {
-        if( h->mb.b_lossless )
-            x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
-        else
-            h->predict_4x4[i_mode]( p_dst );
-    }
-
-    if( h->mb.b_lossless )
-    {
-        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
-        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
-        h->mb.i_cbp_luma |= nz<<(idx>>2);
-        return;
-    }
-
-    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
-
-    nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
-    h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
-    if( nz )
-    {
-        h->mb.i_cbp_luma |= 1<<(idx>>2);
-        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
-        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
-        h->dctf.add4x4_idct( p_dst, dct4x4 );
-    }
-}
-
-static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict )
-{
-    int x = idx&1;
-    int y = idx>>1;
-    int nz;
-    pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
-    pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
-    ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
-    ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
-
-    if( b_predict )
-    {
-        if( !edge )
-        {
-            h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
-            edge = edge_buf;
-        }
-
-        if( h->mb.b_lossless )
-            x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
-        else
-            h->predict_8x8[i_mode]( p_dst, edge );
-    }
-
-    if( h->mb.b_lossless )
-    {
-        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
-        STORE_8x8_NNZ( p, idx, nz );
-        h->mb.i_cbp_luma |= nz<<idx;
-        return;
-    }
-
-    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
-
-    nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
-    if( nz )
-    {
-        h->mb.i_cbp_luma |= 1<<idx;
-        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
-        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
-        h->dctf.add8x8_idct8( p_dst, dct8x8 );
-        STORE_8x8_NNZ( p, idx, 1 );
-    }
-    else
-        STORE_8x8_NNZ( p, idx, 0 );
-}
-
-#endif
-
diff --git a/android/src/main/libenc/jni/libx264/encoder/me.c b/android/src/main/libenc/jni/libx264/encoder/me.c
deleted file mode 100755
index b5552fb..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/me.c
+++ /dev/null
@@ -1,1350 +0,0 @@
-/*****************************************************************************
- * me.c: motion estimation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-#include "me.h"
-
-/* presets selected from good points on the speed-vs-quality curve of several test videos
- * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
- * where me_* are the number of EPZS iterations run on all candidate block types,
- * and refine_* are run only on the winner.
- * the subme=8,9 values are much higher because any amount of satd search makes
- * up its time by reducing the number of qpel-rd iterations. */
-static const uint8_t subpel_iterations[][4] =
-   {{0,0,0,0},
-    {1,1,0,0},
-    {0,1,1,0},
-    {0,2,1,0},
-    {0,2,1,1},
-    {0,2,1,2},
-    {0,0,2,2},
-    {0,0,2,2},
-    {0,0,4,10},
-    {0,0,4,10},
-    {0,0,4,10},
-    {0,0,4,10}};
-
-/* (x-1)%6 */
-static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
-/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const int8_t hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
-static const int8_t square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
-
-static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
-
-#define BITS_MVD( mx, my )\
-    (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
-
-#define COST_MV( mx, my )\
-do\
-{\
-    int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
-                   &p_fref_w[(my)*stride+(mx)], stride )\
-             + BITS_MVD(mx,my);\
-    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
-} while(0)
-
-#define COST_MV_HPEL( mx, my, cost )\
-do\
-{\
-    intptr_t stride2 = 16;\
-    pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\
-    cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\
-         + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\
-} while(0)
-
-#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
-{\
-    pixel *pix_base = p_fref_w + bmx + bmy*stride;\
-    h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
-        pix_base + (m0x) + (m0y)*stride,\
-        pix_base + (m1x) + (m1y)*stride,\
-        pix_base + (m2x) + (m2y)*stride,\
-        stride, costs );\
-    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
-    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
-    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
-}
-
-#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
-{\
-    pixel *pix_base = p_fref_w + bmx + bmy*stride;\
-    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
-        pix_base + (m0x) + (m0y)*stride,\
-        pix_base + (m1x) + (m1y)*stride,\
-        pix_base + (m2x) + (m2y)*stride,\
-        pix_base + (m3x) + (m3y)*stride,\
-        stride, costs );\
-    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
-    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
-    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
-    (costs)[3] += BITS_MVD( bmx+(m3x), bmy+(m3y) );\
-}
-
-#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
-{\
-    pixel *pix_base = p_fref_w + omx + omy*stride;\
-    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
-        pix_base + (m0x) + (m0y)*stride,\
-        pix_base + (m1x) + (m1y)*stride,\
-        pix_base + (m2x) + (m2y)*stride,\
-        pix_base + (m3x) + (m3y)*stride,\
-        stride, costs );\
-    costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
-    costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
-    costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
-    costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
-    COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
-    COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
-    COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
-    COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
-}
-
-#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
-{\
-    h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
-        p_fref_w + (m0x) + (m0y)*stride,\
-        p_fref_w + (m1x) + (m1y)*stride,\
-        p_fref_w + (m2x) + (m2y)*stride,\
-        stride, costs );\
-    costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
-    costs[1] += p_cost_mvx[(m1x)<<2];\
-    costs[2] += p_cost_mvx[(m2x)<<2];\
-    COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
-    COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
-    COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
-}
-
-/*  1  */
-/* 101 */
-/*  1  */
-#define DIA1_ITER( mx, my )\
-{\
-    omx = mx; omy = my;\
-    COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
-}
-
-#define CROSS( start, x_max, y_max )\
-{\
-    int i = start;\
-    if( (x_max) <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\
-        for( ; i < (x_max)-2; i+=4 )\
-            COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\
-    for( ; i < (x_max); i+=2 )\
-    {\
-        if( omx+i <= mv_x_max )\
-            COST_MV( omx+i, omy );\
-        if( omx-i >= mv_x_min )\
-            COST_MV( omx-i, omy );\
-    }\
-    i = start;\
-    if( (y_max) <= X264_MIN(mv_y_max-omy, omy-mv_y_min) )\
-        for( ; i < (y_max)-2; i+=4 )\
-            COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\
-    for( ; i < (y_max); i+=2 )\
-    {\
-        if( omy+i <= mv_y_max )\
-            COST_MV( omx, omy+i );\
-        if( omy-i >= mv_y_min )\
-            COST_MV( omx, omy-i );\
-    }\
-}
-
-#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */
-#define SPEL(mv) ((mv)<<2)     /* ... and the reverse. */
-#define SPELx2(mv) (SPEL(mv)&0xFFFCFFFC) /* for two packed MVs */
-
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
-{
-    const int bw = x264_pixel_size[m->i_pixel].w;
-    const int bh = x264_pixel_size[m->i_pixel].h;
-    const int i_pixel = m->i_pixel;
-    const int stride = m->i_stride[0];
-    int i_me_range = h->param.analyse.i_me_range;
-    int bmx, bmy, bcost = COST_MAX;
-    int bpred_cost = COST_MAX;
-    int omx, omy, pmx, pmy;
-    pixel *p_fenc = m->p_fenc[0];
-    pixel *p_fref_w = m->p_fref_w;
-    ALIGNED_ARRAY_N( pixel, pix,[16*16] );
-    ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
-
-    ALIGNED_ARRAY_16( int, costs,[16] );
-
-    int mv_x_min = h->mb.mv_limit_fpel[0][0];
-    int mv_y_min = h->mb.mv_limit_fpel[0][1];
-    int mv_x_max = h->mb.mv_limit_fpel[1][0];
-    int mv_y_max = h->mb.mv_limit_fpel[1][1];
-/* Special version of pack to allow shortcuts in CHECK_MVRANGE */
-#define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF))
-    uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min );
-    uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000;
-    uint32_t pmv, bpred_mv = 0;
-
-#define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000))
-
-    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
-
-    /* Try extra predictors if provided.  If subme >= 3, check subpel predictors,
-     * otherwise round them to fullpel. */
-    if( h->mb.i_subpel_refine >= 3 )
-    {
-        /* Calculate and check the MVP first */
-        int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) );
-        int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) );
-        pmv = pack16to32_mask( bpred_mx, bpred_my );
-        pmx = FPEL( bpred_mx );
-        pmy = FPEL( bpred_my );
-
-        COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost );
-        int pmv_cost = bpred_cost;
-
-        if( i_mvc > 0 )
-        {
-            /* Clip MV candidates and eliminate those equal to zero and pmv. */
-            int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
-            if( valid_mvcs > 0 )
-            {
-                int i = 1, cost;
-                /* We stuff pmv here to branchlessly pick between pmv and the various
-                 * MV candidates. [0] gets skipped in order to maintain alignment for
-                 * x264_predictor_clip. */
-                M32( mvc_temp[1] ) = pmv;
-                bpred_cost <<= 4;
-                do
-                {
-                    int mx = mvc_temp[i+1][0];
-                    int my = mvc_temp[i+1][1];
-                    COST_MV_HPEL( mx, my, cost );
-                    COPY1_IF_LT( bpred_cost, (cost << 4) + i );
-                } while( ++i <= valid_mvcs );
-                bpred_mx = mvc_temp[(bpred_cost&15)+1][0];
-                bpred_my = mvc_temp[(bpred_cost&15)+1][1];
-                bpred_cost >>= 4;
-            }
-        }
-
-        /* Round the best predictor back to fullpel and get the cost, since this is where
-         * we'll be starting the fullpel motion search. */
-        bmx = FPEL( bpred_mx );
-        bmy = FPEL( bpred_my );
-        bpred_mv = pack16to32_mask(bpred_mx, bpred_my);
-        if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */
-            COST_MV( bmx, bmy );
-        else                          /* Otherwise just copy the cost (we already know it) */
-            bcost = bpred_cost;
-
-        /* Test the zero vector if it hasn't been tested yet. */
-        if( pmv )
-        {
-            if( bmx|bmy ) COST_MV( 0, 0 );
-        }
-        /* If a subpel mv candidate was better than the zero vector, the previous
-         * fullpel check won't have gotten it even if the pmv was zero. So handle
-         * that possibility here. */
-        else
-        {
-            COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 );
-        }
-    }
-    else
-    {
-        /* Calculate and check the fullpel MVP first */
-        bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max );
-        bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max );
-        pmv = pack16to32_mask( bmx, bmy );
-
-        /* Because we are rounding the predicted motion vector to fullpel, there will be
-         * an extra MV cost in 15 out of 16 cases.  However, when the predicted MV is
-         * chosen as the best predictor, it is often the case that the subpel search will
-         * result in a vector at or next to the predicted motion vector.  Therefore, we omit
-         * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of
-         * the predicted motion vector.
-         *
-         * Disclaimer: this is a post-hoc rationalization for why this hack works. */
-        bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
-
-        if( i_mvc > 0 )
-        {
-            /* Like in subme>=3, except we also round the candidates to fullpel. */
-            int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
-            if( valid_mvcs > 0 )
-            {
-                int i = 1, cost;
-                M32( mvc_temp[1] ) = pmv;
-                bcost <<= 4;
-                do
-                {
-                    int mx = mvc_temp[i+1][0];
-                    int my = mvc_temp[i+1][1];
-                    cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
-                    COPY1_IF_LT( bcost, (cost << 4) + i );
-                } while( ++i <= valid_mvcs );
-                bmx = mvc_temp[(bcost&15)+1][0];
-                bmy = mvc_temp[(bcost&15)+1][1];
-                bcost >>= 4;
-            }
-        }
-
-        /* Same as above, except the condition is simpler. */
-        if( pmv )
-            COST_MV( 0, 0 );
-    }
-
-    switch( h->mb.i_me_method )
-    {
-        case X264_ME_DIA:
-        {
-            /* diamond search, radius 1 */
-            bcost <<= 4;
-            int i = i_me_range;
-            do
-            {
-                COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs );
-                COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
-                COPY1_IF_LT( bcost, (costs[1]<<4)+3 );
-                COPY1_IF_LT( bcost, (costs[2]<<4)+4 );
-                COPY1_IF_LT( bcost, (costs[3]<<4)+12 );
-                if( !(bcost&15) )
-                    break;
-                bmx -= (bcost<<28)>>30;
-                bmy -= (bcost<<30)>>30;
-                bcost &= ~15;
-            } while( --i && CHECK_MVRANGE(bmx, bmy) );
-            bcost >>= 4;
-            break;
-        }
-
-        case X264_ME_HEX:
-        {
-    me_hex2:
-            /* hexagon search, radius 2 */
-    #if 0
-            for( int i = 0; i < i_me_range/2; i++ )
-            {
-                omx = bmx; omy = bmy;
-                COST_MV( omx-2, omy   );
-                COST_MV( omx-1, omy+2 );
-                COST_MV( omx+1, omy+2 );
-                COST_MV( omx+2, omy   );
-                COST_MV( omx+1, omy-2 );
-                COST_MV( omx-1, omy-2 );
-                if( bmx == omx && bmy == omy )
-                    break;
-                if( !CHECK_MVRANGE(bmx, bmy) )
-                    break;
-            }
-    #else
-            /* equivalent to the above, but eliminates duplicate candidates */
-
-            /* hexagon */
-            COST_MV_X3_DIR( -2,0, -1, 2,  1, 2, costs   );
-            COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */
-            bcost <<= 3;
-            COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
-            COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
-            COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
-            COPY1_IF_LT( bcost, (costs[4]<<3)+5 );
-            COPY1_IF_LT( bcost, (costs[5]<<3)+6 );
-            COPY1_IF_LT( bcost, (costs[6]<<3)+7 );
-
-            if( bcost&7 )
-            {
-                int dir = (bcost&7)-2;
-                bmx += hex2[dir+1][0];
-                bmy += hex2[dir+1][1];
-
-                /* half hexagon, not overlapping the previous iteration */
-                for( int i = (i_me_range>>1) - 1; i > 0 && CHECK_MVRANGE(bmx, bmy); i-- )
-                {
-                    COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1],
-                                    hex2[dir+1][0], hex2[dir+1][1],
-                                    hex2[dir+2][0], hex2[dir+2][1],
-                                    costs );
-                    bcost &= ~7;
-                    COPY1_IF_LT( bcost, (costs[0]<<3)+1 );
-                    COPY1_IF_LT( bcost, (costs[1]<<3)+2 );
-                    COPY1_IF_LT( bcost, (costs[2]<<3)+3 );
-                    if( !(bcost&7) )
-                        break;
-                    dir += (bcost&7)-2;
-                    dir = mod6m1[dir+1];
-                    bmx += hex2[dir+1][0];
-                    bmy += hex2[dir+1][1];
-                }
-            }
-            bcost >>= 3;
-    #endif
-            /* square refine */
-            bcost <<= 4;
-            COST_MV_X4_DIR(  0,-1,  0,1, -1,0, 1,0, costs );
-            COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
-            COPY1_IF_LT( bcost, (costs[1]<<4)+2 );
-            COPY1_IF_LT( bcost, (costs[2]<<4)+3 );
-            COPY1_IF_LT( bcost, (costs[3]<<4)+4 );
-            COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs );
-            COPY1_IF_LT( bcost, (costs[0]<<4)+5 );
-            COPY1_IF_LT( bcost, (costs[1]<<4)+6 );
-            COPY1_IF_LT( bcost, (costs[2]<<4)+7 );
-            COPY1_IF_LT( bcost, (costs[3]<<4)+8 );
-            bmx += square1[bcost&15][0];
-            bmy += square1[bcost&15][1];
-            bcost >>= 4;
-            break;
-        }
-
-        case X264_ME_UMH:
-        {
-            /* Uneven-cross Multi-Hexagon-grid Search
-             * as in JM, except with different early termination */
-
-            static const uint8_t x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
-
-            int ucost1, ucost2;
-            int cross_start = 1;
-
-            /* refine predictors */
-            ucost1 = bcost;
-            DIA1_ITER( pmx, pmy );
-            if( pmx | pmy )
-                DIA1_ITER( 0, 0 );
-
-            if( i_pixel == PIXEL_4x4 )
-                goto me_hex2;
-
-            ucost2 = bcost;
-            if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
-                DIA1_ITER( bmx, bmy );
-            if( bcost == ucost2 )
-                cross_start = 3;
-            omx = bmx; omy = bmy;
-
-            /* early termination */
-#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
-            if( bcost == ucost2 && SAD_THRESH(2000) )
-            {
-                COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
-                COST_MV_X4( 2, 0, -1, 1, 1, 1,  0,2 );
-                if( bcost == ucost1 && SAD_THRESH(500) )
-                    break;
-                if( bcost == ucost2 )
-                {
-                    int range = (i_me_range>>1) | 1;
-                    CROSS( 3, range, range );
-                    COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
-                    COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
-                    if( bcost == ucost2 )
-                        break;
-                    cross_start = range + 2;
-                }
-            }
-
-            /* adaptive search range */
-            if( i_mvc )
-            {
-                /* range multipliers based on casual inspection of some statistics of
-                 * average distance between current predictor and final mv found by ESA.
-                 * these have not been tuned much by actual encoding. */
-                static const uint8_t range_mul[4][4] =
-                {
-                    { 3, 3, 4, 4 },
-                    { 3, 4, 4, 4 },
-                    { 4, 4, 4, 5 },
-                    { 4, 4, 5, 6 },
-                };
-                int mvd;
-                int sad_ctx, mvd_ctx;
-                int denom = 1;
-
-                if( i_mvc == 1 )
-                {
-                    if( i_pixel == PIXEL_16x16 )
-                        /* mvc is probably the same as mvp, so the difference isn't meaningful.
-                         * but prediction usually isn't too bad, so just use medium range */
-                        mvd = 25;
-                    else
-                        mvd = abs( m->mvp[0] - mvc[0][0] )
-                            + abs( m->mvp[1] - mvc[0][1] );
-                }
-                else
-                {
-                    /* calculate the degree of agreement between predictors. */
-                    /* in 16x16, mvc includes all the neighbors used to make mvp,
-                     * so don't count mvp separately. */
-                    denom = i_mvc - 1;
-                    mvd = 0;
-                    if( i_pixel != PIXEL_16x16 )
-                    {
-                        mvd = abs( m->mvp[0] - mvc[0][0] )
-                            + abs( m->mvp[1] - mvc[0][1] );
-                        denom++;
-                    }
-                    mvd += x264_predictor_difference( mvc, i_mvc );
-                }
-
-                sad_ctx = SAD_THRESH(1000) ? 0
-                        : SAD_THRESH(2000) ? 1
-                        : SAD_THRESH(4000) ? 2 : 3;
-                mvd_ctx = mvd < 10*denom ? 0
-                        : mvd < 20*denom ? 1
-                        : mvd < 40*denom ? 2 : 3;
-
-                i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2;
-            }
-
-            /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
-             * we are still centered on the same place as the DIA2. is this desirable? */
-            CROSS( cross_start, i_me_range, i_me_range>>1 );
-
-            COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );
-
-            /* hexagon grid */
-            omx = bmx; omy = bmy;
-            const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
-            const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
-            int i = 1;
-            do
-            {
-                static const int8_t hex4[16][2] = {
-                    { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
-                    {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
-                    {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
-                    {-4, 2}, { 4, 2}, {-2, 3}, { 2, 3},
-                };
-
-                if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
-                                     mv_y_max-omy, omy-mv_y_min ) )
-                {
-                    for( int j = 0; j < 16; j++ )
-                    {
-                        int mx = omx + hex4[j][0]*i;
-                        int my = omy + hex4[j][1]*i;
-                        if( CHECK_MVRANGE(mx, my) )
-                            COST_MV( mx, my );
-                    }
-                }
-                else
-                {
-                    int dir = 0;
-                    pixel *pix_base = p_fref_w + omx + (omy-4*i)*stride;
-                    int dy = i*stride;
-#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
-                    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
-                            pix_base x0*i+(y0-2*k+4)*dy,\
-                            pix_base x1*i+(y1-2*k+4)*dy,\
-                            pix_base x2*i+(y2-2*k+4)*dy,\
-                            pix_base x3*i+(y3-2*k+4)*dy,\
-                            stride, costs+4*k );\
-                    pix_base += 2*dy;
-#define ADD_MVCOST(k,x,y) costs[k] += p_cost_omvx[x*4*i] + p_cost_omvy[y*4*i]
-#define MIN_MV(k,x,y)     COPY2_IF_LT( bcost, costs[k], dir, x*16+(y&15) )
-                    SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 );
-                    SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 );
-                    SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 );
-                    SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 );
-                    ADD_MVCOST(  0, 0,-4 );
-                    ADD_MVCOST(  1, 0, 4 );
-                    ADD_MVCOST(  2,-2,-3 );
-                    ADD_MVCOST(  3, 2,-3 );
-                    ADD_MVCOST(  4,-4,-2 );
-                    ADD_MVCOST(  5, 4,-2 );
-                    ADD_MVCOST(  6,-4,-1 );
-                    ADD_MVCOST(  7, 4,-1 );
-                    ADD_MVCOST(  8,-4, 0 );
-                    ADD_MVCOST(  9, 4, 0 );
-                    ADD_MVCOST( 10,-4, 1 );
-                    ADD_MVCOST( 11, 4, 1 );
-                    ADD_MVCOST( 12,-4, 2 );
-                    ADD_MVCOST( 13, 4, 2 );
-                    ADD_MVCOST( 14,-2, 3 );
-                    ADD_MVCOST( 15, 2, 3 );
-                    MIN_MV(  0, 0,-4 );
-                    MIN_MV(  1, 0, 4 );
-                    MIN_MV(  2,-2,-3 );
-                    MIN_MV(  3, 2,-3 );
-                    MIN_MV(  4,-4,-2 );
-                    MIN_MV(  5, 4,-2 );
-                    MIN_MV(  6,-4,-1 );
-                    MIN_MV(  7, 4,-1 );
-                    MIN_MV(  8,-4, 0 );
-                    MIN_MV(  9, 4, 0 );
-                    MIN_MV( 10,-4, 1 );
-                    MIN_MV( 11, 4, 1 );
-                    MIN_MV( 12,-4, 2 );
-                    MIN_MV( 13, 4, 2 );
-                    MIN_MV( 14,-2, 3 );
-                    MIN_MV( 15, 2, 3 );
-#undef SADS
-#undef ADD_MVCOST
-#undef MIN_MV
-                    if(dir)
-                    {
-                        bmx = omx + i*(dir>>4);
-                        bmy = omy + i*((dir<<28)>>28);
-                    }
-                }
-            } while( ++i <= i_me_range>>2 );
-            if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min )
-                goto me_hex2;
-            break;
-        }
-
-        case X264_ME_ESA:
-        case X264_ME_TESA:
-        {
-            const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
-            const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
-            const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
-            const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
-            /* SEA is fastest in multiples of 4 */
-            const int width = (max_x - min_x + 3) & ~3;
-#if 0
-            /* plain old exhaustive search */
-            for( int my = min_y; my <= max_y; my++ )
-                for( int mx = min_x; mx < min_x + width; mx++ )
-                    COST_MV( mx, my );
-#else
-            /* successive elimination by comparing DC before a full SAD,
-             * because sum(abs(diff)) >= abs(diff(sum)). */
-            uint16_t *sums_base = m->integral;
-            ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
-            ALIGNED_ARRAY_16( int, enc_dc,[4] );
-            int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
-            int delta = x264_pixel_size[sad_size].w;
-            int16_t *xs = h->scratch_buffer;
-            int xn;
-            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
-
-            h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
-                p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
-                FENC_STRIDE, enc_dc );
-            if( delta == 4 )
-                sums_base += stride * (h->fenc->i_lines[0] + PADV*2);
-            if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
-                delta *= stride;
-            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
-                enc_dc[1] = enc_dc[2];
-
-            if( h->mb.i_me_method == X264_ME_TESA )
-            {
-                // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
-                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+31)&~31) + 4);
-                int nmvsad = 0, limit;
-                int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
-                int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
-                         + BITS_MVD( bmx, bmy );
-                for( int my = min_y; my <= max_y; my++ )
-                {
-                    int i;
-                    int ycost = p_cost_mvy[my<<2];
-                    if( bsad <= ycost )
-                        continue;
-                    bsad -= ycost;
-                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
-                                               cost_fpel_mvx+min_x, xs, width, bsad * 17 >> 4 );
-                    for( i = 0; i < xn-2; i += 3 )
-                    {
-                        pixel *ref = p_fref_w+min_x+my*stride;
-                        ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */
-                        h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
-                        for( int j = 0; j < 3; j++ )
-                        {
-                            int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
-                            if( sad < bsad*sad_thresh>>3 )
-                            {
-                                COPY1_IF_LT( bsad, sad );
-                                mvsads[nmvsad].sad = sad + ycost;
-                                mvsads[nmvsad].mv[0] = min_x+xs[i+j];
-                                mvsads[nmvsad].mv[1] = my;
-                                nmvsad++;
-                            }
-                        }
-                    }
-                    for( ; i < xn; i++ )
-                    {
-                        int mx = min_x+xs[i];
-                        int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
-                                + cost_fpel_mvx[xs[i]];
-                        if( sad < bsad*sad_thresh>>3 )
-                        {
-                            COPY1_IF_LT( bsad, sad );
-                            mvsads[nmvsad].sad = sad + ycost;
-                            mvsads[nmvsad].mv[0] = mx;
-                            mvsads[nmvsad].mv[1] = my;
-                            nmvsad++;
-                        }
-                    }
-                    bsad += ycost;
-                }
-
-                limit = i_me_range >> 1;
-                sad_thresh = bsad*sad_thresh>>3;
-                while( nmvsad > limit*2 && sad_thresh > bsad )
-                {
-                    int i = 0;
-                    // halve the range if the domain is too large... eh, close enough
-                    sad_thresh = (sad_thresh + bsad) >> 1;
-                    while( i < nmvsad && mvsads[i].sad <= sad_thresh )
-                        i++;
-                    for( int j = i; j < nmvsad; j++ )
-                    {
-                        uint32_t sad;
-                        if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
-                        {
-                            uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
-#if WORDS_BIGENDIAN
-                            mvsad >>= 32;
-#endif
-                            sad = mvsad;
-                        }
-                        else
-                        {
-                            sad = mvsads[j].sad;
-                            CP32( mvsads[i].mv, mvsads[j].mv );
-                            mvsads[i].sad = sad;
-                        }
-                        i += (sad - (sad_thresh+1)) >> 31;
-                    }
-                    nmvsad = i;
-                }
-                while( nmvsad > limit )
-                {
-                    int bi = 0;
-                    for( int i = 1; i < nmvsad; i++ )
-                        if( mvsads[i].sad > mvsads[bi].sad )
-                            bi = i;
-                    nmvsad--;
-                    if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                        CP64( &mvsads[bi], &mvsads[nmvsad] );
-                    else
-                        mvsads[bi] = mvsads[nmvsad];
-                }
-                for( int i = 0; i < nmvsad; i++ )
-                    COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] );
-            }
-            else
-            {
-                // just ADS and SAD
-                for( int my = min_y; my <= max_y; my++ )
-                {
-                    int i;
-                    int ycost = p_cost_mvy[my<<2];
-                    if( bcost <= ycost )
-                        continue;
-                    bcost -= ycost;
-                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
-                                               cost_fpel_mvx+min_x, xs, width, bcost );
-                    for( i = 0; i < xn-2; i += 3 )
-                        COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
-                    bcost += ycost;
-                    for( ; i < xn; i++ )
-                        COST_MV( min_x+xs[i], my );
-                }
-            }
-#endif
-        }
-        break;
-    }
-
-    /* -> qpel mv */
-    uint32_t bmv = pack16to32_mask(bmx,bmy);
-    uint32_t bmv_spel = SPELx2(bmv);
-    if( h->mb.i_subpel_refine < 3 )
-    {
-        m->cost_mv = p_cost_mvx[bmx<<2] + p_cost_mvy[bmy<<2];
-        m->cost = bcost;
-        /* compute the real cost */
-        if( bmv == pmv ) m->cost += m->cost_mv;
-        M32( m->mv ) = bmv_spel;
-    }
-    else
-    {
-        M32(m->mv) = bpred_cost < bcost ? bpred_mv : bmv_spel;
-        m->cost = X264_MIN( bpred_cost, bcost );
-    }
-
-    /* subpel refine */
-    if( h->mb.i_subpel_refine >= 2 )
-    {
-        int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
-        int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
-        refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
-    }
-}
-#undef COST_MV
-
-void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
-{
-    int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
-    int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
-
-    if( m->i_pixel <= PIXEL_8x8 )
-        m->cost -= m->i_ref_cost;
-
-    refine_subpel( h, m, hpel, qpel, NULL, 1 );
-}
-
-void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh )
-{
-    refine_subpel( h, m, 0, X264_MIN( 2, subpel_iterations[h->mb.i_subpel_refine][3] ), p_halfpel_thresh, 0 );
-}
-
-#define COST_MV_SAD( mx, my ) \
-{ \
-    intptr_t stride = 16; \
-    pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
-    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
-             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
-    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
-}
-
-#define COST_MV_SATD( mx, my, dir ) \
-if( b_refine_qpel || (dir^1) != odir ) \
-{ \
-    intptr_t stride = 16; \
-    pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
-    int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
-             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
-    if( b_chroma_me && cost < bcost ) \
-    { \
-        if( CHROMA444 ) \
-        { \
-            stride = 16; \
-            src = h->mc.get_ref( pix, &stride, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
-            cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[1], FENC_STRIDE, src, stride ); \
-            if( cost < bcost ) \
-            { \
-                stride = 16; \
-                src = h->mc.get_ref( pix, &stride, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
-                cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[2], FENC_STRIDE, src, stride ); \
-            } \
-        } \
-        else \
-        { \
-            h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \
-                             mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
-            if( m->weight[1].weightfn ) \
-                m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \
-            cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
-            if( cost < bcost ) \
-            { \
-                if( m->weight[2].weightfn ) \
-                    m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \
-                cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
-            } \
-        } \
-    } \
-    COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
-}
-
-static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
-{
-    const int bw = x264_pixel_size[m->i_pixel].w;
-    const int bh = x264_pixel_size[m->i_pixel].h;
-    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
-    const int i_pixel = m->i_pixel;
-    const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
-    int chromapix = h->luma2chroma_pixel[i_pixel];
-    int chroma_v_shift = CHROMA_V_SHIFT;
-    int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-
-    ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
-    ALIGNED_ARRAY_16( int, costs,[4] );
-
-    int bmx = m->mv[0];
-    int bmy = m->mv[1];
-    int bcost = m->cost;
-    int odir = -1, bdir;
-
-    /* halfpel diamond search */
-    if( hpel_iters )
-    {
-        /* try the subpel component of the predicted mv */
-        if( h->mb.i_subpel_refine < 3 )
-        {
-            int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 );
-            int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 );
-            if( (mx-bmx)|(my-bmy) )
-                COST_MV_SAD( mx, my );
-        }
-
-        bcost <<= 6;
-        for( int i = hpel_iters; i > 0; i-- )
-        {
-            int omx = bmx, omy = bmy;
-            intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
-            pixel *src0, *src1, *src2, *src3;
-            src0 = h->mc.get_ref( pix,    &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
-            src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
-            src1 = src0 + stride;
-            src3 = src2 + 1;
-            h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
-            costs[0] += p_cost_mvx[omx  ] + p_cost_mvy[omy-2];
-            costs[1] += p_cost_mvx[omx  ] + p_cost_mvy[omy+2];
-            costs[2] += p_cost_mvx[omx-2] + p_cost_mvy[omy  ];
-            costs[3] += p_cost_mvx[omx+2] + p_cost_mvy[omy  ];
-            COPY1_IF_LT( bcost, (costs[0]<<6)+2 );
-            COPY1_IF_LT( bcost, (costs[1]<<6)+6 );
-            COPY1_IF_LT( bcost, (costs[2]<<6)+16 );
-            COPY1_IF_LT( bcost, (costs[3]<<6)+48 );
-            if( !(bcost&63) )
-                break;
-            bmx -= (bcost<<26)>>29;
-            bmy -= (bcost<<29)>>29;
-            bcost &= ~63;
-        }
-        bcost >>= 6;
-    }
-
-    if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) )
-    {
-        bcost = COST_MAX;
-        COST_MV_SATD( bmx, bmy, -1 );
-    }
-
-    /* early termination when examining multiple reference frames */
-    if( p_halfpel_thresh )
-    {
-        if( (bcost*7)>>3 > *p_halfpel_thresh )
-        {
-            m->cost = bcost;
-            m->mv[0] = bmx;
-            m->mv[1] = bmy;
-            // don't need cost_mv
-            return;
-        }
-        else if( bcost < *p_halfpel_thresh )
-            *p_halfpel_thresh = bcost;
-    }
-
-    /* quarterpel diamond search */
-    if( h->mb.i_subpel_refine != 1 )
-    {
-        bdir = -1;
-        for( int i = qpel_iters; i > 0; i-- )
-        {
-            if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] )
-                break;
-            odir = bdir;
-            int omx = bmx, omy = bmy;
-            COST_MV_SATD( omx, omy - 1, 0 );
-            COST_MV_SATD( omx, omy + 1, 1 );
-            COST_MV_SATD( omx - 1, omy, 2 );
-            COST_MV_SATD( omx + 1, omy, 3 );
-            if( (bmx == omx) & (bmy == omy) )
-                break;
-        }
-    }
-    /* Special simplified case for subme=1 */
-    else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] )
-    {
-        int omx = bmx, omy = bmy;
-        /* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */
-        h->mc.mc_luma( pix   , 64, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] );
-        h->mc.mc_luma( pix+16, 64, m->p_fref, m->i_stride[0], omx, omy+1, bw, bh, &m->weight[0] );
-        h->mc.mc_luma( pix+32, 64, m->p_fref, m->i_stride[0], omx-1, omy, bw, bh, &m->weight[0] );
-        h->mc.mc_luma( pix+48, 64, m->p_fref, m->i_stride[0], omx+1, omy, bw, bh, &m->weight[0] );
-        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], pix, pix+16, pix+32, pix+48, 64, costs );
-        costs[0] += p_cost_mvx[omx  ] + p_cost_mvy[omy-1];
-        costs[1] += p_cost_mvx[omx  ] + p_cost_mvy[omy+1];
-        costs[2] += p_cost_mvx[omx-1] + p_cost_mvy[omy  ];
-        costs[3] += p_cost_mvx[omx+1] + p_cost_mvy[omy  ];
-        bcost <<= 4;
-        COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
-        COPY1_IF_LT( bcost, (costs[1]<<4)+3 );
-        COPY1_IF_LT( bcost, (costs[2]<<4)+4 );
-        COPY1_IF_LT( bcost, (costs[3]<<4)+12 );
-        bmx -= (bcost<<28)>>30;
-        bmy -= (bcost<<30)>>30;
-        bcost >>= 4;
-    }
-
-    m->cost = bcost;
-    m->mv[0] = bmx;
-    m->mv[1] = bmy;
-    m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy];
-}
-
-#define BIME_CACHE( dx, dy, list )\
-{\
-    x264_me_t *m = m##list;\
-    int i = 4 + 3*dx + dy;\
-    int mvx = bm##list##x+dx;\
-    int mvy = bm##list##y+dy;\
-    stride[0][list][i] = bw;\
-    src[0][list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[0][list][i], &m->p_fref[0],\
-                                     m->i_stride[0], mvx, mvy, bw, bh, x264_weight_none );\
-    if( rd )\
-    {\
-        if( CHROMA444 )\
-        {\
-            stride[1][list][i] = bw;\
-            src[1][list][i] = h->mc.get_ref( pixu_buf[list][i], &stride[1][list][i], &m->p_fref[4],\
-                                             m->i_stride[1], mvx, mvy, bw, bh, x264_weight_none );\
-            stride[2][list][i] = bw;\
-            src[2][list][i] = h->mc.get_ref( pixv_buf[list][i], &stride[2][list][i], &m->p_fref[8],\
-                                             m->i_stride[2], mvx, mvy, bw, bh, x264_weight_none );\
-        }\
-        else\
-            h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
-                             mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\
-    }\
-}
-
-#define SATD_THRESH(cost) (cost+(cost>>4))
-
-/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
- * other than making its iteration count not a compile-time constant. */
-int x264_iter_kludge = 0;
-
-static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
-{
-    int x = i8&1;
-    int y = i8>>1;
-    int s8 = X264_SCAN8_0 + 2*x + 16*y;
-    int16_t *cache0_mv = h->mb.cache.mv[0][s8];
-    int16_t *cache1_mv = h->mb.cache.mv[1][s8];
-    const int i_pixel = m0->i_pixel;
-    const int bw = x264_pixel_size[i_pixel].w;
-    const int bh = x264_pixel_size[i_pixel].h;
-    ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
-    pixel *src[3][2][9];
-    int chromapix = h->luma2chroma_pixel[i_pixel];
-    int chroma_v_shift = CHROMA_V_SHIFT;
-    int chroma_x = (8 >> CHROMA_H_SHIFT) * x;
-    int chroma_y = (8 >> chroma_v_shift) * y;
-    pixel *pix  = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
-    pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
-    pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE];
-    int ref0 = h->mb.cache.ref[0][s8];
-    int ref1 = h->mb.cache.ref[1][s8];
-    const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    intptr_t stride[3][2][9];
-    int bm0x = m0->mv[0];
-    int bm0y = m0->mv[1];
-    int bm1x = m1->mv[0];
-    int bm1y = m1->mv[1];
-    int bcost = COST_MAX;
-    int mc_list0 = 1, mc_list1 = 1;
-    uint64_t bcostrd = COST_MAX64;
-    uint16_t amvd;
-    /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
-    /* all permutations of an offset in up to 2 of the dimensions */
-    ALIGNED_4( static const int8_t dia4d[33][4] ) =
-    {
-        {0,0,0,0},
-        {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
-        {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0},
-        {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0},
-        {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1},
-        {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0},
-        {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0},
-        {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1},
-        {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0},
-    };
-
-    if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 ||
-        bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ||
-        bm0x < h->mb.mv_min_spel[0] + 8 || bm1x < h->mb.mv_min_spel[0] + 8 ||
-        bm0x > h->mb.mv_max_spel[0] - 8 || bm1x > h->mb.mv_max_spel[0] - 8 )
-        return;
-
-    if( rd && m0->i_pixel != PIXEL_16x16 && i8 != 0 )
-    {
-        x264_mb_predict_mv( h, 0, i8<<2, bw>>2, m0->mvp );
-        x264_mb_predict_mv( h, 1, i8<<2, bw>>2, m1->mvp );
-    }
-
-    const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
-    const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
-    const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
-    const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
-
-    h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
-
-    for( int pass = 0; pass < 8; pass++ )
-    {
-        int bestj = 0;
-        /* check all mv pairs that differ in at most 2 components from the current mvs. */
-        /* doesn't do chroma ME. this probably doesn't matter, as the gains
-         * from bidir ME are the same with and without chroma ME. */
-
-        if( mc_list0 )
-            for( int j = x264_iter_kludge; j < 9; j++ )
-                BIME_CACHE( square1[j][0], square1[j][1], 0 );
-
-        if( mc_list1 )
-            for( int j = x264_iter_kludge; j < 9; j++ )
-                BIME_CACHE( square1[j][0], square1[j][1], 1 );
-
-        for( int j = !!pass; j < 33; j++ )
-        {
-            int m0x = dia4d[j][0] + bm0x;
-            int m0y = dia4d[j][1] + bm0y;
-            int m1x = dia4d[j][2] + bm1x;
-            int m1y = dia4d[j][3] + bm1y;
-            if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) )
-            {
-                int i0 = 4 + 3*dia4d[j][0] + dia4d[j][1];
-                int i1 = 4 + 3*dia4d[j][2] + dia4d[j][3];
-                visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
-                h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][0][i0], stride[0][0][i0], src[0][1][i1], stride[0][1][i1], i_weight );
-                int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
-                         + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
-                if( rd )
-                {
-                    if( cost < SATD_THRESH(bcost) )
-                    {
-                        bcost = X264_MIN( cost, bcost );
-                        M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
-                        M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
-                        if( CHROMA444 )
-                        {
-                            h->mc.avg[i_pixel]( pixu, FDEC_STRIDE, src[1][0][i0], stride[1][0][i0], src[1][1][i1], stride[1][1][i1], i_weight );
-                            h->mc.avg[i_pixel]( pixv, FDEC_STRIDE, src[2][0][i0], stride[2][0][i0], src[2][1][i1], stride[2][1][i1], i_weight );
-                        }
-                        else
-                        {
-                            h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
-                            h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
-                        }
-                        uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
-                        COPY2_IF_LT( bcostrd, costrd, bestj, j );
-                    }
-                }
-                else
-                    COPY2_IF_LT( bcost, cost, bestj, j );
-            }
-        }
-
-        if( !bestj )
-            break;
-
-        bm0x += dia4d[bestj][0];
-        bm0y += dia4d[bestj][1];
-        bm1x += dia4d[bestj][2];
-        bm1y += dia4d[bestj][3];
-
-        mc_list0 = M16( &dia4d[bestj][0] );
-        mc_list1 = M16( &dia4d[bestj][2] );
-    }
-
-    if( rd )
-    {
-        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
-        amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
-        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
-
-        x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
-        amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
-        x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
-    }
-
-    m0->mv[0] = bm0x;
-    m0->mv[1] = bm0y;
-    m1->mv[0] = bm1x;
-    m1->mv[1] = bm1y;
-}
-
-void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight )
-{
-    x264_me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 );
-}
-
-void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
-{
-    /* Motion compensation is done as part of bidir_rd; don't repeat
-     * it in encoding. */
-    h->mb.b_skip_mc = 1;
-    x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 );
-    h->mb.b_skip_mc = 0;
-}
-
-#undef COST_MV_SATD
-#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \
-{ \
-    if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
-    { \
-        h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
-        dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
-            + p_cost_mvx[mx] + p_cost_mvy[my]; \
-        COPY1_IF_LT( bsatd, dst ); \
-    } \
-    else \
-        dst = COST_MAX; \
-}
-
-#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
-{ \
-    if( satd <= SATD_THRESH(bsatd) ) \
-    { \
-        uint64_t cost; \
-        M32( cache_mv ) = pack16to32_mask(mx,my); \
-        if( CHROMA444 ) \
-        { \
-            h->mc.mc_luma( pixu, FDEC_STRIDE, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
-            h->mc.mc_luma( pixv, FDEC_STRIDE, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
-        } \
-        else if( m->i_pixel <= PIXEL_8x8 ) \
-        { \
-            h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \
-                             mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
-            if( m->weight[1].weightfn ) \
-                m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \
-            if( m->weight[2].weightfn ) \
-                m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \
-        } \
-        cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
-        COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
-    } \
-}
-
-void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list )
-{
-    int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
-    const uint16_t *p_cost_mvx, *p_cost_mvy;
-    const int bw = x264_pixel_size[m->i_pixel].w;
-    const int bh = x264_pixel_size[m->i_pixel].h;
-    const int i_pixel = m->i_pixel;
-    int chroma_v_shift = CHROMA_V_SHIFT;
-    int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-
-    uint64_t bcost = COST_MAX64;
-    int bmx = m->mv[0];
-    int bmy = m->mv[1];
-    int omx, omy, pmx, pmy;
-    int satd, bsatd;
-    int dir = -2;
-    int i8 = i4>>2;
-    uint16_t amvd;
-
-    pixel *pix  = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
-    pixel *pixu, *pixv;
-    if( CHROMA444 )
-    {
-        pixu = &h->mb.pic.p_fdec[1][block_idx_xy_fdec[i4]];
-        pixv = &h->mb.pic.p_fdec[2][block_idx_xy_fdec[i4]];
-    }
-    else
-    {
-        pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
-        pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
-    }
-
-    h->mb.b_skip_mc = 1;
-
-    if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
-        x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp );
-    pmx = m->mvp[0];
-    pmy = m->mvp[1];
-    p_cost_mvx = m->p_cost_mv - pmx;
-    p_cost_mvy = m->p_cost_mv - pmy;
-    COST_MV_SATD( bmx, bmy, bsatd, 0 );
-    if( m->i_pixel != PIXEL_16x16 )
-        COST_MV_RD( bmx, bmy, 0, 0, 0 )
-    else
-        bcost = m->cost;
-
-    /* check the predicted mv */
-    if( (bmx != pmx || bmy != pmy)
-        && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
-        && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
-    {
-        COST_MV_SATD( pmx, pmy, satd, 0 );
-        COST_MV_RD  ( pmx, pmy, satd, 0, 0 );
-        /* The hex motion search is guaranteed to not repeat the center candidate,
-         * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
-        if( bmx == pmx && bmy == pmy )
-        {
-            pmx = m->mv[0];
-            pmy = m->mv[1];
-        }
-    }
-
-    if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 ||
-        bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 )
-    {
-        h->mb.b_skip_mc = 0;
-        return;
-    }
-
-    /* subpel hex search, same pattern as ME HEX. */
-    dir = -2;
-    omx = bmx;
-    omy = bmy;
-    for( int j = 0; j < 6; j++ )
-    {
-        COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 );
-        COST_MV_RD  ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j );
-    }
-
-    if( dir != -2 )
-    {
-        /* half hexagon, not overlapping the previous iteration */
-        for( int i = 1; i < 10; i++ )
-        {
-            const int odir = mod6m1[dir+1];
-            if( bmy < h->mb.mv_min_spel[1] + 3 ||
-                bmy > h->mb.mv_max_spel[1] - 3 )
-                break;
-            dir = -2;
-            omx = bmx;
-            omy = bmy;
-            for( int j = 0; j < 3; j++ )
-            {
-                COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 );
-                COST_MV_RD  ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j );
-            }
-            if( dir == -2 )
-                break;
-        }
-    }
-
-    /* square refine, same pattern as ME HEX. */
-    omx = bmx;
-    omy = bmy;
-    for( int i = 0; i < 8; i++ )
-    {
-        COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 );
-        COST_MV_RD  ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 );
-    }
-
-    m->cost = bcost;
-    m->mv[0] = bmx;
-    m->mv[1] = bmy;
-    x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
-    amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),66), X264_MIN(abs(bmy - m->mvp[1]),66) );
-    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
-    h->mb.b_skip_mc = 0;
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/me.h b/android/src/main/libenc/jni/libx264/encoder/me.h
deleted file mode 100755
index 06a5427..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/me.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*****************************************************************************
- * me.h: motion estimation
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ME_H
-#define X264_ME_H
-
-#define COST_MAX (1<<28)
-#define COST_MAX64 (1ULL<<60)
-
-typedef struct
-{
-    /* aligning the first member is a gcc hack to force the struct to be
-     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
-    /* input */
-    ALIGNED_16( int i_pixel );   /* PIXEL_WxH */
-    uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
-    int      i_ref_cost;
-    int      i_ref;
-    const x264_weight_t *weight;
-
-    pixel *p_fref[12];
-    pixel *p_fref_w;
-    pixel *p_fenc[3];
-    uint16_t *integral;
-    int      i_stride[3];
-
-    ALIGNED_4( int16_t mvp[2] );
-
-    /* output */
-    int cost_mv;        /* lambda * nbits for the chosen mv */
-    int cost;           /* satd + lambda * nbits */
-    ALIGNED_4( int16_t mv[2] );
-} ALIGNED_16( x264_me_t );
-
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
-#define x264_me_search( h, m, mvc, i_mvc )\
-    x264_me_search_ref( h, m, mvc, i_mvc, NULL )
-
-void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
-void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
-void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
-void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
-void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
-uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
-
-extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4];
-
-#define COPY1_IF_LT(x,y)\
-if((y)<(x))\
-    (x)=(y);
-
-#define COPY2_IF_LT(x,y,a,b)\
-if((y)<(x))\
-{\
-    (x)=(y);\
-    (a)=(b);\
-}
-
-#define COPY3_IF_LT(x,y,a,b,c,d)\
-if((y)<(x))\
-{\
-    (x)=(y);\
-    (a)=(b);\
-    (c)=(d);\
-}
-
-#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
-if((y)<(x))\
-{\
-    (x)=(y);\
-    (a)=(b);\
-    (c)=(d);\
-    (e)=(f);\
-}
-
-#define COPY2_IF_GT(x,y,a,b)\
-if((y)>(x))\
-{\
-    (x)=(y);\
-    (a)=(b);\
-}
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/encoder/ratecontrol.c b/android/src/main/libenc/jni/libx264/encoder/ratecontrol.c
deleted file mode 100755
index 301ba0f..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/ratecontrol.c
+++ /dev/null
@@ -1,3104 +0,0 @@
-/*****************************************************************************
- * ratecontrol.c: ratecontrol
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Michael Niedermayer <michaelni@gmx.at>
- *          Gabriel Bouvigne <gabriel.bouvigne@joost.com>
- *          Fiona Glaser <fiona@x264.com>
- *          M�ns Rullg�rd <mru@mru.ath.cx>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#undef NDEBUG // always check asserts, the speed effect is far too small to disable them
-
-#include "common/common.h"
-#include "ratecontrol.h"
-#include "me.h"
-
-typedef struct
-{
-    int pict_type;
-    int frame_type;
-    int kept_as_ref;
-    double qscale;
-    int mv_bits;
-    int tex_bits;
-    int misc_bits;
-    double expected_bits; /* total expected bits up to the current frame (current one excluded) */
-    double expected_vbv;
-    double new_qscale;
-    float new_qp;
-    int i_count;
-    int p_count;
-    int s_count;
-    float blurred_complexity;
-    char direct_mode;
-    int16_t weight[3][2];
-    int16_t i_weight_denom[2];
-    int refcount[16];
-    int refs;
-    int64_t i_duration;
-    int64_t i_cpb_duration;
-    int out_num;
-} ratecontrol_entry_t;
-
-typedef struct
-{
-    float coeff_min;
-    float coeff;
-    float count;
-    float decay;
-    float offset;
-} predictor_t;
-
-struct x264_ratecontrol_t
-{
-    /* constants */
-    int b_abr;
-    int b_2pass;
-    int b_vbv;
-    int b_vbv_min_rate;
-    double fps;
-    double bitrate;
-    double rate_tolerance;
-    double qcompress;
-    int nmb;                    /* number of macroblocks in a frame */
-    int qp_constant[3];
-
-    /* current frame */
-    ratecontrol_entry_t *rce;
-    float qpm;                  /* qp for current macroblock: precise float for AQ */
-    float qpa_rc;               /* average of macroblocks' qp before aq */
-    float qpa_rc_prev;
-    int   qpa_aq;               /* average of macroblocks' qp after aq */
-    int   qpa_aq_prev;
-    float qp_novbv;             /* QP for the current frame if 1-pass VBV was disabled. */
-
-    /* VBV stuff */
-    double buffer_size;
-    int64_t buffer_fill_final;
-    int64_t buffer_fill_final_min;
-    double buffer_fill;         /* planned buffer, if all in-progress frames hit their bit budget */
-    double buffer_rate;         /* # of bits added to buffer_fill after each frame */
-    double vbv_max_rate;        /* # of bits added to buffer_fill per second */
-    predictor_t *pred;          /* predict frame size from satd */
-    int single_frame_vbv;
-    float rate_factor_max_increment; /* Don't allow RF above (CRF + this value). */
-
-    /* ABR stuff */
-    int    last_satd;
-    double last_rceq;
-    double cplxr_sum;           /* sum of bits*qscale/rceq */
-    double expected_bits_sum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
-    int64_t filler_bits_sum;    /* sum in bits of finished frames' filler data */
-    double wanted_bits_window;  /* target bitrate * window */
-    double cbr_decay;
-    double short_term_cplxsum;
-    double short_term_cplxcount;
-    double rate_factor_constant;
-    double ip_offset;
-    double pb_offset;
-
-    /* 2pass stuff */
-    FILE *p_stat_file_out;
-    char *psz_stat_file_tmpname;
-    FILE *p_mbtree_stat_file_out;
-    char *psz_mbtree_stat_file_tmpname;
-    char *psz_mbtree_stat_file_name;
-    FILE *p_mbtree_stat_file_in;
-
-    int num_entries;            /* number of ratecontrol_entry_ts */
-    ratecontrol_entry_t *entry; /* FIXME: copy needed data and free this once init is done */
-    ratecontrol_entry_t **entry_out;
-    double last_qscale;
-    double last_qscale_for[3];  /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */
-    int last_non_b_pict_type;
-    double accum_p_qp;          /* for determining I-frame quant */
-    double accum_p_norm;
-    double last_accum_p_norm;
-    double lmin[3];             /* min qscale by frame type */
-    double lmax[3];
-    double lstep;               /* max change (multiply) in qscale per frame */
-    struct
-    {
-        uint16_t *qp_buffer[2]; /* Global buffers for converting MB-tree quantizer data. */
-        int qpbuf_pos;          /* In order to handle pyramid reordering, QP buffer acts as a stack.
-                                 * This value is the current position (0 or 1). */
-        int src_mb_count;
-
-        /* For rescaling */
-        int rescale_enabled;
-        float *scale_buffer[2]; /* Intermediate buffers */
-        int filtersize[2];      /* filter size (H/V) */
-        float *coeffs[2];
-        int *pos[2];
-        int srcdim[2];          /* Source dimensions (W/H) */
-    } mbtree;
-
-    /* MBRC stuff */
-    float frame_size_estimated; /* Access to this variable must be atomic: double is
-                                 * not atomic on all arches we care about */
-    double frame_size_maximum;  /* Maximum frame size due to MinCR */
-    double frame_size_planned;
-    double slice_size_planned;
-    predictor_t *row_pred;
-    predictor_t row_preds[3][2];
-    predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
-    int bframes;                /* # consecutive B-frames before this P-frame */
-    int bframe_bits;            /* total cost of those frames */
-
-    int i_zones;
-    x264_zone_t *zones;
-    x264_zone_t *prev_zone;
-
-    /* hrd stuff */
-    int initial_cpb_removal_delay;
-    int initial_cpb_removal_delay_offset;
-    double nrt_first_access_unit; /* nominal removal time */
-    double previous_cpb_final_arrival_time;
-    uint64_t hrd_multiply_denom;
-};
-
-
-static int parse_zones( x264_t *h );
-static int init_pass2(x264_t *);
-static float rate_estimate_qscale( x264_t *h );
-static int update_vbv( x264_t *h, int bits );
-static void update_vbv_plan( x264_t *h, int overhead );
-static float predict_size( predictor_t *p, float q, float var );
-static void update_predictor( predictor_t *p, float q, float var, float bits );
-
-#define CMP_OPT_FIRST_PASS( opt, param_val )\
-{\
-    if( ( p = strstr( opts, opt "=" ) ) && sscanf( p, opt "=%d" , &i ) && param_val != i )\
-    {\
-        x264_log( h, X264_LOG_ERROR, "different " opt " setting than first pass (%d vs %d)\n", param_val, i );\
-        return -1;\
-    }\
-}
-
-/* Terminology:
- * qp = h.264's quantizer
- * qscale = linearized quantizer = Lagrange multiplier
- */
-static inline float qp2qscale( float qp )
-{
-    return 0.85f * powf( 2.0f, ( qp - (12.0f + QP_BD_OFFSET) ) / 6.0f );
-}
-static inline float qscale2qp( float qscale )
-{
-    return (12.0f + QP_BD_OFFSET) + 6.0f * log2f( qscale/0.85f );
-}
-
-/* Texture bitrate is not quite inversely proportional to qscale,
- * probably due the the changing number of SKIP blocks.
- * MV bits level off at about qp<=12, because the lambda used
- * for motion estimation is constant there. */
-static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale )
-{
-    if( qscale<0.1 )
-        qscale = 0.1;
-    return (rce->tex_bits + .1) * pow( rce->qscale / qscale, 1.1 )
-           + rce->mv_bits * pow( X264_MAX(rce->qscale, 1) / X264_MAX(qscale, 1), 0.5 )
-           + rce->misc_bits;
-}
-
-static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i, int b_store )
-{
-    uint32_t sum = sum_ssd;
-    uint32_t ssd = sum_ssd >> 32;
-    if( b_store )
-    {
-        frame->i_pixel_sum[i] += sum;
-        frame->i_pixel_ssd[i] += ssd;
-    }
-    return ssd - ((uint64_t)sum * sum >> shift);
-}
-
-static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store )
-{
-    int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
-    int stride = frame->i_stride[i];
-    int offset = b_field
-        ? 16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride
-        : 16 * mb_x + height * mb_y * stride;
-    stride <<= b_field;
-    if( b_chroma )
-    {
-        ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
-        int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
-        int shift = 7 - CHROMA_V_SHIFT;
-
-        h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height );
-        return ac_energy_var( h->pixf.var[chromapix]( pix,               FENC_STRIDE ), shift, frame, 1, b_store )
-             + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store );
-    }
-    else
-        return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store );
-}
-
-// Find the total AC energy of the block in all planes.
-static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
-{
-    /* This function contains annoying hacks because GCC has a habit of reordering emms
-     * and putting it after floating point ops.  As a result, we put the emms at the end of the
-     * function and make sure that its always called before the float math.  Noinline makes
-     * sure no reordering goes on. */
-    uint32_t var;
-    x264_prefetch_fenc( h, frame, mb_x, mb_y );
-    if( h->mb.b_adaptive_mbaff )
-    {
-        /* We don't know the super-MB mode we're going to pick yet, so
-         * simply try both and pick the lower of the two. */
-        uint32_t var_interlaced, var_progressive;
-        var_interlaced   = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 1, 1 );
-        var_progressive  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 0, 0 );
-        if( CHROMA444 )
-        {
-            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 1, 1 );
-            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 0, 0 );
-            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 1, 1 );
-            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 0, 0 );
-        }
-        else
-        {
-            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 1, 1 );
-            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 0, 0 );
-        }
-        var = X264_MIN( var_interlaced, var_progressive );
-    }
-    else
-    {
-        var  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, PARAM_INTERLACED, 1 );
-        if( CHROMA444 )
-        {
-            var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, PARAM_INTERLACED, 1 );
-            var += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, PARAM_INTERLACED, 1 );
-        }
-        else
-            var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, PARAM_INTERLACED, 1 );
-    }
-    x264_emms();
-    return var;
-}
-
-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets )
-{
-    /* Initialize frame stats */
-    for( int i = 0; i < 3; i++ )
-    {
-        frame->i_pixel_sum[i] = 0;
-        frame->i_pixel_ssd[i] = 0;
-    }
-
-    /* Degenerate cases */
-    if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 )
-    {
-        /* Need to init it anyways for MB tree */
-        if( h->param.rc.i_aq_mode && h->param.rc.f_aq_strength == 0 )
-        {
-            if( quant_offsets )
-            {
-                for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
-                    frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy];
-                if( h->frames.b_have_lowres )
-                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
-                        frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] );
-            }
-            else
-            {
-                memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
-                memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
-                if( h->frames.b_have_lowres )
-                    for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
-                        frame->i_inv_qscale_factor[mb_xy] = 256;
-            }
-        }
-        /* Need variance data for weighted prediction */
-        if( h->param.analyse.i_weighted_pred )
-        {
-            for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
-                for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
-                    x264_ac_energy_mb( h, mb_x, mb_y, frame );
-        }
-        else
-            return;
-    }
-    /* Actual adaptive quantization */
-    else
-    {
-        /* constants chosen to result in approximately the same overall bitrate as without AQ.
-         * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
-        float strength;
-        float avg_adj = 0.f;
-        float bias_strength = 0.f;
-
-        if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE || h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED )
-        {
-            float bit_depth_correction = 1.f / (1 << (2*(BIT_DEPTH-8)));
-            float avg_adj_pow2 = 0.f;
-            for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
-                for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
-                {
-                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
-                    float qp_adj = powf( energy * bit_depth_correction + 1, 0.125f );
-                    frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
-                    avg_adj += qp_adj;
-                    avg_adj_pow2 += qp_adj * qp_adj;
-                }
-            avg_adj /= h->mb.i_mb_count;
-            avg_adj_pow2 /= h->mb.i_mb_count;
-            strength = h->param.rc.f_aq_strength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
-            bias_strength = h->param.rc.f_aq_strength;
-        }
-        else
-            strength = h->param.rc.f_aq_strength * 1.0397f;
-
-        for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
-            for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
-            {
-                float qp_adj;
-                int mb_xy = mb_x + mb_y*h->mb.i_mb_stride;
-                if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED )
-                {
-                    qp_adj = frame->f_qp_offset[mb_xy];
-                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 14.f / (qp_adj * qp_adj));
-                }
-                else if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
-                {
-                    qp_adj = frame->f_qp_offset[mb_xy];
-                    qp_adj = strength * (qp_adj - avg_adj);
-                }
-                else
-                {
-                    uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
-                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - (14.427f + 2*(BIT_DEPTH-8)));
-                }
-                if( quant_offsets )
-                    qp_adj += quant_offsets[mb_xy];
-                frame->f_qp_offset[mb_xy] =
-                frame->f_qp_offset_aq[mb_xy] = qp_adj;
-                if( h->frames.b_have_lowres )
-                    frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj);
-            }
-    }
-
-    /* Remove mean from SSD calculation */
-    for( int i = 0; i < 3; i++ )
-    {
-        uint64_t ssd = frame->i_pixel_ssd[i];
-        uint64_t sum = frame->i_pixel_sum[i];
-        int width  = 16*h->mb.i_mb_width  >> (i && CHROMA_H_SHIFT);
-        int height = 16*h->mb.i_mb_height >> (i && CHROMA_V_SHIFT);
-        frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height);
-    }
-}
-
-static int x264_macroblock_tree_rescale_init( x264_t *h, x264_ratecontrol_t *rc )
-{
-    /* Use fractional QP array dimensions to compensate for edge padding */
-    float srcdim[2] = {rc->mbtree.srcdim[0] / 16.f, rc->mbtree.srcdim[1] / 16.f};
-    float dstdim[2] = {    h->param.i_width / 16.f,    h->param.i_height / 16.f};
-    int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])};
-    int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])};
-    if( PARAM_INTERLACED )
-    {
-        srcdimi[1] = (srcdimi[1]+1)&~1;
-        dstdimi[1] = (dstdimi[1]+1)&~1;
-    }
-
-    rc->mbtree.src_mb_count = srcdimi[0] * srcdimi[1];
-
-    CHECKED_MALLOC( rc->mbtree.qp_buffer[0], rc->mbtree.src_mb_count * sizeof(uint16_t) );
-    if( h->param.i_bframe_pyramid && h->param.rc.b_stat_read )
-        CHECKED_MALLOC( rc->mbtree.qp_buffer[1], rc->mbtree.src_mb_count * sizeof(uint16_t) );
-    rc->mbtree.qpbuf_pos = -1;
-
-    /* No rescaling to do */
-    if( srcdimi[0] == dstdimi[0] && srcdimi[1] == dstdimi[1] )
-        return 0;
-
-    rc->mbtree.rescale_enabled = 1;
-
-    /* Allocate intermediate scaling buffers */
-    CHECKED_MALLOC( rc->mbtree.scale_buffer[0], srcdimi[0] * srcdimi[1] * sizeof(float) );
-    CHECKED_MALLOC( rc->mbtree.scale_buffer[1], dstdimi[0] * srcdimi[1] * sizeof(float) );
-
-    /* Allocate and calculate resize filter parameters and coefficients */
-    for( int i = 0; i < 2; i++ )
-    {
-        if( srcdim[i] > dstdim[i] ) // downscale
-            rc->mbtree.filtersize[i] = 1 + (2 * srcdimi[i] + dstdimi[i] - 1) / dstdimi[i];
-        else                        // upscale
-            rc->mbtree.filtersize[i] = 3;
-
-        CHECKED_MALLOC( rc->mbtree.coeffs[i], rc->mbtree.filtersize[i] * dstdimi[i] * sizeof(float) );
-        CHECKED_MALLOC( rc->mbtree.pos[i], dstdimi[i] * sizeof(int) );
-
-        /* Initialize filter coefficients */
-        float inc = srcdim[i] / dstdim[i];
-        float dmul = inc > 1.f ? dstdim[i] / srcdim[i] : 1.f;
-        float dstinsrc = 0.5f * inc - 0.5f;
-        int filtersize = rc->mbtree.filtersize[i];
-        for( int j = 0; j < dstdimi[i]; j++ )
-        {
-            int pos = dstinsrc - (filtersize - 2.f) * 0.5f;
-            float sum = 0.0;
-            rc->mbtree.pos[i][j] = pos;
-            for( int k = 0; k < filtersize; k++ )
-            {
-                float d = fabs( pos + k - dstinsrc ) * dmul;
-                float coeff = X264_MAX( 1.f - d, 0 );
-                rc->mbtree.coeffs[i][j * filtersize + k] = coeff;
-                sum += coeff;
-            }
-            sum = 1.0f / sum;
-            for( int k = 0; k < filtersize; k++ )
-                rc->mbtree.coeffs[i][j * filtersize + k] *= sum;
-            dstinsrc += inc;
-        }
-    }
-
-    /* Write back actual qp array dimensions */
-    rc->mbtree.srcdim[0] = srcdimi[0];
-    rc->mbtree.srcdim[1] = srcdimi[1];
-    return 0;
-fail:
-    return -1;
-}
-
-static void x264_macroblock_tree_rescale_destroy( x264_ratecontrol_t *rc )
-{
-    for( int i = 0; i < 2; i++ )
-    {
-        x264_free( rc->mbtree.qp_buffer[i] );
-        x264_free( rc->mbtree.scale_buffer[i] );
-        x264_free( rc->mbtree.coeffs[i] );
-        x264_free( rc->mbtree.pos[i] );
-    }
-}
-
-static ALWAYS_INLINE float tapfilter( float *src, int pos, int max, int stride, float *coeff, int filtersize )
-{
-    float sum = 0.f;
-    for( int i = 0; i < filtersize; i++, pos++ )
-        sum += src[x264_clip3( pos, 0, max-1 )*stride] * coeff[i];
-    return sum;
-}
-
-static void x264_macroblock_tree_rescale( x264_t *h, x264_ratecontrol_t *rc, float *dst )
-{
-    float *input, *output;
-    int filtersize, stride, height;
-
-    /* H scale first */
-    input = rc->mbtree.scale_buffer[0];
-    output = rc->mbtree.scale_buffer[1];
-    filtersize = rc->mbtree.filtersize[0];
-    stride = rc->mbtree.srcdim[0];
-    height = rc->mbtree.srcdim[1];
-    for( int y = 0; y < height; y++, input += stride, output += h->mb.i_mb_width )
-    {
-        float *coeff = rc->mbtree.coeffs[0];
-        for( int x = 0; x < h->mb.i_mb_width; x++, coeff+=filtersize )
-            output[x] = tapfilter( input, rc->mbtree.pos[0][x], stride, 1, coeff, filtersize );
-    }
-
-    /* V scale next */
-    input = rc->mbtree.scale_buffer[1];
-    output = dst;
-    filtersize = rc->mbtree.filtersize[1];
-    stride = h->mb.i_mb_width;
-    height = rc->mbtree.srcdim[1];
-    for( int x = 0; x < h->mb.i_mb_width; x++, input++, output++ )
-    {
-        float *coeff = rc->mbtree.coeffs[1];
-        for( int y = 0; y < h->mb.i_mb_height; y++, coeff+=filtersize )
-            output[y*stride] = tapfilter( input, rc->mbtree.pos[1][y], height, stride, coeff, filtersize );
-    }
-}
-
-int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
-
-    if( rc->entry[frame->i_frame].kept_as_ref )
-    {
-        uint8_t i_type;
-        if( rc->mbtree.qpbuf_pos < 0 )
-        {
-            do
-            {
-                rc->mbtree.qpbuf_pos++;
-
-                if( !fread( &i_type, 1, 1, rc->p_mbtree_stat_file_in ) )
-                    goto fail;
-                if( fread( rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], sizeof(uint16_t), rc->mbtree.src_mb_count, rc->p_mbtree_stat_file_in ) != rc->mbtree.src_mb_count )
-                    goto fail;
-
-                if( i_type != i_type_actual && rc->mbtree.qpbuf_pos == 1 )
-                {
-                    x264_log( h, X264_LOG_ERROR, "MB-tree frametype %d doesn't match actual frametype %d.\n", i_type, i_type_actual );
-                    return -1;
-                }
-            } while( i_type != i_type_actual );
-        }
-
-        float *dst = rc->mbtree.rescale_enabled ? rc->mbtree.scale_buffer[0] : frame->f_qp_offset;
-        h->mc.mbtree_fix8_unpack( dst, rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], rc->mbtree.src_mb_count );
-        if( rc->mbtree.rescale_enabled )
-            x264_macroblock_tree_rescale( h, rc, frame->f_qp_offset );
-        if( h->frames.b_have_lowres )
-            for( int i = 0; i < h->mb.i_mb_count; i++ )
-                frame->i_inv_qscale_factor[i] = x264_exp2fix8( frame->f_qp_offset[i] );
-        rc->mbtree.qpbuf_pos--;
-    }
-    else
-        x264_stack_align( x264_adaptive_quant_frame, h, frame, quant_offsets );
-    return 0;
-fail:
-    x264_log( h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n" );
-    return -1;
-}
-
-int x264_reference_build_list_optimal( x264_t *h )
-{
-    ratecontrol_entry_t *rce = h->rc->rce;
-    x264_frame_t *frames[16];
-    x264_weight_t weights[16][3];
-    int refcount[16];
-
-    if( rce->refs != h->i_ref[0] )
-        return -1;
-
-    memcpy( frames, h->fref[0], sizeof(frames) );
-    memcpy( refcount, rce->refcount, sizeof(refcount) );
-    memcpy( weights, h->fenc->weight, sizeof(weights) );
-    memset( &h->fenc->weight[1][0], 0, sizeof(x264_weight_t[15][3]) );
-
-    /* For now don't reorder ref 0; it seems to lower quality
-       in most cases due to skips. */
-    for( int ref = 1; ref < h->i_ref[0]; ref++ )
-    {
-        int max = -1;
-        int bestref = 1;
-
-        for( int i = 1; i < h->i_ref[0]; i++ )
-            /* Favor lower POC as a tiebreaker. */
-            COPY2_IF_GT( max, refcount[i], bestref, i );
-
-        /* FIXME: If there are duplicates from frames other than ref0 then it is possible
-         * that the optimal ordering doesnt place every duplicate. */
-
-        refcount[bestref] = -1;
-        h->fref[0][ref] = frames[bestref];
-        memcpy( h->fenc->weight[ref], weights[bestref], sizeof(weights[bestref]) );
-    }
-
-    return 0;
-}
-
-static char *x264_strcat_filename( char *input, char *suffix )
-{
-    char *output = x264_malloc( strlen( input ) + strlen( suffix ) + 1 );
-    if( !output )
-        return NULL;
-    strcpy( output, input );
-    strcat( output, suffix );
-    return output;
-}
-
-void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    if( !b_init && rc->b_2pass )
-        return;
-
-    if( h->param.rc.i_rc_method == X264_RC_CRF )
-    {
-        /* Arbitrary rescaling to make CRF somewhat similar to QP.
-         * Try to compensate for MB-tree's effects as well. */
-        double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
-        rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
-                                 / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset + QP_BD_OFFSET );
-    }
-
-    if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 )
-    {
-        /* We don't support changing the ABR bitrate right now,
-           so if the stream starts as CBR, keep it CBR. */
-        if( rc->b_vbv_min_rate )
-            h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
-
-        if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
-        {
-            h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
-            x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
-                      h->param.rc.i_vbv_buffer_size );
-        }
-
-        int kilobit_size = h->param.i_avcintra_class ? 1024 : 1000;
-        int vbv_buffer_size = h->param.rc.i_vbv_buffer_size * kilobit_size;
-        int vbv_max_bitrate = h->param.rc.i_vbv_max_bitrate * kilobit_size;
-
-        /* Init HRD */
-        if( h->param.i_nal_hrd && b_init )
-        {
-            h->sps->vui.hrd.i_cpb_cnt = 1;
-            h->sps->vui.hrd.b_cbr_hrd = h->param.i_nal_hrd == X264_NAL_HRD_CBR;
-            h->sps->vui.hrd.i_time_offset_length = 0;
-
-            #define BR_SHIFT  6
-            #define CPB_SHIFT 4
-
-            // normalize HRD size and rate to the value / scale notation
-            h->sps->vui.hrd.i_bit_rate_scale = x264_clip3( x264_ctz( vbv_max_bitrate ) - BR_SHIFT, 0, 15 );
-            h->sps->vui.hrd.i_bit_rate_value = vbv_max_bitrate >> ( h->sps->vui.hrd.i_bit_rate_scale + BR_SHIFT );
-            h->sps->vui.hrd.i_bit_rate_unscaled = h->sps->vui.hrd.i_bit_rate_value << ( h->sps->vui.hrd.i_bit_rate_scale + BR_SHIFT );
-            h->sps->vui.hrd.i_cpb_size_scale = x264_clip3( x264_ctz( vbv_buffer_size ) - CPB_SHIFT, 0, 15 );
-            h->sps->vui.hrd.i_cpb_size_value = vbv_buffer_size >> ( h->sps->vui.hrd.i_cpb_size_scale + CPB_SHIFT );
-            h->sps->vui.hrd.i_cpb_size_unscaled = h->sps->vui.hrd.i_cpb_size_value << ( h->sps->vui.hrd.i_cpb_size_scale + CPB_SHIFT );
-
-            #undef CPB_SHIFT
-            #undef BR_SHIFT
-
-            // arbitrary
-            #define MAX_DURATION 0.5
-
-            int max_cpb_output_delay = X264_MIN( h->param.i_keyint_max * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick, INT_MAX );
-            int max_dpb_output_delay = h->sps->vui.i_max_dec_frame_buffering * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick;
-            int max_delay = (int)(90000.0 * (double)h->sps->vui.hrd.i_cpb_size_unscaled / h->sps->vui.hrd.i_bit_rate_unscaled + 0.5);
-
-            h->sps->vui.hrd.i_initial_cpb_removal_delay_length = 2 + x264_clip3( 32 - x264_clz( max_delay ), 4, 22 );
-            h->sps->vui.hrd.i_cpb_removal_delay_length = x264_clip3( 32 - x264_clz( max_cpb_output_delay ), 4, 31 );
-            h->sps->vui.hrd.i_dpb_output_delay_length  = x264_clip3( 32 - x264_clz( max_dpb_output_delay ), 4, 31 );
-
-            #undef MAX_DURATION
-
-            vbv_buffer_size = h->sps->vui.hrd.i_cpb_size_unscaled;
-            vbv_max_bitrate = h->sps->vui.hrd.i_bit_rate_unscaled;
-        }
-        else if( h->param.i_nal_hrd && !b_init )
-        {
-            x264_log( h, X264_LOG_WARNING, "VBV parameters cannot be changed when NAL HRD is in use\n" );
-            return;
-        }
-        h->sps->vui.hrd.i_bit_rate_unscaled = vbv_max_bitrate;
-        h->sps->vui.hrd.i_cpb_size_unscaled = vbv_buffer_size;
-
-        if( rc->b_vbv_min_rate )
-            rc->bitrate = (double)h->param.rc.i_bitrate * kilobit_size;
-        rc->buffer_rate = vbv_max_bitrate / rc->fps;
-        rc->vbv_max_rate = vbv_max_bitrate;
-        rc->buffer_size = vbv_buffer_size;
-        rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
-        rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
-                      * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
-        if( h->param.rc.i_rc_method == X264_RC_CRF && h->param.rc.f_rf_constant_max )
-        {
-            rc->rate_factor_max_increment = h->param.rc.f_rf_constant_max - h->param.rc.f_rf_constant;
-            if( rc->rate_factor_max_increment <= 0 )
-            {
-                x264_log( h, X264_LOG_WARNING, "CRF max must be greater than CRF\n" );
-                rc->rate_factor_max_increment = 0;
-            }
-        }
-        if( b_init )
-        {
-            if( h->param.rc.f_vbv_buffer_init > 1. )
-                h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
-            h->param.rc.f_vbv_buffer_init = x264_clip3f( X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ), 0, 1);
-            rc->buffer_fill_final =
-            rc->buffer_fill_final_min = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale;
-            rc->b_vbv = 1;
-            rc->b_vbv_min_rate = !rc->b_2pass
-                          && h->param.rc.i_rc_method == X264_RC_ABR
-                          && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
-        }
-    }
-}
-
-int x264_ratecontrol_new( x264_t *h )
-{
-    x264_ratecontrol_t *rc;
-
-    x264_emms();
-
-    CHECKED_MALLOCZERO( h->rc, h->param.i_threads * sizeof(x264_ratecontrol_t) );
-    rc = h->rc;
-
-    rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
-    rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
-
-    /* FIXME: use integers */
-    if( h->param.i_fps_num > 0 && h->param.i_fps_den > 0 )
-        rc->fps = (float) h->param.i_fps_num / h->param.i_fps_den;
-    else
-        rc->fps = 25.0;
-
-    if( h->param.rc.b_mb_tree )
-    {
-        h->param.rc.f_pb_factor = 1;
-        rc->qcompress = 1;
-    }
-    else
-        rc->qcompress = h->param.rc.f_qcompress;
-
-    rc->bitrate = h->param.rc.i_bitrate * (h->param.i_avcintra_class ? 1024. : 1000.);
-    rc->rate_tolerance = h->param.rc.f_rate_tolerance;
-    rc->nmb = h->mb.i_mb_count;
-    rc->last_non_b_pict_type = -1;
-    rc->cbr_decay = 1.0;
-
-    if( h->param.rc.i_rc_method == X264_RC_CRF && h->param.rc.b_stat_read )
-    {
-        x264_log( h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n" );
-        return -1;
-    }
-
-    x264_ratecontrol_init_reconfigurable( h, 1 );
-
-    if( h->param.i_nal_hrd )
-    {
-        uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale;
-        uint64_t num = 90000;
-        x264_reduce_fraction64( &num, &denom );
-        rc->hrd_multiply_denom = 90000 / num;
-
-        double bits_required = log2( num )
-                             + log2( h->sps->vui.i_time_scale )
-                             + log2( h->sps->vui.hrd.i_cpb_size_unscaled );
-        if( bits_required >= 63 )
-        {
-            x264_log( h, X264_LOG_ERROR, "HRD with very large timescale and bufsize not supported\n" );
-            return -1;
-        }
-    }
-
-    if( rc->rate_tolerance < 0.01 )
-    {
-        x264_log( h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n" );
-        rc->rate_tolerance = 0.01;
-    }
-
-    h->mb.b_variable_qp = rc->b_vbv || h->param.rc.i_aq_mode;
-
-    if( rc->b_abr )
-    {
-        /* FIXME ABR_INIT_QP is actually used only in CRF */
-#define ABR_INIT_QP (( h->param.rc.i_rc_method == X264_RC_CRF ? h->param.rc.f_rf_constant : 24 ) + QP_BD_OFFSET)
-        rc->accum_p_norm = .01;
-        rc->accum_p_qp = ABR_INIT_QP * rc->accum_p_norm;
-        /* estimated ratio that produces a reasonable QP for the first I-frame */
-        rc->cplxr_sum = .01 * pow( 7.0e5, rc->qcompress ) * pow( h->mb.i_mb_count, 0.5 );
-        rc->wanted_bits_window = 1.0 * rc->bitrate / rc->fps;
-        rc->last_non_b_pict_type = SLICE_TYPE_I;
-    }
-
-    rc->ip_offset = 6.0 * log2f( h->param.rc.f_ip_factor );
-    rc->pb_offset = 6.0 * log2f( h->param.rc.f_pb_factor );
-    rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
-    rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, QP_MAX );
-    rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, QP_MAX );
-    h->mb.ip_offset = rc->ip_offset + 0.5;
-
-    rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
-    rc->last_qscale = qp2qscale( 26 + QP_BD_OFFSET );
-    int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
-    CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
-    CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
-    static const float pred_coeff_table[3] = { 1.0, 1.0, 1.5 };
-    for( int i = 0; i < 3; i++ )
-    {
-        rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
-        rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
-        rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
-        for( int j = 0; j < num_preds; j++ )
-        {
-            rc->pred[i+j*5].coeff_min = pred_coeff_table[i] / 2;
-            rc->pred[i+j*5].coeff = pred_coeff_table[i];
-            rc->pred[i+j*5].count = 1.0;
-            rc->pred[i+j*5].decay = 0.5;
-            rc->pred[i+j*5].offset = 0.0;
-        }
-        for( int j = 0; j < 2; j++ )
-        {
-            rc->row_preds[i][j].coeff_min = .25 / 4;
-            rc->row_preds[i][j].coeff = .25;
-            rc->row_preds[i][j].count = 1.0;
-            rc->row_preds[i][j].decay = 0.5;
-            rc->row_preds[i][j].offset = 0.0;
-        }
-    }
-    rc->pred_b_from_p->coeff_min = 0.5 / 2;
-    rc->pred_b_from_p->coeff = 0.5;
-    rc->pred_b_from_p->count = 1.0;
-    rc->pred_b_from_p->decay = 0.5;
-    rc->pred_b_from_p->offset = 0.0;
-
-    if( parse_zones( h ) < 0 )
-    {
-        x264_log( h, X264_LOG_ERROR, "failed to parse zones\n" );
-        return -1;
-    }
-
-    /* Load stat file and init 2pass algo */
-    if( h->param.rc.b_stat_read )
-    {
-        char *p, *stats_in, *stats_buf;
-
-        /* read 1st pass stats */
-        assert( h->param.rc.psz_stat_in );
-        stats_buf = stats_in = x264_slurp_file( h->param.rc.psz_stat_in );
-        if( !stats_buf )
-        {
-            x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n" );
-            return -1;
-        }
-        if( h->param.rc.b_mb_tree )
-        {
-            char *mbtree_stats_in = x264_strcat_filename( h->param.rc.psz_stat_in, ".mbtree" );
-            if( !mbtree_stats_in )
-                return -1;
-            rc->p_mbtree_stat_file_in = x264_fopen( mbtree_stats_in, "rb" );
-            x264_free( mbtree_stats_in );
-            if( !rc->p_mbtree_stat_file_in )
-            {
-                x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n" );
-                return -1;
-            }
-        }
-
-        /* check whether 1st pass options were compatible with current options */
-        if( strncmp( stats_buf, "#options:", 9 ) )
-        {
-            x264_log( h, X264_LOG_ERROR, "options list in stats file not valid\n" );
-            return -1;
-        }
-
-        float res_factor, res_factor_bits;
-        {
-            int i, j;
-            uint32_t k, l;
-            char *opts = stats_buf;
-            stats_in = strchr( stats_buf, '\n' );
-            if( !stats_in )
-                return -1;
-            *stats_in = '\0';
-            stats_in++;
-            if( sscanf( opts, "#options: %dx%d", &i, &j ) != 2 )
-            {
-                x264_log( h, X264_LOG_ERROR, "resolution specified in stats file not valid\n" );
-                return -1;
-            }
-            else if( h->param.rc.b_mb_tree )
-            {
-                rc->mbtree.srcdim[0] = i;
-                rc->mbtree.srcdim[1] = j;
-            }
-            res_factor = (float)h->param.i_width * h->param.i_height / (i*j);
-            /* Change in bits relative to resolution isn't quite linear on typical sources,
-             * so we'll at least try to roughly approximate this effect. */
-            res_factor_bits = powf( res_factor, 0.7 );
-
-            if( !( p = strstr( opts, "timebase=" ) ) || sscanf( p, "timebase=%u/%u", &k, &l ) != 2 )
-            {
-                x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" );
-                return -1;
-            }
-            if( k != h->param.i_timebase_num || l != h->param.i_timebase_den )
-            {
-                x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n",
-                          h->param.i_timebase_num, h->param.i_timebase_den, k, l );
-                return -1;
-            }
-
-            CMP_OPT_FIRST_PASS( "bitdepth", BIT_DEPTH );
-            CMP_OPT_FIRST_PASS( "weightp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) );
-            CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe );
-            CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid );
-            CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh );
-            CMP_OPT_FIRST_PASS( "open_gop", h->param.b_open_gop );
-            CMP_OPT_FIRST_PASS( "bluray_compat", h->param.b_bluray_compat );
-
-            if( (p = strstr( opts, "interlaced=" )) )
-            {
-                char *current = h->param.b_interlaced ? h->param.b_tff ? "tff" : "bff" : h->param.b_fake_interlaced ? "fake" : "0";
-                char buf[5];
-                sscanf( p, "interlaced=%4s", buf );
-                if( strcmp( current, buf ) )
-                {
-                    x264_log( h, X264_LOG_ERROR, "different interlaced setting than first pass (%s vs %s)\n", current, buf );
-                    return -1;
-                }
-            }
-
-            if( (p = strstr( opts, "keyint=" )) )
-            {
-                p += 7;
-                char buf[13] = "infinite ";
-                if( h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE )
-                    sprintf( buf, "%d ", h->param.i_keyint_max );
-                if( strncmp( p, buf, strlen(buf) ) )
-                {
-                    x264_log( h, X264_LOG_ERROR, "different keyint setting than first pass (%.*s vs %.*s)\n",
-                              strlen(buf)-1, buf, strcspn(p, " "), p );
-                    return -1;
-                }
-            }
-
-            if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
-                x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );
-
-            if( !strstr( opts, "direct=3" ) && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
-            {
-                x264_log( h, X264_LOG_WARNING, "direct=auto not used on the first pass\n" );
-                h->mb.b_direct_auto_write = 1;
-            }
-
-            if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS )
-                h->param.i_bframe_adaptive = i;
-            else if( h->param.i_bframe )
-            {
-                x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" );
-                return -1;
-            }
-
-            if( (h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size) && ( p = strstr( opts, "rc_lookahead=" ) ) && sscanf( p, "rc_lookahead=%d", &i ) )
-                h->param.rc.i_lookahead = i;
-        }
-
-        /* find number of pics */
-        p = stats_in;
-        int num_entries;
-        for( num_entries = -1; p; num_entries++ )
-            p = strchr( p + 1, ';' );
-        if( !num_entries )
-        {
-            x264_log( h, X264_LOG_ERROR, "empty stats file\n" );
-            return -1;
-        }
-        rc->num_entries = num_entries;
-
-        if( h->param.i_frame_total < rc->num_entries && h->param.i_frame_total > 0 )
-        {
-            x264_log( h, X264_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
-                      h->param.i_frame_total, rc->num_entries );
-        }
-        if( h->param.i_frame_total > rc->num_entries )
-        {
-            x264_log( h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
-                      h->param.i_frame_total, rc->num_entries );
-            return -1;
-        }
-
-        CHECKED_MALLOCZERO( rc->entry, rc->num_entries * sizeof(ratecontrol_entry_t) );
-        CHECKED_MALLOC( rc->entry_out, rc->num_entries * sizeof(ratecontrol_entry_t*) );
-
-        /* init all to skipped p frames */
-        for( int i = 0; i < rc->num_entries; i++ )
-        {
-            ratecontrol_entry_t *rce = &rc->entry[i];
-            rce->pict_type = SLICE_TYPE_P;
-            rce->qscale = rce->new_qscale = qp2qscale( 20 + QP_BD_OFFSET );
-            rce->misc_bits = rc->nmb + 10;
-            rce->new_qp = 0;
-            rc->entry_out[i] = rce;
-        }
-
-        /* read stats */
-        p = stats_in;
-        double total_qp_aq = 0;
-        for( int i = 0; i < rc->num_entries; i++ )
-        {
-            ratecontrol_entry_t *rce;
-            int frame_number = 0;
-            int frame_out_number = 0;
-            char pict_type = 0;
-            int e;
-            char *next;
-            float qp_rc, qp_aq;
-            int ref;
-
-            next= strchr(p, ';');
-            if( next )
-                *next++ = 0; //sscanf is unbelievably slow on long strings
-            e = sscanf( p, " in:%d out:%d ", &frame_number, &frame_out_number );
-
-            if( frame_number < 0 || frame_number >= rc->num_entries )
-            {
-                x264_log( h, X264_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frame_number, i );
-                return -1;
-            }
-            if( frame_out_number < 0 || frame_out_number >= rc->num_entries )
-            {
-                x264_log( h, X264_LOG_ERROR, "bad frame output number (%d) at stats line %d\n", frame_out_number, i );
-                return -1;
-            }
-            rce = &rc->entry[frame_number];
-            rc->entry_out[frame_out_number] = rce;
-            rce->direct_mode = 0;
-
-            e += sscanf( p, " in:%*d out:%*d type:%c dur:%"SCNd64" cpbdur:%"SCNd64" q:%f aq:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
-                   &pict_type, &rce->i_duration, &rce->i_cpb_duration, &qp_rc, &qp_aq, &rce->tex_bits,
-                   &rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
-                   &rce->s_count, &rce->direct_mode );
-            rce->tex_bits  *= res_factor_bits;
-            rce->mv_bits   *= res_factor_bits;
-            rce->misc_bits *= res_factor_bits;
-            rce->i_count   *= res_factor;
-            rce->p_count   *= res_factor;
-            rce->s_count   *= res_factor;
-
-            p = strstr( p, "ref:" );
-            if( !p )
-                goto parse_error;
-            p += 4;
-            for( ref = 0; ref < 16; ref++ )
-            {
-                if( sscanf( p, " %d", &rce->refcount[ref] ) != 1 )
-                    break;
-                p = strchr( p+1, ' ' );
-                if( !p )
-                    goto parse_error;
-            }
-            rce->refs = ref;
-
-            /* find weights */
-            rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
-            char *w = strchr( p, 'w' );
-            if( w )
-            {
-                int count = sscanf( w, "w:%hd,%hd,%hd,%hd,%hd,%hd,%hd,%hd",
-                                    &rce->i_weight_denom[0], &rce->weight[0][0], &rce->weight[0][1],
-                                    &rce->i_weight_denom[1], &rce->weight[1][0], &rce->weight[1][1],
-                                    &rce->weight[2][0], &rce->weight[2][1] );
-                if( count == 3 )
-                    rce->i_weight_denom[1] = -1;
-                else if ( count != 8 )
-                    rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1;
-            }
-
-            if( pict_type != 'b' )
-                rce->kept_as_ref = 1;
-            switch( pict_type )
-            {
-                case 'I':
-                    rce->frame_type = X264_TYPE_IDR;
-                    rce->pict_type  = SLICE_TYPE_I;
-                    break;
-                case 'i':
-                    rce->frame_type = X264_TYPE_I;
-                    rce->pict_type  = SLICE_TYPE_I;
-                    break;
-                case 'P':
-                    rce->frame_type = X264_TYPE_P;
-                    rce->pict_type  = SLICE_TYPE_P;
-                    break;
-                case 'B':
-                    rce->frame_type = X264_TYPE_BREF;
-                    rce->pict_type  = SLICE_TYPE_B;
-                    break;
-                case 'b':
-                    rce->frame_type = X264_TYPE_B;
-                    rce->pict_type  = SLICE_TYPE_B;
-                    break;
-                default:  e = -1; break;
-            }
-            if( e < 14 )
-            {
-parse_error:
-                x264_log( h, X264_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e );
-                return -1;
-            }
-            rce->qscale = qp2qscale( qp_rc );
-            total_qp_aq += qp_aq;
-            p = next;
-        }
-        if( !h->param.b_stitchable )
-            h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) );
-
-        x264_free( stats_buf );
-
-        if( h->param.rc.i_rc_method == X264_RC_ABR )
-        {
-            if( init_pass2( h ) < 0 )
-                return -1;
-        } /* else we're using constant quant, so no need to run the bitrate allocation */
-    }
-
-    /* Open output file */
-    /* If input and output files are the same, output to a temp file
-     * and move it to the real name only when it's complete */
-    if( h->param.rc.b_stat_write )
-    {
-        char *p;
-        rc->psz_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".temp" );
-        if( !rc->psz_stat_file_tmpname )
-            return -1;
-
-        rc->p_stat_file_out = x264_fopen( rc->psz_stat_file_tmpname, "wb" );
-        if( rc->p_stat_file_out == NULL )
-        {
-            x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n" );
-            return -1;
-        }
-
-        p = x264_param2string( &h->param, 1 );
-        if( p )
-            fprintf( rc->p_stat_file_out, "#options: %s\n", p );
-        x264_free( p );
-        if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
-        {
-            rc->psz_mbtree_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree.temp" );
-            rc->psz_mbtree_stat_file_name = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree" );
-            if( !rc->psz_mbtree_stat_file_tmpname || !rc->psz_mbtree_stat_file_name )
-                return -1;
-
-            rc->p_mbtree_stat_file_out = x264_fopen( rc->psz_mbtree_stat_file_tmpname, "wb" );
-            if( rc->p_mbtree_stat_file_out == NULL )
-            {
-                x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n" );
-                return -1;
-            }
-        }
-    }
-
-    if( h->param.rc.b_mb_tree && (h->param.rc.b_stat_read || h->param.rc.b_stat_write) )
-    {
-        if( !h->param.rc.b_stat_read )
-        {
-            rc->mbtree.srcdim[0] = h->param.i_width;
-            rc->mbtree.srcdim[1] = h->param.i_height;
-        }
-        if( x264_macroblock_tree_rescale_init( h, rc ) < 0 )
-            return -1;
-    }
-
-    for( int i = 0; i<h->param.i_threads; i++ )
-    {
-        h->thread[i]->rc = rc+i;
-        if( i )
-        {
-            rc[i] = rc[0];
-            h->thread[i]->param = h->param;
-            h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp;
-            h->thread[i]->mb.ip_offset = h->mb.ip_offset;
-        }
-    }
-
-    return 0;
-fail:
-    return -1;
-}
-
-static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
-{
-    int len = 0;
-    char *tok, UNUSED *saveptr=NULL;
-    z->param = NULL;
-    z->f_bitrate_factor = 1;
-    if( 3 <= sscanf(p, "%d,%d,q=%d%n", &z->i_start, &z->i_end, &z->i_qp, &len) )
-        z->b_force_qp = 1;
-    else if( 3 <= sscanf(p, "%d,%d,b=%f%n", &z->i_start, &z->i_end, &z->f_bitrate_factor, &len) )
-        z->b_force_qp = 0;
-    else if( 2 <= sscanf(p, "%d,%d%n", &z->i_start, &z->i_end, &len) )
-        z->b_force_qp = 0;
-    else
-    {
-        x264_log( h, X264_LOG_ERROR, "invalid zone: \"%s\"\n", p );
-        return -1;
-    }
-    p += len;
-    if( !*p )
-        return 0;
-    CHECKED_MALLOC( z->param, sizeof(x264_param_t) );
-    memcpy( z->param, &h->param, sizeof(x264_param_t) );
-    z->param->param_free = x264_free;
-    while( (tok = strtok_r( p, ",", &saveptr )) )
-    {
-        char *val = strchr( tok, '=' );
-        if( val )
-        {
-            *val = '\0';
-            val++;
-        }
-        if( x264_param_parse( z->param, tok, val ) )
-        {
-            x264_log( h, X264_LOG_ERROR, "invalid zone param: %s = %s\n", tok, val );
-            return -1;
-        }
-        p = NULL;
-    }
-    return 0;
-fail:
-    return -1;
-}
-
-static int parse_zones( x264_t *h )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    if( h->param.rc.psz_zones && !h->param.rc.i_zones )
-    {
-        char *psz_zones, *p;
-        CHECKED_MALLOC( psz_zones, strlen( h->param.rc.psz_zones )+1 );
-        strcpy( psz_zones, h->param.rc.psz_zones );
-        h->param.rc.i_zones = 1;
-        for( p = psz_zones; *p; p++ )
-            h->param.rc.i_zones += (*p == '/');
-        CHECKED_MALLOC( h->param.rc.zones, h->param.rc.i_zones * sizeof(x264_zone_t) );
-        p = psz_zones;
-        for( int i = 0; i < h->param.rc.i_zones; i++ )
-        {
-            int i_tok = strcspn( p, "/" );
-            p[i_tok] = 0;
-            if( parse_zone( h, &h->param.rc.zones[i], p ) )
-            {
-                x264_free( psz_zones );
-                return -1;
-            }
-            p += i_tok + 1;
-        }
-        x264_free( psz_zones );
-    }
-
-    if( h->param.rc.i_zones > 0 )
-    {
-        for( int i = 0; i < h->param.rc.i_zones; i++ )
-        {
-            x264_zone_t z = h->param.rc.zones[i];
-            if( z.i_start < 0 || z.i_start > z.i_end )
-            {
-                x264_log( h, X264_LOG_ERROR, "invalid zone: start=%d end=%d\n",
-                          z.i_start, z.i_end );
-                return -1;
-            }
-            else if( !z.b_force_qp && z.f_bitrate_factor <= 0 )
-            {
-                x264_log( h, X264_LOG_ERROR, "invalid zone: bitrate_factor=%f\n",
-                          z.f_bitrate_factor );
-                return -1;
-            }
-        }
-
-        rc->i_zones = h->param.rc.i_zones + 1;
-        CHECKED_MALLOC( rc->zones, rc->i_zones * sizeof(x264_zone_t) );
-        memcpy( rc->zones+1, h->param.rc.zones, (rc->i_zones-1) * sizeof(x264_zone_t) );
-
-        // default zone to fall back to if none of the others match
-        rc->zones[0].i_start = 0;
-        rc->zones[0].i_end = INT_MAX;
-        rc->zones[0].b_force_qp = 0;
-        rc->zones[0].f_bitrate_factor = 1;
-        CHECKED_MALLOC( rc->zones[0].param, sizeof(x264_param_t) );
-        memcpy( rc->zones[0].param, &h->param, sizeof(x264_param_t) );
-        for( int i = 1; i < rc->i_zones; i++ )
-        {
-            if( !rc->zones[i].param )
-                rc->zones[i].param = rc->zones[0].param;
-        }
-    }
-
-    return 0;
-fail:
-    return -1;
-}
-
-static x264_zone_t *get_zone( x264_t *h, int frame_num )
-{
-    for( int i = h->rc->i_zones - 1; i >= 0; i-- )
-    {
-        x264_zone_t *z = &h->rc->zones[i];
-        if( frame_num >= z->i_start && frame_num <= z->i_end )
-            return z;
-    }
-    return NULL;
-}
-
-void x264_ratecontrol_summary( x264_t *h )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
-    {
-        double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
-        x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
-                  qscale2qp( pow( base_cplx, 1 - rc->qcompress )
-                             * rc->cplxr_sum / rc->wanted_bits_window ) - mbtree_offset - QP_BD_OFFSET );
-    }
-}
-
-void x264_ratecontrol_delete( x264_t *h )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    int b_regular_file;
-
-    if( rc->p_stat_file_out )
-    {
-        b_regular_file = x264_is_regular_file( rc->p_stat_file_out );
-        fclose( rc->p_stat_file_out );
-        if( h->i_frame >= rc->num_entries && b_regular_file )
-            if( x264_rename( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out ) != 0 )
-            {
-                x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
-                          rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out );
-            }
-        x264_free( rc->psz_stat_file_tmpname );
-    }
-    if( rc->p_mbtree_stat_file_out )
-    {
-        b_regular_file = x264_is_regular_file( rc->p_mbtree_stat_file_out );
-        fclose( rc->p_mbtree_stat_file_out );
-        if( h->i_frame >= rc->num_entries && b_regular_file )
-            if( x264_rename( rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ) != 0 )
-            {
-                x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
-                          rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name );
-            }
-        x264_free( rc->psz_mbtree_stat_file_tmpname );
-        x264_free( rc->psz_mbtree_stat_file_name );
-    }
-    if( rc->p_mbtree_stat_file_in )
-        fclose( rc->p_mbtree_stat_file_in );
-    x264_free( rc->pred );
-    x264_free( rc->pred_b_from_p );
-    x264_free( rc->entry );
-    x264_free( rc->entry_out );
-    x264_macroblock_tree_rescale_destroy( rc );
-    if( rc->zones )
-    {
-        x264_free( rc->zones[0].param );
-        for( int i = 1; i < rc->i_zones; i++ )
-            if( rc->zones[i].param != rc->zones[0].param && rc->zones[i].param->param_free )
-                rc->zones[i].param->param_free( rc->zones[i].param );
-        x264_free( rc->zones );
-    }
-    x264_free( rc );
-}
-
-static void accum_p_qp_update( x264_t *h, float qp )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    rc->accum_p_qp   *= .95;
-    rc->accum_p_norm *= .95;
-    rc->accum_p_norm += 1;
-    if( h->sh.i_type == SLICE_TYPE_I )
-        rc->accum_p_qp += qp + rc->ip_offset;
-    else
-        rc->accum_p_qp += qp;
-}
-
-void x264_ratecontrol_zone_init( x264_t *h )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    x264_zone_t *zone = get_zone( h, h->fenc->i_frame );
-    if( zone && (!rc->prev_zone || zone->param != rc->prev_zone->param) )
-        x264_encoder_reconfig_apply( h, zone->param );
-    rc->prev_zone = zone;
-}
-
-/* Before encoding a frame, choose a QP for it */
-void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    ratecontrol_entry_t *rce = NULL;
-    x264_zone_t *zone = get_zone( h, h->fenc->i_frame );
-    float q;
-
-    x264_emms();
-
-    if( h->param.rc.b_stat_read )
-    {
-        int frame = h->fenc->i_frame;
-        assert( frame >= 0 && frame < rc->num_entries );
-        rce = h->rc->rce = &h->rc->entry[frame];
-
-        if( h->sh.i_type == SLICE_TYPE_B
-            && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
-        {
-            h->sh.b_direct_spatial_mv_pred = ( rce->direct_mode == 's' );
-            h->mb.b_direct_auto_read = ( rce->direct_mode == 's' || rce->direct_mode == 't' );
-        }
-    }
-
-    if( rc->b_vbv )
-    {
-        memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) );
-        memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) );
-        memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) );
-        rc->row_pred = rc->row_preds[h->sh.i_type];
-        rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-        update_vbv_plan( h, overhead );
-
-        const x264_level_t *l = x264_levels;
-        while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
-            l++;
-
-        int mincr = l->mincr;
-
-        if( h->param.b_bluray_compat )
-            mincr = 4;
-
-        /* Profiles above High don't require minCR, so just set the maximum to a large value. */
-        if( h->sps->i_profile_idc > PROFILE_HIGH )
-            rc->frame_size_maximum = 1e9;
-        else
-        {
-            /* The spec has a bizarre special case for the first frame. */
-            if( h->i_frame == 0 )
-            {
-                //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
-                double fr = 1. / 172;
-                int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
-                rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
-            }
-            else
-            {
-                //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR
-                rc->frame_size_maximum = 384 * BIT_DEPTH * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr;
-            }
-        }
-    }
-
-    if( h->sh.i_type != SLICE_TYPE_B )
-        rc->bframes = h->fenc->i_bframes;
-
-    if( rc->b_abr )
-    {
-        q = qscale2qp( rate_estimate_qscale( h ) );
-    }
-    else if( rc->b_2pass )
-    {
-        rce->new_qscale = rate_estimate_qscale( h );
-        q = qscale2qp( rce->new_qscale );
-    }
-    else /* CQP */
-    {
-        if( h->sh.i_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref )
-            q = ( rc->qp_constant[ SLICE_TYPE_B ] + rc->qp_constant[ SLICE_TYPE_P ] ) / 2;
-        else
-            q = rc->qp_constant[ h->sh.i_type ];
-
-        if( zone )
-        {
-            if( zone->b_force_qp )
-                q += zone->i_qp - rc->qp_constant[SLICE_TYPE_P];
-            else
-                q -= 6*log2f( zone->f_bitrate_factor );
-        }
-    }
-    if( i_force_qp != X264_QP_AUTO )
-        q = i_force_qp - 1;
-
-    q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
-
-    rc->qpa_rc = rc->qpa_rc_prev =
-    rc->qpa_aq = rc->qpa_aq_prev = 0;
-    h->fdec->f_qp_avg_rc =
-    h->fdec->f_qp_avg_aq =
-    rc->qpm = q;
-    if( rce )
-        rce->new_qp = q;
-
-    accum_p_qp_update( h, rc->qpm );
-
-    if( h->sh.i_type != SLICE_TYPE_B )
-        rc->last_non_b_pict_type = h->sh.i_type;
-}
-
-static float predict_row_size( x264_t *h, int y, float qscale )
-{
-    /* average between two predictors:
-     * absolute SATD, and scaled bit cost of the colocated row in the previous frame */
-    x264_ratecontrol_t *rc = h->rc;
-    float pred_s = predict_size( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
-    if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] )
-    {
-        if( h->sh.i_type == SLICE_TYPE_P
-            && h->fref[0][0]->i_type == h->fdec->i_type
-            && h->fref[0][0]->f_row_qscale[y] > 0
-            && h->fref[0][0]->i_row_satd[y] > 0
-            && (abs(h->fref[0][0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
-        {
-            float pred_t = h->fref[0][0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref[0][0]->i_row_satd[y]
-                         * h->fref[0][0]->f_row_qscale[y] / qscale;
-            return (pred_s + pred_t) * 0.5f;
-        }
-        return pred_s;
-    }
-    /* Our QP is lower than the reference! */
-    else
-    {
-        float pred_intra = predict_size( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
-        /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
-        return pred_intra + pred_s;
-    }
-}
-
-static int row_bits_so_far( x264_t *h, int y )
-{
-    int bits = 0;
-    for( int i = h->i_threadslice_start; i <= y; i++ )
-        bits += h->fdec->i_row_bits[i];
-    return bits;
-}
-
-static float predict_row_size_to_end( x264_t *h, int y, float qp )
-{
-    float qscale = qp2qscale( qp );
-    float bits = 0;
-    for( int i = y+1; i < h->i_threadslice_end; i++ )
-        bits += predict_row_size( h, i, qscale );
-    return bits;
-}
-
-/* TODO:
- *  eliminate all use of qp in row ratecontrol: make it entirely qscale-based.
- *  make this function stop being needlessly O(N^2)
- *  update more often than once per row? */
-int x264_ratecontrol_mb( x264_t *h, int bits )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    const int y = h->mb.i_mb_y;
-
-    h->fdec->i_row_bits[y] += bits;
-    rc->qpa_aq += h->mb.i_qp;
-
-    if( h->mb.i_mb_x != h->mb.i_mb_width - 1 )
-        return 0;
-
-    x264_emms();
-    rc->qpa_rc += rc->qpm * h->mb.i_mb_width;
-
-    if( !rc->b_vbv )
-        return 0;
-
-    float qscale = qp2qscale( rc->qpm );
-    h->fdec->f_row_qp[y] = rc->qpm;
-    h->fdec->f_row_qscale[y] = qscale;
-
-    update_predictor( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
-    if( h->sh.i_type != SLICE_TYPE_I && rc->qpm < h->fref[0][0]->f_row_qp[y] )
-        update_predictor( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
-
-    /* update ratecontrol per-mbpair in MBAFF */
-    if( SLICE_MBAFF && !(y&1) )
-        return 0;
-
-    /* FIXME: We don't currently support the case where there's a slice
-     * boundary in between. */
-    int can_reencode_row = h->sh.i_first_mb <= ((h->mb.i_mb_y - SLICE_MBAFF) * h->mb.i_mb_stride);
-
-    /* tweak quality based on difference from predicted size */
-    float prev_row_qp = h->fdec->f_row_qp[y];
-    float qp_absolute_max = h->param.rc.i_qp_max;
-    if( rc->rate_factor_max_increment )
-        qp_absolute_max = X264_MIN( qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment );
-    float qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, qp_absolute_max );
-    float qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
-    float step_size = 0.5f;
-    float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
-    float bits_so_far = row_bits_so_far( h, y );
-    float max_frame_error = x264_clip3f( 1.0 / h->mb.i_mb_height, 0.05, 0.25 );
-    float max_frame_size = rc->frame_size_maximum - rc->frame_size_maximum * max_frame_error;
-    max_frame_size = X264_MIN( max_frame_size, rc->buffer_fill - rc->buffer_rate * max_frame_error );
-    float size_of_other_slices = 0;
-    if( h->param.b_sliced_threads )
-    {
-        float size_of_other_slices_planned = 0;
-        for( int i = 0; i < h->param.i_threads; i++ )
-            if( h != h->thread[i] )
-            {
-                size_of_other_slices += h->thread[i]->rc->frame_size_estimated;
-                size_of_other_slices_planned += h->thread[i]->rc->slice_size_planned;
-            }
-        float weight = rc->slice_size_planned / rc->frame_size_planned;
-        size_of_other_slices = (size_of_other_slices - size_of_other_slices_planned) * weight + size_of_other_slices_planned;
-    }
-    if( y < h->i_threadslice_end-1 )
-    {
-        /* B-frames shouldn't use lower QP than their reference frames. */
-        if( h->sh.i_type == SLICE_TYPE_B )
-        {
-            qp_min = X264_MAX( qp_min, X264_MAX( h->fref[0][0]->f_row_qp[y+1], h->fref[1][0]->f_row_qp[y+1] ) );
-            rc->qpm = X264_MAX( rc->qpm, qp_min );
-        }
-
-        float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
-        buffer_left_planned = X264_MAX( buffer_left_planned, 0.f );
-        /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
-        float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
-        float b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
-        float trust_coeff = x264_clip3f( bits_so_far / slice_size_planned, 0.0, 1.0 );
-
-        /* Don't increase the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
-        /* area at the top of the frame was measured inaccurately. */
-        if( trust_coeff < 0.05f )
-            qp_max = qp_absolute_max = prev_row_qp;
-
-        if( h->sh.i_type != SLICE_TYPE_I )
-            rc_tol *= 0.5f;
-
-        if( !rc->b_vbv_min_rate )
-            qp_min = X264_MAX( qp_min, rc->qp_novbv );
-
-        while( rc->qpm < qp_max
-               && ((b1 > rc->frame_size_planned + rc_tol) ||
-                   (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv) ||
-                   (b1 > rc->buffer_fill - buffer_left_planned * 0.5f)) )
-        {
-            rc->qpm += step_size;
-            b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
-        }
-
-        float b_max = b1 + ((rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 0.90f - b1) * trust_coeff;
-        rc->qpm -= step_size;
-        float b2 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
-        while( rc->qpm > qp_min && rc->qpm < prev_row_qp
-               && (rc->qpm > h->fdec->f_row_qp[0] || rc->single_frame_vbv)
-               && (b2 < max_frame_size)
-               && ((b2 < rc->frame_size_planned * 0.8f) || (b2 < b_max)) )
-        {
-            b1 = b2;
-            rc->qpm -= step_size;
-            b2 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
-        }
-        rc->qpm += step_size;
-
-        /* avoid VBV underflow or MinCR violation */
-        while( rc->qpm < qp_absolute_max && (b1 > max_frame_size) )
-        {
-            rc->qpm += step_size;
-            b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
-        }
-
-        h->rc->frame_size_estimated = b1 - size_of_other_slices;
-
-        /* If the current row was large enough to cause a large QP jump, try re-encoding it. */
-        if( rc->qpm > qp_max && prev_row_qp < qp_max && can_reencode_row )
-        {
-            /* Bump QP to halfway in between... close enough. */
-            rc->qpm = x264_clip3f( (prev_row_qp + rc->qpm)*0.5f, prev_row_qp + 1.0f, qp_max );
-            rc->qpa_rc = rc->qpa_rc_prev;
-            rc->qpa_aq = rc->qpa_aq_prev;
-            h->fdec->i_row_bits[y] = 0;
-            h->fdec->i_row_bits[y-SLICE_MBAFF] = 0;
-            return -1;
-        }
-    }
-    else
-    {
-        h->rc->frame_size_estimated = bits_so_far;
-
-        /* Last-ditch attempt: if the last row of the frame underflowed the VBV,
-         * try again. */
-        if( rc->qpm < qp_max && can_reencode_row
-            && (h->rc->frame_size_estimated + size_of_other_slices > X264_MIN( rc->frame_size_maximum, rc->buffer_fill )) )
-        {
-            rc->qpm = qp_max;
-            rc->qpa_rc = rc->qpa_rc_prev;
-            rc->qpa_aq = rc->qpa_aq_prev;
-            h->fdec->i_row_bits[y] = 0;
-            h->fdec->i_row_bits[y-SLICE_MBAFF] = 0;
-            return -1;
-        }
-    }
-
-    rc->qpa_rc_prev = rc->qpa_rc;
-    rc->qpa_aq_prev = rc->qpa_aq;
-
-    return 0;
-}
-
-int x264_ratecontrol_qp( x264_t *h )
-{
-    x264_emms();
-    return x264_clip3( h->rc->qpm + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
-}
-
-int x264_ratecontrol_mb_qp( x264_t *h )
-{
-    x264_emms();
-    float qp = h->rc->qpm;
-    if( h->param.rc.i_aq_mode )
-    {
-         /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
-        float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
-        /* Scale AQ's effect towards zero in emergency mode. */
-        if( qp > QP_MAX_SPEC )
-            qp_offset *= (QP_MAX - qp) / (QP_MAX - QP_MAX_SPEC);
-        qp += qp_offset;
-    }
-    return x264_clip3( qp + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
-}
-
-/* In 2pass, force the same frame types as in the 1st pass */
-int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    if( h->param.rc.b_stat_read )
-    {
-        if( frame_num >= rc->num_entries )
-        {
-            /* We could try to initialize everything required for ABR and
-             * adaptive B-frames, but that would be complicated.
-             * So just calculate the average QP used so far. */
-            h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 24 + QP_BD_OFFSET
-                                      : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P];
-            rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
-            rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, QP_MAX );
-            rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, QP_MAX );
-
-            x264_log( h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries );
-            x264_log( h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant );
-            if( h->param.i_bframe_adaptive )
-                x264_log( h, X264_LOG_ERROR, "disabling adaptive B-frames\n" );
-
-            for( int i = 0; i < h->param.i_threads; i++ )
-            {
-                h->thread[i]->rc->b_abr = 0;
-                h->thread[i]->rc->b_2pass = 0;
-                h->thread[i]->param.rc.i_rc_method = X264_RC_CQP;
-                h->thread[i]->param.rc.b_stat_read = 0;
-                h->thread[i]->param.i_bframe_adaptive = 0;
-                h->thread[i]->param.i_scenecut_threshold = 0;
-                h->thread[i]->param.rc.b_mb_tree = 0;
-                if( h->thread[i]->param.i_bframe > 1 )
-                    h->thread[i]->param.i_bframe = 1;
-            }
-            return X264_TYPE_AUTO;
-        }
-        return rc->entry[frame_num].frame_type;
-    }
-    else
-        return X264_TYPE_AUTO;
-}
-
-void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
-{
-    ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
-    if( h->param.analyse.i_weighted_pred <= 0 )
-        return;
-
-    if( rce->i_weight_denom[0] >= 0 )
-        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0][0], rce->i_weight_denom[0], rce->weight[0][1] );
-
-    if( rce->i_weight_denom[1] >= 0 )
-    {
-        SET_WEIGHT( frm->weight[0][1], 1, rce->weight[1][0], rce->i_weight_denom[1], rce->weight[1][1] );
-        SET_WEIGHT( frm->weight[0][2], 1, rce->weight[2][0], rce->i_weight_denom[1], rce->weight[2][1] );
-    }
-}
-
-/* After encoding one frame, save stats and update ratecontrol state */
-int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    const int *mbs = h->stat.frame.i_mb_count;
-
-    x264_emms();
-
-    h->stat.frame.i_mb_count_skip = mbs[P_SKIP] + mbs[B_SKIP];
-    h->stat.frame.i_mb_count_i = mbs[I_16x16] + mbs[I_8x8] + mbs[I_4x4];
-    h->stat.frame.i_mb_count_p = mbs[P_L0] + mbs[P_8x8];
-    for( int i = B_DIRECT; i < B_8x8; i++ )
-        h->stat.frame.i_mb_count_p += mbs[i];
-
-    h->fdec->f_qp_avg_rc = rc->qpa_rc /= h->mb.i_mb_count;
-    h->fdec->f_qp_avg_aq = (float)rc->qpa_aq / h->mb.i_mb_count;
-    h->fdec->f_crf_avg = h->param.rc.f_rf_constant + h->fdec->f_qp_avg_rc - rc->qp_novbv;
-
-    if( h->param.rc.b_stat_write )
-    {
-        char c_type = h->sh.i_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i')
-                    : h->sh.i_type==SLICE_TYPE_P ? 'P'
-                    : h->fenc->b_kept_as_ref ? 'B' : 'b';
-        int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
-        int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
-        char c_direct = h->mb.b_direct_auto_write ?
-                        ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
-                          dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
-                        : '-';
-        if( fprintf( rc->p_stat_file_out,
-                 "in:%d out:%d type:%c dur:%"PRId64" cpbdur:%"PRId64" q:%.2f aq:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:",
-                 h->fenc->i_frame, h->i_frame,
-                 c_type, h->fenc->i_duration,
-                 h->fenc->i_cpb_duration,
-                 rc->qpa_rc, h->fdec->f_qp_avg_aq,
-                 h->stat.frame.i_tex_bits,
-                 h->stat.frame.i_mv_bits,
-                 h->stat.frame.i_misc_bits,
-                 h->stat.frame.i_mb_count_i,
-                 h->stat.frame.i_mb_count_p,
-                 h->stat.frame.i_mb_count_skip,
-                 c_direct) < 0 )
-            goto fail;
-
-        /* Only write information for reference reordering once. */
-        int use_old_stats = h->param.rc.b_stat_read && rc->rce->refs > 1;
-        for( int i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref[0]); i++ )
-        {
-            int refcount = use_old_stats         ? rc->rce->refcount[i]
-                         : PARAM_INTERLACED      ? h->stat.frame.i_mb_count_ref[0][i*2]
-                                                 + h->stat.frame.i_mb_count_ref[0][i*2+1]
-                         :                         h->stat.frame.i_mb_count_ref[0][i];
-            if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 )
-                goto fail;
-        }
-
-        if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->sh.weight[0][0].weightfn )
-        {
-            if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d",
-                         h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
-                goto fail;
-            if( h->sh.weight[0][1].weightfn || h->sh.weight[0][2].weightfn )
-            {
-                if( fprintf( rc->p_stat_file_out, ",%d,%d,%d,%d,%d ",
-                             h->sh.weight[0][1].i_denom, h->sh.weight[0][1].i_scale, h->sh.weight[0][1].i_offset,
-                             h->sh.weight[0][2].i_scale, h->sh.weight[0][2].i_offset ) < 0 )
-                    goto fail;
-            }
-            else if( fprintf( rc->p_stat_file_out, " " ) < 0 )
-                goto fail;
-        }
-
-        if( fprintf( rc->p_stat_file_out, ";\n") < 0 )
-            goto fail;
-
-        /* Don't re-write the data in multi-pass mode. */
-        if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
-        {
-            uint8_t i_type = h->sh.i_type;
-            h->mc.mbtree_fix8_pack( rc->mbtree.qp_buffer[0], h->fenc->f_qp_offset, h->mb.i_mb_count );
-            if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
-                goto fail;
-            if( fwrite( rc->mbtree.qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
-                goto fail;
-        }
-    }
-
-    if( rc->b_abr )
-    {
-        if( h->sh.i_type != SLICE_TYPE_B )
-            rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / rc->last_rceq;
-        else
-        {
-            /* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
-             * Not perfectly accurate with B-refs, but good enough. */
-            rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor ));
-        }
-        rc->cplxr_sum *= rc->cbr_decay;
-        rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate;
-        rc->wanted_bits_window *= rc->cbr_decay;
-    }
-
-    if( rc->b_2pass )
-        rc->expected_bits_sum += qscale2bits( rc->rce, qp2qscale( rc->rce->new_qp ) );
-
-    if( h->mb.b_variable_qp )
-    {
-        if( h->sh.i_type == SLICE_TYPE_B )
-        {
-            rc->bframe_bits += bits;
-            if( h->fenc->b_last_minigop_bframe )
-            {
-                update_predictor( rc->pred_b_from_p, qp2qscale( rc->qpa_rc ),
-                                  h->fref[1][h->i_ref[1]-1]->i_satd, rc->bframe_bits / rc->bframes );
-                rc->bframe_bits = 0;
-            }
-        }
-    }
-
-    *filler = update_vbv( h, bits );
-    rc->filler_bits_sum += *filler * 8;
-
-    if( h->sps->vui.b_nal_hrd_parameters_present )
-    {
-        if( h->fenc->i_frame == 0 )
-        {
-            // access unit initialises the HRD
-            h->fenc->hrd_timing.cpb_initial_arrival_time = 0;
-            rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay;
-            rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset;
-            h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit = (double)rc->initial_cpb_removal_delay / 90000;
-        }
-        else
-        {
-            h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit + (double)(h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset) *
-                                                   h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-
-            if( h->fenc->b_keyframe )
-            {
-                rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time;
-                rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay;
-                rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset;
-            }
-
-            double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000;
-            if( !h->fenc->b_keyframe )
-                cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000;
-
-            if( h->sps->vui.hrd.b_cbr_hrd )
-                h->fenc->hrd_timing.cpb_initial_arrival_time = rc->previous_cpb_final_arrival_time;
-            else
-                h->fenc->hrd_timing.cpb_initial_arrival_time = X264_MAX( rc->previous_cpb_final_arrival_time, cpb_earliest_arrival_time );
-        }
-        int filler_bits = *filler ? X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), *filler )*8 : 0;
-        // Equation C-6
-        h->fenc->hrd_timing.cpb_final_arrival_time = rc->previous_cpb_final_arrival_time = h->fenc->hrd_timing.cpb_initial_arrival_time +
-                                                     (double)(bits + filler_bits) / h->sps->vui.hrd.i_bit_rate_unscaled;
-
-        h->fenc->hrd_timing.dpb_output_time = (double)h->fenc->i_dpb_output_delay * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale +
-                                              h->fenc->hrd_timing.cpb_removal_time;
-    }
-
-    return 0;
-fail:
-    x264_log( h, X264_LOG_ERROR, "ratecontrol_end: stats file could not be written to\n" );
-    return -1;
-}
-
-/****************************************************************************
- * 2 pass functions
- ***************************************************************************/
-
-/**
- * modify the bitrate curve from pass1 for one frame
- */
-static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor, int frame_num)
-{
-    x264_ratecontrol_t *rcc= h->rc;
-    x264_zone_t *zone = get_zone( h, frame_num );
-    double q;
-    if( h->param.rc.b_mb_tree )
-    {
-        double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-        q = pow( BASE_FRAME_DURATION / CLIP_DURATION(rce->i_duration * timescale), 1 - h->param.rc.f_qcompress );
-    }
-    else
-        q = pow( rce->blurred_complexity, 1 - rcc->qcompress );
-
-    // avoid NaN's in the rc_eq
-    if( !isfinite(q) || rce->tex_bits + rce->mv_bits == 0 )
-        q = rcc->last_qscale_for[rce->pict_type];
-    else
-    {
-        rcc->last_rceq = q;
-        q /= rate_factor;
-        rcc->last_qscale = q;
-    }
-
-    if( zone )
-    {
-        if( zone->b_force_qp )
-            q = qp2qscale( zone->i_qp );
-        else
-            q /= zone->f_bitrate_factor;
-    }
-
-    return q;
-}
-
-static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q, int frame_num)
-{
-    x264_ratecontrol_t *rcc = h->rc;
-    const int pict_type = rce->pict_type;
-    x264_zone_t *zone = get_zone( h, frame_num );
-
-    // force I/B quants as a function of P quants
-    const double last_p_q    = rcc->last_qscale_for[SLICE_TYPE_P];
-    const double last_non_b_q= rcc->last_qscale_for[rcc->last_non_b_pict_type];
-    if( pict_type == SLICE_TYPE_I )
-    {
-        double iq = q;
-        double pq = qp2qscale( rcc->accum_p_qp / rcc->accum_p_norm );
-        double ip_factor = fabs( h->param.rc.f_ip_factor );
-        /* don't apply ip_factor if the following frame is also I */
-        if( rcc->accum_p_norm <= 0 )
-            q = iq;
-        else if( h->param.rc.f_ip_factor < 0 )
-            q = iq / ip_factor;
-        else if( rcc->accum_p_norm >= 1 )
-            q = pq / ip_factor;
-        else
-            q = rcc->accum_p_norm * pq / ip_factor + (1 - rcc->accum_p_norm) * iq;
-    }
-    else if( pict_type == SLICE_TYPE_B )
-    {
-        if( h->param.rc.f_pb_factor > 0 )
-            q = last_non_b_q;
-        if( !rce->kept_as_ref )
-            q *= fabs( h->param.rc.f_pb_factor );
-    }
-    else if( pict_type == SLICE_TYPE_P
-             && rcc->last_non_b_pict_type == SLICE_TYPE_P
-             && rce->tex_bits == 0 )
-    {
-        q = last_p_q;
-    }
-
-    /* last qscale / qdiff stuff */
-    if( rcc->last_non_b_pict_type == pict_type &&
-        (pict_type!=SLICE_TYPE_I || rcc->last_accum_p_norm < 1) )
-    {
-        double last_q = rcc->last_qscale_for[pict_type];
-        double max_qscale = last_q * rcc->lstep;
-        double min_qscale = last_q / rcc->lstep;
-
-        if     ( q > max_qscale ) q = max_qscale;
-        else if( q < min_qscale ) q = min_qscale;
-    }
-
-    rcc->last_qscale_for[pict_type] = q;
-    if( pict_type != SLICE_TYPE_B )
-        rcc->last_non_b_pict_type = pict_type;
-    if( pict_type == SLICE_TYPE_I )
-    {
-        rcc->last_accum_p_norm = rcc->accum_p_norm;
-        rcc->accum_p_norm = 0;
-        rcc->accum_p_qp = 0;
-    }
-    if( pict_type == SLICE_TYPE_P )
-    {
-        float mask = 1 - pow( (float)rce->i_count / rcc->nmb, 2 );
-        rcc->accum_p_qp   = mask * (qscale2qp( q ) + rcc->accum_p_qp);
-        rcc->accum_p_norm = mask * (1 + rcc->accum_p_norm);
-    }
-
-    if( zone )
-    {
-        if( zone->b_force_qp )
-            q = qp2qscale( zone->i_qp );
-        else
-            q /= zone->f_bitrate_factor;
-    }
-
-    return q;
-}
-
-static float predict_size( predictor_t *p, float q, float var )
-{
-    return (p->coeff*var + p->offset) / (q*p->count);
-}
-
-static void update_predictor( predictor_t *p, float q, float var, float bits )
-{
-    float range = 1.5;
-    if( var < 10 )
-        return;
-    float old_coeff = p->coeff / p->count;
-    float old_offset = p->offset / p->count;
-    float new_coeff = X264_MAX( (bits*q - old_offset) / var, p->coeff_min );
-    float new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
-    float new_offset = bits*q - new_coeff_clipped * var;
-    if( new_offset >= 0 )
-        new_coeff = new_coeff_clipped;
-    else
-        new_offset = 0;
-    p->count  *= p->decay;
-    p->coeff  *= p->decay;
-    p->offset *= p->decay;
-    p->count  ++;
-    p->coeff  += new_coeff;
-    p->offset += new_offset;
-}
-
-// update VBV after encoding a frame
-static int update_vbv( x264_t *h, int bits )
-{
-    int filler = 0;
-    int bitrate = h->sps->vui.hrd.i_bit_rate_unscaled;
-    x264_ratecontrol_t *rcc = h->rc;
-    x264_ratecontrol_t *rct = h->thread[0]->rc;
-    int64_t buffer_size = (int64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale;
-
-    if( rcc->last_satd >= h->mb.i_mb_count )
-        update_predictor( &rct->pred[h->sh.i_type], qp2qscale( rcc->qpa_rc ), rcc->last_satd, bits );
-
-    if( !rcc->b_vbv )
-        return filler;
-
-    uint64_t buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale;
-    rct->buffer_fill_final -= buffer_diff;
-    rct->buffer_fill_final_min -= buffer_diff;
-
-    if( rct->buffer_fill_final_min < 0 )
-    {
-        double underflow = (double)rct->buffer_fill_final_min / h->sps->vui.i_time_scale;
-        if( rcc->rate_factor_max_increment && rcc->qpm >= rcc->qp_novbv + rcc->rate_factor_max_increment )
-            x264_log( h, X264_LOG_DEBUG, "VBV underflow due to CRF-max (frame %d, %.0f bits)\n", h->i_frame, underflow );
-        else
-            x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, underflow );
-        rct->buffer_fill_final =
-        rct->buffer_fill_final_min = 0;
-    }
-
-    if( h->param.i_avcintra_class )
-        buffer_diff = buffer_size;
-    else
-        buffer_diff = (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration;
-    rct->buffer_fill_final += buffer_diff;
-    rct->buffer_fill_final_min += buffer_diff;
-
-    if( rct->buffer_fill_final > buffer_size )
-    {
-        if( h->param.rc.b_filler )
-        {
-            int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8;
-            filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale;
-            bits = h->param.i_avcintra_class ? filler * 8 : X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8;
-            buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale;
-            rct->buffer_fill_final -= buffer_diff;
-            rct->buffer_fill_final_min -= buffer_diff;
-        }
-        else
-        {
-            rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size );
-            rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, buffer_size );
-        }
-    }
-
-    return filler;
-}
-
-void x264_hrd_fullness( x264_t *h )
-{
-    x264_ratecontrol_t *rct = h->thread[0]->rc;
-    uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale / rct->hrd_multiply_denom;
-    uint64_t cpb_state = rct->buffer_fill_final;
-    uint64_t cpb_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale;
-    uint64_t multiply_factor = 90000 / rct->hrd_multiply_denom;
-
-    if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > (int64_t)cpb_size )
-    {
-         x264_log( h, X264_LOG_WARNING, "CPB %s: %.0f bits in a %.0f-bit buffer\n",
-                   rct->buffer_fill_final < 0 ? "underflow" : "overflow",
-                   (double)rct->buffer_fill_final / h->sps->vui.i_time_scale, (double)cpb_size / h->sps->vui.i_time_scale );
-    }
-
-    h->initial_cpb_removal_delay = (multiply_factor * cpb_state) / denom;
-    h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size) / denom - h->initial_cpb_removal_delay;
-
-    int64_t decoder_buffer_fill = h->initial_cpb_removal_delay * denom / multiply_factor;
-    rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, decoder_buffer_fill );
-}
-
-// provisionally update VBV according to the planned size of all frames currently in progress
-static void update_vbv_plan( x264_t *h, int overhead )
-{
-    x264_ratecontrol_t *rcc = h->rc;
-    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final_min / h->sps->vui.i_time_scale;
-    if( h->i_thread_frames > 1 )
-    {
-        int j = h->rc - h->thread[0]->rc;
-        for( int i = 1; i < h->i_thread_frames; i++ )
-        {
-            x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
-            double bits = t->rc->frame_size_planned;
-            if( !t->b_thread_active )
-                continue;
-            bits = X264_MAX(bits, t->rc->frame_size_estimated);
-            rcc->buffer_fill -= bits;
-            rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
-            rcc->buffer_fill += t->rc->buffer_rate;
-            rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
-        }
-    }
-    rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
-    rcc->buffer_fill -= overhead;
-}
-
-// apply VBV constraints and clip qscale to between lmin and lmax
-static double clip_qscale( x264_t *h, int pict_type, double q )
-{
-    x264_ratecontrol_t *rcc = h->rc;
-    double lmin = rcc->lmin[pict_type];
-    double lmax = rcc->lmax[pict_type];
-    if( rcc->rate_factor_max_increment )
-        lmax = X264_MIN( lmax, qp2qscale( rcc->qp_novbv + rcc->rate_factor_max_increment ) );
-    double q0 = q;
-
-    /* B-frames are not directly subject to VBV,
-     * since they are controlled by the P-frames' QPs. */
-
-    if( rcc->b_vbv && rcc->last_satd > 0 )
-    {
-        double fenc_cpb_duration = (double)h->fenc->i_cpb_duration *
-                                   h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-        /* Lookahead VBV: raise the quantizer as necessary such that no frames in
-         * the lookahead overflow and such that the buffer is in a reasonable state
-         * by the end of the lookahead. */
-        if( h->param.rc.i_lookahead )
-        {
-            int terminate = 0;
-
-            /* Avoid an infinite loop. */
-            for( int iterations = 0; iterations < 1000 && terminate != 3; iterations++ )
-            {
-                double frame_q[3];
-                double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-                double buffer_fill_cur = rcc->buffer_fill - cur_bits;
-                double target_fill;
-                double total_duration = 0;
-                double last_duration = fenc_cpb_duration;
-                frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
-                frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
-                frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor;
-
-                /* Loop over the planned future frames. */
-                for( int j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
-                {
-                    total_duration += last_duration;
-                    buffer_fill_cur += rcc->vbv_max_rate * last_duration;
-                    int i_type = h->fenc->i_planned_type[j];
-                    int i_satd = h->fenc->i_planned_satd[j];
-                    if( i_type == X264_TYPE_AUTO )
-                        break;
-                    i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
-                    cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
-                    buffer_fill_cur -= cur_bits;
-                    last_duration = h->fenc->f_planned_cpb_duration[j];
-                }
-                /* Try to get to get the buffer at least 50% filled, but don't set an impossible goal. */
-                target_fill = X264_MIN( rcc->buffer_fill + total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.5 );
-                if( buffer_fill_cur < target_fill )
-                {
-                    q *= 1.01;
-                    terminate |= 1;
-                    continue;
-                }
-                /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
-                target_fill = x264_clip3f( rcc->buffer_fill - total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.8, rcc->buffer_size );
-                if( rcc->b_vbv_min_rate && buffer_fill_cur > target_fill )
-                {
-                    q /= 1.01;
-                    terminate |= 2;
-                    continue;
-                }
-                break;
-            }
-        }
-        /* Fallback to old purely-reactive algorithm: no lookahead. */
-        else
-        {
-            if( ( pict_type == SLICE_TYPE_P ||
-                ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) &&
-                rcc->buffer_fill/rcc->buffer_size < 0.5 )
-            {
-                q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 );
-            }
-
-            /* Now a hard threshold to make sure the frame fits in VBV.
-             * This one is mostly for I-frames. */
-            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-            /* For small VBVs, allow the frame to use up the entire VBV. */
-            double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1;
-            /* For single-frame VBVs, request that the frame use up the entire VBV. */
-            double min_fill_factor = rcc->single_frame_vbv ? 1 : 2;
-
-            if( bits > rcc->buffer_fill/max_fill_factor )
-            {
-                double qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
-                q /= qf;
-                bits *= qf;
-            }
-            if( bits < rcc->buffer_rate/min_fill_factor )
-            {
-                double qf = x264_clip3f( bits*min_fill_factor/rcc->buffer_rate, 0.001, 1.0 );
-                q *= qf;
-            }
-            q = X264_MAX( q0, q );
-        }
-
-        /* Check B-frame complexity, and use up any bits that would
-         * overflow before the next P-frame. */
-        if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
-        {
-            int nb = rcc->bframes;
-            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-            double pbbits = bits;
-            double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
-            double space;
-            double bframe_cpb_duration = 0;
-            double minigop_cpb_duration;
-            for( int i = 0; i < nb; i++ )
-                bframe_cpb_duration += h->fenc->f_planned_cpb_duration[i];
-
-            if( bbits * nb > bframe_cpb_duration * rcc->vbv_max_rate )
-                nb = 0;
-            pbbits += nb * bbits;
-
-            minigop_cpb_duration = bframe_cpb_duration + fenc_cpb_duration;
-            space = rcc->buffer_fill + minigop_cpb_duration*rcc->vbv_max_rate - rcc->buffer_size;
-            if( pbbits < space )
-            {
-                q *= X264_MAX( pbbits / space, bits / (0.5 * rcc->buffer_size) );
-            }
-            q = X264_MAX( q0/2, q );
-        }
-
-        /* Apply MinCR and buffer fill restrictions */
-        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-        double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) );
-        if( bits > frame_size_maximum )
-            q *= bits / frame_size_maximum;
-
-        if( !rcc->b_vbv_min_rate )
-            q = X264_MAX( q0, q );
-    }
-
-    if( lmin==lmax )
-        return lmin;
-    else if( rcc->b_2pass )
-    {
-        double min2 = log( lmin );
-        double max2 = log( lmax );
-        q = (log(q) - min2)/(max2-min2) - 0.5;
-        q = 1.0/(1.0 + exp( -4*q ));
-        q = q*(max2-min2) + min2;
-        return exp( q );
-    }
-    else
-        return x264_clip3f( q, lmin, lmax );
-}
-
-// update qscale for 1 frame based on actual bits used so far
-static float rate_estimate_qscale( x264_t *h )
-{
-    float q;
-    x264_ratecontrol_t *rcc = h->rc;
-    ratecontrol_entry_t rce = {0};
-    int pict_type = h->sh.i_type;
-    int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I]
-                          + h->stat.i_frame_size[SLICE_TYPE_P]
-                          + h->stat.i_frame_size[SLICE_TYPE_B])
-                       - rcc->filler_bits_sum;
-
-    if( rcc->b_2pass )
-    {
-        rce = *rcc->rce;
-        if( pict_type != rce.pict_type )
-        {
-            x264_log( h, X264_LOG_ERROR, "slice=%c but 2pass stats say %c\n",
-                      slice_type_to_char[pict_type], slice_type_to_char[rce.pict_type] );
-        }
-    }
-
-    if( pict_type == SLICE_TYPE_B )
-    {
-        /* B-frames don't have independent ratecontrol, but rather get the
-         * average QP of the two adjacent P-frames + an offset */
-
-        int i0 = IS_X264_TYPE_I(h->fref_nearest[0]->i_type);
-        int i1 = IS_X264_TYPE_I(h->fref_nearest[1]->i_type);
-        int dt0 = abs(h->fenc->i_poc - h->fref_nearest[0]->i_poc);
-        int dt1 = abs(h->fenc->i_poc - h->fref_nearest[1]->i_poc);
-        float q0 = h->fref_nearest[0]->f_qp_avg_rc;
-        float q1 = h->fref_nearest[1]->f_qp_avg_rc;
-
-        if( h->fref_nearest[0]->i_type == X264_TYPE_BREF )
-            q0 -= rcc->pb_offset/2;
-        if( h->fref_nearest[1]->i_type == X264_TYPE_BREF )
-            q1 -= rcc->pb_offset/2;
-
-        if( i0 && i1 )
-            q = (q0 + q1) / 2 + rcc->ip_offset;
-        else if( i0 )
-            q = q1;
-        else if( i1 )
-            q = q0;
-        else
-            q = (q0*dt1 + q1*dt0) / (dt0 + dt1);
-
-        if( h->fenc->b_kept_as_ref )
-            q += rcc->pb_offset/2;
-        else
-            q += rcc->pb_offset;
-
-        rcc->qp_novbv = q;
-        q = qp2qscale( q );
-        if( rcc->b_2pass )
-            rcc->frame_size_planned = qscale2bits( &rce, q );
-        else
-            rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref[1][h->i_ref[1]-1]->i_satd );
-        /* Limit planned size by MinCR */
-        if( rcc->b_vbv )
-            rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum );
-        h->rc->frame_size_estimated = rcc->frame_size_planned;
-
-        /* For row SATDs */
-        if( rcc->b_vbv )
-            rcc->last_satd = x264_rc_analyse_slice( h );
-        return q;
-    }
-    else
-    {
-        double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
-        double predicted_bits = total_bits;
-        if( h->i_thread_frames > 1 )
-        {
-            int j = h->rc - h->thread[0]->rc;
-            for( int i = 1; i < h->i_thread_frames; i++ )
-            {
-                x264_t *t = h->thread[(j+i) % h->i_thread_frames];
-                double bits = t->rc->frame_size_planned;
-                if( !t->b_thread_active )
-                    continue;
-                bits = X264_MAX(bits, t->rc->frame_size_estimated);
-                predicted_bits += bits;
-            }
-        }
-
-        if( rcc->b_2pass )
-        {
-            double lmin = rcc->lmin[pict_type];
-            double lmax = rcc->lmax[pict_type];
-            double diff;
-
-            /* Adjust ABR buffer based on distance to the end of the video. */
-            if( rcc->num_entries > h->i_frame )
-            {
-                double final_bits = rcc->entry_out[rcc->num_entries-1]->expected_bits;
-                double video_pos = rce.expected_bits / final_bits;
-                double scale_factor = sqrt( (1 - video_pos) * rcc->num_entries );
-                abr_buffer *= 0.5 * X264_MAX( scale_factor, 0.5 );
-            }
-
-            diff = predicted_bits - rce.expected_bits;
-            q = rce.new_qscale;
-            q /= x264_clip3f((abr_buffer - diff) / abr_buffer, .5, 2);
-            if( h->i_frame >= rcc->fps && rcc->expected_bits_sum >= 1 )
-            {
-                /* Adjust quant based on the difference between
-                 * achieved and expected bitrate so far */
-                double cur_time = (double)h->i_frame / rcc->num_entries;
-                double w = x264_clip3f( cur_time*100, 0.0, 1.0 );
-                q *= pow( (double)total_bits / rcc->expected_bits_sum, w );
-            }
-            rcc->qp_novbv = qscale2qp( q );
-            if( rcc->b_vbv )
-            {
-                /* Do not overflow vbv */
-                double expected_size = qscale2bits( &rce, q );
-                double expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
-                double expected_fullness = rce.expected_vbv / rcc->buffer_size;
-                double qmax = q*(2 - expected_fullness);
-                double size_constraint = 1 + expected_fullness;
-                qmax = X264_MAX( qmax, rce.new_qscale );
-                if( expected_fullness < .05 )
-                    qmax = lmax;
-                qmax = X264_MIN(qmax, lmax);
-                while( ((expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax)) ||
-                        ((expected_vbv < 0) && (q < lmax)))
-                {
-                    q *= 1.05;
-                    expected_size = qscale2bits(&rce, q);
-                    expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
-                }
-                rcc->last_satd = x264_rc_analyse_slice( h );
-            }
-            q = x264_clip3f( q, lmin, lmax );
-        }
-        else /* 1pass ABR */
-        {
-            /* Calculate the quantizer which would have produced the desired
-             * average bitrate if it had been applied to all frames so far.
-             * Then modulate that quant based on the current frame's complexity
-             * relative to the average complexity so far (using the 2pass RCEQ).
-             * Then bias the quant up or down if total size so far was far from
-             * the target.
-             * Result: Depending on the value of rate_tolerance, there is a
-             * tradeoff between quality and bitrate precision. But at large
-             * tolerances, the bit distribution approaches that of 2pass. */
-
-            double wanted_bits, overflow = 1;
-
-            rcc->last_satd = x264_rc_analyse_slice( h );
-            rcc->short_term_cplxsum *= 0.5;
-            rcc->short_term_cplxcount *= 0.5;
-            rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION);
-            rcc->short_term_cplxcount ++;
-
-            rce.tex_bits = rcc->last_satd;
-            rce.blurred_complexity = rcc->short_term_cplxsum / rcc->short_term_cplxcount;
-            rce.mv_bits = 0;
-            rce.p_count = rcc->nmb;
-            rce.i_count = 0;
-            rce.s_count = 0;
-            rce.qscale = 1;
-            rce.pict_type = pict_type;
-            rce.i_duration = h->fenc->i_duration;
-
-            if( h->param.rc.i_rc_method == X264_RC_CRF )
-            {
-                q = get_qscale( h, &rce, rcc->rate_factor_constant, h->fenc->i_frame );
-            }
-            else
-            {
-                q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
-
-                /* ABR code can potentially be counterproductive in CBR, so just don't bother.
-                 * Don't run it if the frame complexity is zero either. */
-                if( !rcc->b_vbv_min_rate && rcc->last_satd )
-                {
-                    // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
-                    int i_frame_done = h->i_frame;
-                    double time_done = i_frame_done / rcc->fps;
-                    if( h->param.b_vfr_input && i_frame_done > 0 )
-                        time_done = ((double)(h->fenc->i_reordered_pts - h->i_reordered_pts_delay)) * h->param.i_timebase_num / h->param.i_timebase_den;
-                    wanted_bits = time_done * rcc->bitrate;
-                    if( wanted_bits > 0 )
-                    {
-                        abr_buffer *= X264_MAX( 1, sqrt( time_done ) );
-                        overflow = x264_clip3f( 1.0 + (predicted_bits - wanted_bits) / abr_buffer, .5, 2 );
-                        q *= overflow;
-                    }
-                }
-            }
-
-            if( pict_type == SLICE_TYPE_I && h->param.i_keyint_max > 1
-                /* should test _next_ pict type, but that isn't decided yet */
-                && rcc->last_non_b_pict_type != SLICE_TYPE_I )
-            {
-                q = qp2qscale( rcc->accum_p_qp / rcc->accum_p_norm );
-                q /= fabs( h->param.rc.f_ip_factor );
-            }
-            else if( h->i_frame > 0 )
-            {
-                if( h->param.rc.i_rc_method != X264_RC_CRF )
-                {
-                    /* Asymmetric clipping, because symmetric would prevent
-                     * overflow control in areas of rapidly oscillating complexity */
-                    double lmin = rcc->last_qscale_for[pict_type] / rcc->lstep;
-                    double lmax = rcc->last_qscale_for[pict_type] * rcc->lstep;
-                    if( overflow > 1.1 && h->i_frame > 3 )
-                        lmax *= rcc->lstep;
-                    else if( overflow < 0.9 )
-                        lmin /= rcc->lstep;
-
-                    q = x264_clip3f(q, lmin, lmax);
-                }
-            }
-            else if( h->param.rc.i_rc_method == X264_RC_CRF && rcc->qcompress != 1 )
-            {
-                q = qp2qscale( ABR_INIT_QP ) / fabs( h->param.rc.f_ip_factor );
-            }
-            rcc->qp_novbv = qscale2qp( q );
-
-            //FIXME use get_diff_limited_q() ?
-            q = clip_qscale( h, pict_type, q );
-        }
-
-        rcc->last_qscale_for[pict_type] =
-        rcc->last_qscale = q;
-
-        if( !(rcc->b_2pass && !rcc->b_vbv) && h->fenc->i_frame == 0 )
-            rcc->last_qscale_for[SLICE_TYPE_P] = q * fabs( h->param.rc.f_ip_factor );
-
-        if( rcc->b_2pass )
-            rcc->frame_size_planned = qscale2bits( &rce, q );
-        else
-            rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-
-        /* Always use up the whole VBV in this case. */
-        if( rcc->single_frame_vbv )
-            rcc->frame_size_planned = rcc->buffer_rate;
-        /* Limit planned size by MinCR */
-        if( rcc->b_vbv )
-            rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum );
-        h->rc->frame_size_estimated = rcc->frame_size_planned;
-        return q;
-    }
-}
-
-static void x264_threads_normalize_predictors( x264_t *h )
-{
-    double totalsize = 0;
-    for( int i = 0; i < h->param.i_threads; i++ )
-        totalsize += h->thread[i]->rc->slice_size_planned;
-    double factor = h->rc->frame_size_planned / totalsize;
-    for( int i = 0; i < h->param.i_threads; i++ )
-        h->thread[i]->rc->slice_size_planned *= factor;
-}
-
-void x264_threads_distribute_ratecontrol( x264_t *h )
-{
-    int row;
-    x264_ratecontrol_t *rc = h->rc;
-    x264_emms();
-    float qscale = qp2qscale( rc->qpm );
-
-    /* Initialize row predictors */
-    if( h->i_frame == 0 )
-        for( int i = 0; i < h->param.i_threads; i++ )
-        {
-            x264_t *t = h->thread[i];
-            if( t != h )
-                memcpy( t->rc->row_preds, rc->row_preds, sizeof(rc->row_preds) );
-        }
-
-    for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        x264_t *t = h->thread[i];
-        if( t != h )
-            memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
-        t->rc->row_pred = t->rc->row_preds[h->sh.i_type];
-        /* Calculate the planned slice size. */
-        if( rc->b_vbv && rc->frame_size_planned )
-        {
-            int size = 0;
-            for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
-                size += h->fdec->i_row_satd[row];
-            t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], qscale, size );
-        }
-        else
-            t->rc->slice_size_planned = 0;
-    }
-    if( rc->b_vbv && rc->frame_size_planned )
-    {
-        x264_threads_normalize_predictors( h );
-
-        if( rc->single_frame_vbv )
-        {
-            /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
-            for( int i = 0; i < h->param.i_threads; i++ )
-            {
-                x264_t *t = h->thread[i];
-                float max_frame_error = x264_clip3f( 1.0 / (t->i_threadslice_end - t->i_threadslice_start), 0.05, 0.25 );
-                t->rc->slice_size_planned += 2 * max_frame_error * rc->frame_size_planned;
-            }
-            x264_threads_normalize_predictors( h );
-        }
-
-        for( int i = 0; i < h->param.i_threads; i++ )
-            h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned;
-    }
-}
-
-void x264_threads_merge_ratecontrol( x264_t *h )
-{
-    x264_ratecontrol_t *rc = h->rc;
-    x264_emms();
-
-    for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        x264_t *t = h->thread[i];
-        x264_ratecontrol_t *rct = h->thread[i]->rc;
-        if( h->param.rc.i_vbv_buffer_size )
-        {
-            int size = 0;
-            for( int row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
-                size += h->fdec->i_row_satd[row];
-            int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits;
-            int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->mb.i_mb_width;
-            update_predictor( &rc->pred[h->sh.i_type+(i+1)*5], qp2qscale( rct->qpa_rc/mb_count ), size, bits );
-        }
-        if( !i )
-            continue;
-        rc->qpa_rc += rct->qpa_rc;
-        rc->qpa_aq += rct->qpa_aq;
-    }
-}
-
-void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
-{
-    if( cur != prev )
-    {
-#define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
-        /* these vars are updated in x264_ratecontrol_start()
-         * so copy them from the context that most recently started (prev)
-         * to the context that's about to start (cur). */
-        COPY(accum_p_qp);
-        COPY(accum_p_norm);
-        COPY(last_satd);
-        COPY(last_rceq);
-        COPY(last_qscale_for);
-        COPY(last_non_b_pict_type);
-        COPY(short_term_cplxsum);
-        COPY(short_term_cplxcount);
-        COPY(bframes);
-        COPY(prev_zone);
-        COPY(mbtree.qpbuf_pos);
-        /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
-        COPY(bitrate);
-        COPY(buffer_size);
-        COPY(buffer_rate);
-        COPY(vbv_max_rate);
-        COPY(single_frame_vbv);
-        COPY(cbr_decay);
-        COPY(rate_factor_constant);
-        COPY(rate_factor_max_increment);
-#undef COPY
-    }
-    if( cur != next )
-    {
-#define COPY(var) next->rc->var = cur->rc->var
-        /* these vars are updated in x264_ratecontrol_end()
-         * so copy them from the context that most recently ended (cur)
-         * to the context that's about to end (next) */
-        COPY(cplxr_sum);
-        COPY(expected_bits_sum);
-        COPY(filler_bits_sum);
-        COPY(wanted_bits_window);
-        COPY(bframe_bits);
-        COPY(initial_cpb_removal_delay);
-        COPY(initial_cpb_removal_delay_offset);
-        COPY(nrt_first_access_unit);
-        COPY(previous_cpb_final_arrival_time);
-#undef COPY
-    }
-    //FIXME row_preds[] (not strictly necessary, but would improve prediction)
-    /* the rest of the variables are either constant or thread-local */
-}
-
-static int find_underflow( x264_t *h, double *fills, int *t0, int *t1, int over )
-{
-    /* find an interval ending on an overflow or underflow (depending on whether
-     * we're adding or removing bits), and starting on the earliest frame that
-     * can influence the buffer fill of that end frame. */
-    x264_ratecontrol_t *rcc = h->rc;
-    const double buffer_min = .1 * rcc->buffer_size;
-    const double buffer_max = .9 * rcc->buffer_size;
-    double fill = fills[*t0-1];
-    double parity = over ? 1. : -1.;
-    int start = -1, end = -1;
-    for( int i = *t0; i < rcc->num_entries; i++ )
-    {
-        fill += (rcc->entry_out[i]->i_cpb_duration * rcc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale -
-                 qscale2bits( rcc->entry_out[i], rcc->entry_out[i]->new_qscale )) * parity;
-        fill = x264_clip3f(fill, 0, rcc->buffer_size);
-        fills[i] = fill;
-        if( fill <= buffer_min || i == 0 )
-        {
-            if( end >= 0 )
-                break;
-            start = i;
-        }
-        else if( fill >= buffer_max && start >= 0 )
-            end = i;
-    }
-    *t0 = start;
-    *t1 = end;
-    return start >= 0 && end >= 0;
-}
-
-static int fix_underflow( x264_t *h, int t0, int t1, double adjustment, double qscale_min, double qscale_max )
-{
-    x264_ratecontrol_t *rcc = h->rc;
-    double qscale_orig, qscale_new;
-    int adjusted = 0;
-    if( t0 > 0 )
-        t0++;
-    for( int i = t0; i <= t1; i++ )
-    {
-        qscale_orig = rcc->entry_out[i]->new_qscale;
-        qscale_orig = x264_clip3f( qscale_orig, qscale_min, qscale_max );
-        qscale_new  = qscale_orig * adjustment;
-        qscale_new  = x264_clip3f( qscale_new, qscale_min, qscale_max );
-        rcc->entry_out[i]->new_qscale = qscale_new;
-        adjusted = adjusted || (qscale_new != qscale_orig);
-    }
-    return adjusted;
-}
-
-static double count_expected_bits( x264_t *h )
-{
-    x264_ratecontrol_t *rcc = h->rc;
-    double expected_bits = 0;
-    for( int i = 0; i < rcc->num_entries; i++ )
-    {
-        ratecontrol_entry_t *rce = rcc->entry_out[i];
-        rce->expected_bits = expected_bits;
-        expected_bits += qscale2bits( rce, rce->new_qscale );
-    }
-    return expected_bits;
-}
-
-static int vbv_pass2( x264_t *h, double all_available_bits )
-{
-    /* for each interval of buffer_full .. underflow, uniformly increase the qp of all
-     * frames in the interval until either buffer is full at some intermediate frame or the
-     * last frame in the interval no longer underflows.  Recompute intervals and repeat.
-     * Then do the converse to put bits back into overflow areas until target size is met */
-
-    x264_ratecontrol_t *rcc = h->rc;
-    double *fills;
-    double expected_bits = 0;
-    double adjustment;
-    double prev_bits = 0;
-    int t0, t1;
-    double qscale_min = qp2qscale( h->param.rc.i_qp_min );
-    double qscale_max = qp2qscale( h->param.rc.i_qp_max );
-    int iterations = 0;
-    int adj_min, adj_max;
-    CHECKED_MALLOC( fills, (rcc->num_entries+1)*sizeof(double) );
-
-    fills++;
-
-    /* adjust overall stream size */
-    do
-    {
-        iterations++;
-        prev_bits = expected_bits;
-
-        if( expected_bits )
-        {   /* not first iteration */
-            adjustment = X264_MAX(X264_MIN(expected_bits / all_available_bits, 0.999), 0.9);
-            fills[-1] = rcc->buffer_size * h->param.rc.f_vbv_buffer_init;
-            t0 = 0;
-            /* fix overflows */
-            adj_min = 1;
-            while(adj_min && find_underflow( h, fills, &t0, &t1, 1 ))
-            {
-                adj_min = fix_underflow( h, t0, t1, adjustment, qscale_min, qscale_max );
-                t0 = t1;
-            }
-        }
-
-        fills[-1] = rcc->buffer_size * (1. - h->param.rc.f_vbv_buffer_init);
-        t0 = 0;
-        /* fix underflows -- should be done after overflow, as we'd better undersize target than underflowing VBV */
-        adj_max = 1;
-        while( adj_max && find_underflow( h, fills, &t0, &t1, 0 ) )
-            adj_max = fix_underflow( h, t0, t1, 1.001, qscale_min, qscale_max );
-
-        expected_bits = count_expected_bits( h );
-    } while( (expected_bits < .995*all_available_bits) && ((int64_t)(expected_bits+.5) > (int64_t)(prev_bits+.5)) );
-
-    if( !adj_max )
-        x264_log( h, X264_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n");
-
-    /* store expected vbv filling values for tracking when encoding */
-    for( int i = 0; i < rcc->num_entries; i++ )
-        rcc->entry_out[i]->expected_vbv = rcc->buffer_size - fills[i];
-
-    x264_free( fills-1 );
-    return 0;
-fail:
-    return -1;
-}
-
-static int init_pass2( x264_t *h )
-{
-    x264_ratecontrol_t *rcc = h->rc;
-    uint64_t all_const_bits = 0;
-    double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-    double duration = 0;
-    for( int i = 0; i < rcc->num_entries; i++ )
-        duration += rcc->entry[i].i_duration;
-    duration *= timescale;
-    uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration;
-    double rate_factor, step_mult;
-    double qblur = h->param.rc.f_qblur;
-    double cplxblur = h->param.rc.f_complexity_blur;
-    const int filter_size = (int)(qblur*4) | 1;
-    double expected_bits;
-    double *qscale, *blurred_qscale;
-    double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-
-    /* find total/average complexity & const_bits */
-    for( int i = 0; i < rcc->num_entries; i++ )
-    {
-        ratecontrol_entry_t *rce = &rcc->entry[i];
-        all_const_bits += rce->misc_bits;
-    }
-
-    if( all_available_bits < all_const_bits)
-    {
-        x264_log( h, X264_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
-                 (int)(all_const_bits * rcc->fps / (rcc->num_entries * 1000.)) );
-        return -1;
-    }
-
-    /* Blur complexities, to reduce local fluctuation of QP.
-     * We don't blur the QPs directly, because then one very simple frame
-     * could drag down the QP of a nearby complex frame and give it more
-     * bits than intended. */
-    for( int i = 0; i < rcc->num_entries; i++ )
-    {
-        ratecontrol_entry_t *rce = &rcc->entry[i];
-        double weight_sum = 0;
-        double cplx_sum = 0;
-        double weight = 1.0;
-        double gaussian_weight;
-        /* weighted average of cplx of future frames */
-        for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ )
-        {
-            ratecontrol_entry_t *rcj = &rcc->entry[i+j];
-            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
-            weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
-            if( weight < .0001 )
-                break;
-            gaussian_weight = weight * exp( -j*j/200.0 );
-            weight_sum += gaussian_weight;
-            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
-        }
-        /* weighted average of cplx of past frames */
-        weight = 1.0;
-        for( int j = 0; j <= cplxblur*2 && j <= i; j++ )
-        {
-            ratecontrol_entry_t *rcj = &rcc->entry[i-j];
-            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
-            gaussian_weight = weight * exp( -j*j/200.0 );
-            weight_sum += gaussian_weight;
-            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
-            weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
-            if( weight < .0001 )
-                break;
-        }
-        rce->blurred_complexity = cplx_sum / weight_sum;
-    }
-
-    CHECKED_MALLOC( qscale, sizeof(double)*rcc->num_entries );
-    if( filter_size > 1 )
-        CHECKED_MALLOC( blurred_qscale, sizeof(double)*rcc->num_entries );
-    else
-        blurred_qscale = qscale;
-
-    /* Search for a factor which, when multiplied by the RCEQ values from
-     * each frame, adds up to the desired total size.
-     * There is no exact closed-form solution because of VBV constraints and
-     * because qscale2bits is not invertible, but we can start with the simple
-     * approximation of scaling the 1st pass by the ratio of bitrates.
-     * The search range is probably overkill, but speed doesn't matter here. */
-
-    expected_bits = 1;
-    for( int i = 0; i < rcc->num_entries; i++ )
-    {
-        double q = get_qscale(h, &rcc->entry[i], 1.0, i);
-        expected_bits += qscale2bits(&rcc->entry[i], q);
-        rcc->last_qscale_for[rcc->entry[i].pict_type] = q;
-    }
-    step_mult = all_available_bits / expected_bits;
-
-    rate_factor = 0;
-    for( double step = 1E4 * step_mult; step > 1E-7 * step_mult; step *= 0.5)
-    {
-        expected_bits = 0;
-        rate_factor += step;
-
-        rcc->last_non_b_pict_type = -1;
-        rcc->last_accum_p_norm = 1;
-        rcc->accum_p_norm = 0;
-
-        rcc->last_qscale_for[0] =
-        rcc->last_qscale_for[1] =
-        rcc->last_qscale_for[2] = pow( base_cplx, 1 - rcc->qcompress ) / rate_factor;
-
-        /* find qscale */
-        for( int i = 0; i < rcc->num_entries; i++ )
-        {
-            qscale[i] = get_qscale( h, &rcc->entry[i], rate_factor, -1 );
-            rcc->last_qscale_for[rcc->entry[i].pict_type] = qscale[i];
-        }
-
-        /* fixed I/B qscale relative to P */
-        for( int i = rcc->num_entries-1; i >= 0; i-- )
-        {
-            qscale[i] = get_diff_limited_q( h, &rcc->entry[i], qscale[i], i );
-            assert(qscale[i] >= 0);
-        }
-
-        /* smooth curve */
-        if( filter_size > 1 )
-        {
-            assert( filter_size%2 == 1 );
-            for( int i = 0; i < rcc->num_entries; i++ )
-            {
-                ratecontrol_entry_t *rce = &rcc->entry[i];
-                double q = 0.0, sum = 0.0;
-
-                for( int j = 0; j < filter_size; j++ )
-                {
-                    int idx = i+j-filter_size/2;
-                    double d = idx-i;
-                    double coeff = qblur==0 ? 1.0 : exp( -d*d/(qblur*qblur) );
-                    if( idx < 0 || idx >= rcc->num_entries )
-                        continue;
-                    if( rce->pict_type != rcc->entry[idx].pict_type )
-                        continue;
-                    q += qscale[idx] * coeff;
-                    sum += coeff;
-                }
-                blurred_qscale[i] = q/sum;
-            }
-        }
-
-        /* find expected bits */
-        for( int i = 0; i < rcc->num_entries; i++ )
-        {
-            ratecontrol_entry_t *rce = &rcc->entry[i];
-            rce->new_qscale = clip_qscale( h, rce->pict_type, blurred_qscale[i] );
-            assert(rce->new_qscale >= 0);
-            expected_bits += qscale2bits( rce, rce->new_qscale );
-        }
-
-        if( expected_bits > all_available_bits )
-            rate_factor -= step;
-    }
-
-    x264_free( qscale );
-    if( filter_size > 1 )
-        x264_free( blurred_qscale );
-
-    if( rcc->b_vbv )
-        if( vbv_pass2( h, all_available_bits ) )
-            return -1;
-    expected_bits = count_expected_bits( h );
-
-    if( fabs( expected_bits/all_available_bits - 1.0 ) > 0.01 )
-    {
-        double avgq = 0;
-        for( int i = 0; i < rcc->num_entries; i++ )
-            avgq += rcc->entry[i].new_qscale;
-        avgq = qscale2qp( avgq / rcc->num_entries );
-
-        if( expected_bits > all_available_bits || !rcc->b_vbv )
-            x264_log( h, X264_LOG_WARNING, "Error: 2pass curve failed to converge\n" );
-        x264_log( h, X264_LOG_WARNING, "target: %.2f kbit/s, expected: %.2f kbit/s, avg QP: %.4f\n",
-                  (float)h->param.rc.i_bitrate,
-                  expected_bits * rcc->fps / (rcc->num_entries * 1000.),
-                  avgq );
-        if( expected_bits < all_available_bits && avgq < h->param.rc.i_qp_min + 2 )
-        {
-            if( h->param.rc.i_qp_min > 0 )
-                x264_log( h, X264_LOG_WARNING, "try reducing target bitrate or reducing qp_min (currently %d)\n", h->param.rc.i_qp_min );
-            else
-                x264_log( h, X264_LOG_WARNING, "try reducing target bitrate\n" );
-        }
-        else if( expected_bits > all_available_bits && avgq > h->param.rc.i_qp_max - 2 )
-        {
-            if( h->param.rc.i_qp_max < QP_MAX )
-                x264_log( h, X264_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", h->param.rc.i_qp_max );
-            else
-                x264_log( h, X264_LOG_WARNING, "try increasing target bitrate\n");
-        }
-        else if( !(rcc->b_2pass && rcc->b_vbv) )
-            x264_log( h, X264_LOG_WARNING, "internal error\n" );
-    }
-
-    return 0;
-fail:
-    return -1;
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/ratecontrol.h b/android/src/main/libenc/jni/libx264/encoder/ratecontrol.h
deleted file mode 100755
index 9f7e227..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/ratecontrol.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*****************************************************************************
- * ratecontrol.h: ratecontrol
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_RATECONTROL_H
-#define X264_RATECONTROL_H
-
-/* Completely arbitrary.  Ratecontrol lowers relative quality at higher framerates
- * and the reverse at lower framerates; this serves as the center of the curve.
- * Halve all the values for frame-packed 3D to compensate for the "doubled"
- * framerate. */
-#define BASE_FRAME_DURATION (0.04f / ((h->param.i_frame_packing == 5)+1))
-
-/* Arbitrary limitations as a sanity check. */
-#define MAX_FRAME_DURATION (1.00f / ((h->param.i_frame_packing == 5)+1))
-#define MIN_FRAME_DURATION (0.01f / ((h->param.i_frame_packing == 5)+1))
-
-#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
-
-int  x264_ratecontrol_new   ( x264_t * );
-void x264_ratecontrol_delete( x264_t * );
-
-void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
-int x264_encoder_reconfig_apply( x264_t *h, x264_param_t *param );
-
-void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
-int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
-int  x264_reference_build_list_optimal( x264_t *h );
-void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
-void x264_ratecontrol_zone_init( x264_t * );
-void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
-int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
-void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
-int  x264_ratecontrol_mb( x264_t *, int bits );
-int  x264_ratecontrol_qp( x264_t * );
-int  x264_ratecontrol_mb_qp( x264_t *h );
-int  x264_ratecontrol_end( x264_t *, int bits, int *filler );
-void x264_ratecontrol_summary( x264_t * );
-void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
-int  x264_ratecontrol_get_estimated_size( x264_t const *);
-int  x264_rc_analyse_slice( x264_t *h );
-int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
-void x264_threads_distribute_ratecontrol( x264_t *h );
-void x264_threads_merge_ratecontrol( x264_t *h );
-void x264_hrd_fullness( x264_t *h );
-#endif
-
diff --git a/android/src/main/libenc/jni/libx264/encoder/rdo.c b/android/src/main/libenc/jni/libx264/encoder/rdo.c
deleted file mode 100755
index ded0413..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/rdo.c
+++ /dev/null
@@ -1,1166 +0,0 @@
-/*****************************************************************************
- * rdo.c: rate-distortion optimization
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-/* duplicate all the writer functions, just calculating bit cost
- * instead of writing the bitstream.
- * TODO: use these for fast 1st pass too. */
-
-#define RDO_SKIP_BS 1
-
-/* Transition and size tables for abs<9 MVD and residual coding */
-/* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
-uint8_t x264_cabac_transition_unary[15][128];
-uint16_t x264_cabac_size_unary[15][128];
-/* Transition and size tables for abs>9 MVD */
-/* Consist of 5 1s and a bypass sign bit */
-static uint8_t cabac_transition_5ones[128];
-static uint16_t cabac_size_5ones[128];
-
-/* CAVLC: produces exactly the same bit count as a normal encode */
-/* this probably still leaves some unnecessary computations */
-#define bs_write1(s,v)     ((s)->i_bits_encoded += 1)
-#define bs_write(s,n,v)    ((s)->i_bits_encoded += (n))
-#define bs_write_ue(s,v)   ((s)->i_bits_encoded += bs_size_ue(v))
-#define bs_write_se(s,v)   ((s)->i_bits_encoded += bs_size_se(v))
-#define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
-#define x264_macroblock_write_cavlc  static x264_macroblock_size_cavlc
-#include "cavlc.c"
-
-/* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
- * fractional bits, but only finite precision. */
-#undef  x264_cabac_encode_decision
-#undef  x264_cabac_encode_decision_noup
-#undef  x264_cabac_encode_bypass
-#undef  x264_cabac_encode_terminal
-#define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
-#define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
-#define x264_cabac_encode_terminal(c)     ((c)->f8_bits_encoded += 7)
-#define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
-#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
-#define x264_macroblock_write_cabac  static x264_macroblock_size_cabac
-#include "cabac.c"
-
-#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
-        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
-#define COPY_CABAC_PART( pos, size )\
-        memcpy( &cb->state[pos], &h->cabac.state[pos], size )
-
-static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
-{
-    static const uint8_t hadamard_shift_x[4] = {4,   4,   3,   3};
-    static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
-    static const uint8_t  hadamard_offset[4] = {0,   1,   3,   5};
-    int cache_index = (x >> hadamard_shift_x[size]) + (y >> hadamard_shift_y[size])
-                    + hadamard_offset[size];
-    uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
-    if( res )
-        return res - 1;
-    else
-    {
-        pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
-        res = h->pixf.hadamard_ac[size]( fenc, FENC_STRIDE );
-        h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
-        return res;
-    }
-}
-
-static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
-{
-    static const uint8_t satd_shift_x[3] = {3,   2,   2};
-    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
-    static const uint8_t  satd_offset[3] = {0,   8,   16};
-    ALIGNED_16( static pixel zero[16] ) = {0};
-    int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
-                    + satd_offset[size - PIXEL_8x4];
-    int res = h->mb.pic.fenc_satd_cache[cache_index];
-    if( res )
-        return res - 1;
-    else
-    {
-        pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
-        int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
-        res = h->pixf.satd[size]( fenc, FENC_STRIDE, zero, 0 ) - dc;
-        h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
-        return res;
-    }
-}
-
-/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
-/* SATD and SA8D are used to measure block complexity. */
-/* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size.  Using SATD */
-/* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
-
-/* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
-/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
-/* This optimization can also be used in non-RD transform decision. */
-
-static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
-{
-    ALIGNED_16( static pixel zero[16] ) = {0};
-    int satd = 0;
-    pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
-    pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
-    if( p == 0 && h->mb.i_psy_rd )
-    {
-        /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
-        if( size <= PIXEL_8x8 )
-        {
-            uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
-            uint64_t fenc_acs = cached_hadamard( h, size, x, y );
-            satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
-                 + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
-            satd >>= 1;
-        }
-        else
-        {
-            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
-            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
-        }
-        satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
-    }
-    return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
-}
-
-static inline int ssd_mb( x264_t *h )
-{
-    int chroma_size = h->luma2chroma_pixel[PIXEL_16x16];
-    int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0);
-    chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd;
-}
-
-static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
-{
-    int b_transform_bak = h->mb.b_transform_8x8;
-    int i_ssd;
-    int i_bits;
-    int type_bak = h->mb.i_type;
-
-    x264_macroblock_encode( h );
-
-    if( h->mb.b_deblock_rdo )
-        x264_macroblock_deblock( h );
-
-    i_ssd = ssd_mb( h );
-
-    if( IS_SKIP( h->mb.i_type ) )
-    {
-        i_bits = (1 * i_lambda2 + 128) >> 8;
-    }
-    else if( h->param.b_cabac )
-    {
-        x264_cabac_t cabac_tmp;
-        COPY_CABAC;
-        x264_macroblock_size_cabac( h, &cabac_tmp );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
-    }
-    else
-    {
-        x264_macroblock_size_cavlc( h );
-        i_bits = ( (uint64_t)h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
-    }
-
-    h->mb.b_transform_8x8 = b_transform_bak;
-    h->mb.i_type = type_bak;
-
-    return X264_MIN( i_ssd + i_bits, COST_MAX );
-}
-
-/* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
-
-static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
-{
-    uint64_t i_ssd, i_bits;
-
-    x264_macroblock_encode_p4x4( h, i4 );
-    if( i_pixel == PIXEL_8x4 )
-        x264_macroblock_encode_p4x4( h, i4+1 );
-    if( i_pixel == PIXEL_4x8 )
-        x264_macroblock_encode_p4x4( h, i4+2 );
-
-    i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
-    if( CHROMA444 )
-    {
-        int chromassd = ssd_plane( h, i_pixel, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 )
-                      + ssd_plane( h, i_pixel, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 );
-        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-        i_ssd += chromassd;
-    }
-
-    if( h->param.b_cabac )
-    {
-        x264_cabac_t cabac_tmp;
-        COPY_CABAC;
-        x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
-    }
-    else
-        i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel );
-
-    return (i_ssd<<8) + i_bits;
-}
-
-uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
-{
-    uint64_t i_ssd, i_bits;
-    int i8 = i4 >> 2;
-
-    if( i_pixel == PIXEL_16x16 )
-    {
-        int i_cost = x264_rd_cost_mb( h, i_lambda2 );
-        return i_cost;
-    }
-
-    if( i_pixel > PIXEL_8x8 )
-        return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
-
-    h->mb.i_cbp_luma = 0;
-
-    x264_macroblock_encode_p8x8( h, i8 );
-    if( i_pixel == PIXEL_16x8 )
-        x264_macroblock_encode_p8x8( h, i8+1 );
-    if( i_pixel == PIXEL_8x16 )
-        x264_macroblock_encode_p8x8( h, i8+2 );
-
-    int ssd_x = 8*(i8&1);
-    int ssd_y = 8*(i8>>1);
-    i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y );
-    int chromapix = h->luma2chroma_pixel[i_pixel];
-    int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT )
-                  + ssd_plane( h, chromapix, 2, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT );
-    i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-
-    if( h->param.b_cabac )
-    {
-        x264_cabac_t cabac_tmp;
-        COPY_CABAC;
-        x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
-    }
-    else
-        i_bits = (uint64_t)x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
-
-    return (i_ssd<<8) + i_bits;
-}
-
-static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[4][32] )
-{
-    uint64_t i_ssd, i_bits;
-    int plane_count = CHROMA444 ? 3 : 1;
-    int i_qp = h->mb.i_qp;
-    h->mb.i_cbp_luma &= ~(1<<i8);
-    h->mb.b_transform_8x8 = 1;
-
-    for( int p = 0; p < plane_count; p++ )
-    {
-        x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p], 1 );
-        i_qp = h->mb.i_chroma_qp;
-    }
-
-    i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
-    if( CHROMA444 )
-    {
-        int chromassd = ssd_plane( h, PIXEL_8x8, 1, (i8&1)*8, (i8>>1)*8 )
-                      + ssd_plane( h, PIXEL_8x8, 2, (i8&1)*8, (i8>>1)*8 );
-        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-        i_ssd += chromassd;
-    }
-
-    if( h->param.b_cabac )
-    {
-        x264_cabac_t cabac_tmp;
-        COPY_CABAC;
-        x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
-    }
-    else
-        i_bits = (uint64_t)x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
-
-    return (i_ssd<<8) + i_bits;
-}
-
-static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
-{
-    uint64_t i_ssd, i_bits;
-    int plane_count = CHROMA444 ? 3 : 1;
-    int i_qp = h->mb.i_qp;
-
-    for( int p = 0; p < plane_count; p++ )
-    {
-        x264_mb_encode_i4x4( h, p, i4, i_qp, i_mode, 1 );
-        i_qp = h->mb.i_chroma_qp;
-    }
-
-    i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
-    if( CHROMA444 )
-    {
-        int chromassd = ssd_plane( h, PIXEL_4x4, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 )
-                      + ssd_plane( h, PIXEL_4x4, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 );
-        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-        i_ssd += chromassd;
-    }
-
-    if( h->param.b_cabac )
-    {
-        x264_cabac_t cabac_tmp;
-        COPY_CABAC;
-        x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
-    }
-    else
-        i_bits = (uint64_t)x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
-
-    return (i_ssd<<8) + i_bits;
-}
-
-static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
-{
-    uint64_t i_ssd, i_bits;
-
-    if( b_dct )
-        x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp );
-
-    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
-    i_ssd = ssd_plane( h, chromapix, 1, 0, 0 )
-          + ssd_plane( h, chromapix, 2, 0, 0 );
-
-    h->mb.i_chroma_pred_mode = i_mode;
-
-    if( h->param.b_cabac )
-    {
-        x264_cabac_t cabac_tmp;
-        COPY_CABAC;
-        x264_chroma_size_cabac( h, &cabac_tmp );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
-    }
-    else
-        i_bits = (uint64_t)x264_chroma_size_cavlc( h ) * i_lambda2;
-
-    return (i_ssd<<8) + i_bits;
-}
-/****************************************************************************
- * Trellis RD quantization
- ****************************************************************************/
-
-#define TRELLIS_SCORE_MAX -1LL // negative marks the node as invalid
-#define TRELLIS_SCORE_BIAS 1LL<<60; // bias so that all valid scores are positive, even after negative contributions from psy
-#define CABAC_SIZE_BITS 8
-#define LAMBDA_BITS 4
-
-/* precalculate the cost of coding various combinations of bits in a single context */
-void x264_rdo_init( void )
-{
-    for( int i_prefix = 0; i_prefix < 15; i_prefix++ )
-    {
-        for( int i_ctx = 0; i_ctx < 128; i_ctx++ )
-        {
-            int f8_bits = 0;
-            uint8_t ctx = i_ctx;
-
-            for( int i = 1; i < i_prefix; i++ )
-                f8_bits += x264_cabac_size_decision2( &ctx, 1 );
-            if( i_prefix > 0 && i_prefix < 14 )
-                f8_bits += x264_cabac_size_decision2( &ctx, 0 );
-            f8_bits += 1 << CABAC_SIZE_BITS; //sign
-
-            x264_cabac_size_unary[i_prefix][i_ctx] = f8_bits;
-            x264_cabac_transition_unary[i_prefix][i_ctx] = ctx;
-        }
-    }
-    for( int i_ctx = 0; i_ctx < 128; i_ctx++ )
-    {
-        int f8_bits = 0;
-        uint8_t ctx = i_ctx;
-
-        for( int i = 0; i < 5; i++ )
-            f8_bits += x264_cabac_size_decision2( &ctx, 1 );
-        f8_bits += 1 << CABAC_SIZE_BITS; //sign
-
-        cabac_size_5ones[i_ctx] = f8_bits;
-        cabac_transition_5ones[i_ctx] = ctx;
-    }
-}
-
-typedef struct
-{
-    uint64_t score;
-    int level_idx; // index into level_tree[]
-    uint8_t cabac_state[4]; // just contexts 0,4,8,9 of the 10 relevant to coding abs_level_m1
-} trellis_node_t;
-
-typedef struct
-{
-    uint16_t next;
-    uint16_t abs_level;
-} trellis_level_t;
-
-// TODO:
-// save cabac state between blocks?
-// use trellis' RD score instead of x264_mb_decimate_score?
-// code 8x8 sig/last flags forwards with deadzone and save the contexts at
-//   each position?
-// change weights when using CQMs?
-
-// possible optimizations:
-// make scores fit in 32bit
-// save quantized coefs during rd, to avoid a duplicate trellis in the final encode
-// if trellissing all MBRD modes, finish SSD calculation so we can skip all of
-//   the normal dequant/idct/ssd/cabac
-
-// the unquant_mf here is not the same as dequant_mf:
-// in normal operation (dct->quant->dequant->idct) the dct and idct are not
-// normalized. quant/dequant absorb those scaling factors.
-// in this function, we just do (quant->unquant) and want the output to be
-// comparable to the input. so unquant is the direct inverse of quant,
-// and uses the dct scaling factors, not the idct ones.
-
-#define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
-
-#define SET_LEVEL(ndst, nsrc, l) {\
-    if( sizeof(trellis_level_t) == sizeof(uint32_t) )\
-        M32( &level_tree[levels_used] ) = pack16to32( nsrc.level_idx, l );\
-    else\
-        level_tree[levels_used] = (trellis_level_t){ nsrc.level_idx, l };\
-    ndst.level_idx = levels_used;\
-    levels_used++;\
-}
-
-// encode all values of the dc coef in a block which is known to have no ac
-static NOINLINE
-int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef_weight, int lambda2, uint8_t *cabac_state, int cost_sig )
-{
-    uint64_t bscore = TRELLIS_SCORE_MAX;
-    int ret = 0;
-    int q = abs( quant_coef );
-    for( int abs_level = q-1; abs_level <= q; abs_level++ )
-    {
-        int unquant_abs_level = (unquant_mf * abs_level + 128) >> 8;
-
-        /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
-        int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);
-        uint64_t score = (uint64_t)d*d * coef_weight;
-
-        /* code the proposed level, and count how much entropy it would take */
-        if( abs_level )
-        {
-            unsigned f8_bits = cost_sig;
-            int prefix = X264_MIN( abs_level - 1, 14 );
-            f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 );
-            f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]];
-            if( abs_level >= 15 )
-                f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
-            score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
-        }
-
-        COPY2_IF_LT( bscore, score, ret, abs_level );
-    }
-    return SIGN(ret, sign_coef);
-}
-
-// encode one value of one coef in one context
-static ALWAYS_INLINE
-int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_cost,
-                  int node_ctx, int level1_ctx, int levelgt1_ctx, uint64_t ssd, int cost_siglast[3],
-                  trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                  trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state )
-{
-    uint64_t score = nodes_prev[j].score + ssd;
-    /* code the proposed level, and count how much entropy it would take */
-    unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
-    uint8_t level1_state = (j >= 3) ? nodes_prev[j].cabac_state[level1_ctx>>2] : level_state[level1_ctx];
-    f8_bits += x264_cabac_entropy[level1_state ^ (const_level > 1)];
-    uint8_t levelgt1_state;
-    if( const_level > 1 )
-    {
-        levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx];
-        f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
-    }
-    else
-        f8_bits += 1 << CABAC_SIZE_BITS;
-    score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
-
-    /* save the node if it's better than any existing node with the same cabac ctx */
-    if( score < nodes_cur[node_ctx].score )
-    {
-        nodes_cur[node_ctx].score = score;
-        if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state
-            M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12);
-        else if( j >= 3 )
-            M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state);
-        if( j >= 3 ) // skip the transition if we're not going to reuse the context
-            nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1];
-        if( const_level > 1 && node_ctx == 7 )
-            nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state];
-        nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx;
-        SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level );
-    }
-    return levels_used;
-}
-
-// encode one value of one coef in all contexts, templated by which value that is.
-// in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure.
-// in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning,
-// so exploiting that would be more complicated.
-static NOINLINE
-int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                     trellis_level_t *level_tree, int levels_used )
-{
-    nodes_cur[0].score = nodes_prev[0].score + ssd0;
-    nodes_cur[0].level_idx = nodes_prev[0].level_idx;
-    for( int j = 1; j < 4 && (int64_t)nodes_prev[j].score >= 0; j++ )
-    {
-        nodes_cur[j].score = nodes_prev[j].score;
-        if( j >= 3 )
-            M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
-        SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
-    }
-    return levels_used;
-}
-
-static NOINLINE
-int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                     trellis_level_t *level_tree, int levels_used )
-{
-    for( int j = 1; j < 8; j++ )
-        // this branch only affects speed, not function; there's nothing wrong with updating invalid nodes in coef0.
-        if( (int64_t)nodes_prev[j].score >= 0 )
-        {
-            nodes_cur[j].score = nodes_prev[j].score;
-            if( j >= 3 )
-                M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
-            SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
-        }
-    return levels_used;
-}
-
-#define COEF(const_level, ctx_hi, j, ...)\
-    if( !j || (int64_t)nodes_prev[j].score >= 0 )\
-        levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\
-                                    j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\
-                                    level_tree, levels_used, lambda2, level_state );\
-    else if( !ctx_hi )\
-        return levels_used;
-
-static NOINLINE
-int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
-                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                     trellis_level_t *level_tree, int levels_used, int lambda2,
-                     uint8_t *level_state )
-{
-    int abs_level = 1, prefix = 1, suffix_cost = 0;
-    COEF( 1, 0, 0, 1, 1, 0 );
-    COEF( 1, 0, 1, 2, 2, 0 );
-    COEF( 1, 0, 2, 3, 3, 0 );
-    COEF( 1, 0, 3, 3, 4, 0 );
-    return levels_used;
-}
-
-static NOINLINE
-int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
-                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                     trellis_level_t *level_tree, int levels_used, int lambda2,
-                     uint8_t *level_state )
-{
-    int abs_level = 1, prefix = 1, suffix_cost = 0;
-    COEF( 1, 1, 1, 2, 2, 0 );
-    COEF( 1, 1, 2, 3, 3, 0 );
-    COEF( 1, 1, 3, 3, 4, 0 );
-    COEF( 1, 1, 4, 4, 0, 0 );
-    COEF( 1, 1, 5, 5, 0, 0 );
-    COEF( 1, 1, 6, 6, 0, 0 );
-    COEF( 1, 1, 7, 7, 0, 0 );
-    return levels_used;
-}
-
-static NOINLINE
-int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
-                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                     trellis_level_t *level_tree, int levels_used, int lambda2,
-                     uint8_t *level_state, int levelgt1_ctx )
-{
-    int prefix = X264_MIN( abs_level-1, 14 );
-    int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
-    COEF( 2, 0, 0, 4, 1, 5 );
-    COEF( 2, 0, 1, 4, 2, 5 );
-    COEF( 2, 0, 2, 4, 3, 5 );
-    COEF( 2, 0, 3, 4, 4, 5 );
-    return levels_used;
-}
-
-static NOINLINE
-int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
-                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
-                     trellis_level_t *level_tree, int levels_used, int lambda2,
-                     uint8_t *level_state, int levelgt1_ctx )
-{
-    int prefix = X264_MIN( abs_level-1, 14 );
-    int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
-    COEF( 2, 1, 1, 4, 2, 5 );
-    COEF( 2, 1, 2, 4, 3, 5 );
-    COEF( 2, 1, 3, 4, 4, 5 );
-    COEF( 2, 1, 4, 5, 0, 6 );
-    COEF( 2, 1, 5, 6, 0, 7 );
-    COEF( 2, 1, 6, 7, 0, 8 );
-    COEF( 2, 1, 7, 7, 0, levelgt1_ctx );
-    return levels_used;
-}
-
-static ALWAYS_INLINE
-int quant_trellis_cabac( x264_t *h, dctcoef *dct,
-                         udctcoef *quant_mf, udctcoef *quant_bias, const int *unquant_mf,
-                         const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
-                         int b_chroma, int dc, int num_coefs, int idx )
-{
-    ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
-    ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
-    const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
-    const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
-    const int b_interlaced = MB_INTERLACED;
-    uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
-    uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
-    int levelgt1_ctx = b_chroma && dc ? 8 : 9;
-
-    if( dc )
-    {
-        if( num_coefs == 16 )
-        {
-            memcpy( orig_coefs, dct, sizeof(dctcoef)*16 );
-            if( !h->quantf.quant_4x4_dc( dct, quant_mf[0] >> 1, quant_bias[0] << 1 ) )
-                return 0;
-            h->zigzagf.scan_4x4( quant_coefs, dct );
-        }
-        else
-        {
-            memcpy( orig_coefs, dct, sizeof(dctcoef)*num_coefs );
-            int nz = h->quantf.quant_2x2_dc( &dct[0], quant_mf[0] >> 1, quant_bias[0] << 1 );
-            if( num_coefs == 8 )
-                nz |= h->quantf.quant_2x2_dc( &dct[4], quant_mf[0] >> 1, quant_bias[0] << 1 );
-            if( !nz )
-                return 0;
-            for( int i = 0; i < num_coefs; i++ )
-                quant_coefs[i] = dct[zigzag[i]];
-        }
-    }
-    else
-    {
-        if( num_coefs == 64 )
-        {
-            h->mc.memcpy_aligned( orig_coefs, dct, sizeof(dctcoef)*64 );
-            if( !h->quantf.quant_8x8( dct, quant_mf, quant_bias ) )
-                return 0;
-            h->zigzagf.scan_8x8( quant_coefs, dct );
-        }
-        else //if( num_coefs == 16 )
-        {
-            memcpy( orig_coefs, dct, sizeof(dctcoef)*16 );
-            if( !h->quantf.quant_4x4( dct, quant_mf, quant_bias ) )
-                return 0;
-            h->zigzagf.scan_4x4( quant_coefs, dct );
-        }
-    }
-
-    int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac;
-    uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ];
-
-    /* shortcut for dc-only blocks.
-     * this doesn't affect the output, but saves some unnecessary computation. */
-    if( last_nnz == 0 && !dc )
-    {
-        int cost_sig = x264_cabac_size_decision_noup2( &cabac_state_sig[0], 1 )
-                     + x264_cabac_size_decision_noup2( &cabac_state_last[0], 1 );
-        dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig );
-        return !!dct[0];
-    }
-
-#if HAVE_MMX && ARCH_X86_64
-#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
-                     cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
-    if( num_coefs == 16 && !dc )
-        if( b_chroma || !h->mb.i_psy_trellis )
-            return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac );
-        else
-            return h->quantf.trellis_cabac_4x4_psy( TRELLIS_ARGS, b_ac, h->mb.pic.fenc_dct4[idx&15], h->mb.i_psy_trellis );
-    else if( num_coefs == 64 && !dc )
-        if( b_chroma || !h->mb.i_psy_trellis )
-            return h->quantf.trellis_cabac_8x8( TRELLIS_ARGS, b_interlaced );
-        else
-            return h->quantf.trellis_cabac_8x8_psy( TRELLIS_ARGS, b_interlaced, h->mb.pic.fenc_dct8[idx&3], h->mb.i_psy_trellis);
-    else if( num_coefs == 8 && dc )
-        return h->quantf.trellis_cabac_chroma_422_dc( TRELLIS_ARGS );
-    else if( dc )
-        return h->quantf.trellis_cabac_dc( TRELLIS_ARGS, num_coefs-1 );
-#endif
-
-    // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
-    // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
-    // but it takes more time to remove dead states than you gain in reduced memory.
-    trellis_level_t level_tree[64*8*2];
-    int levels_used = 1;
-    /* init trellis */
-    trellis_node_t nodes[2][8];
-    trellis_node_t *nodes_cur = nodes[0];
-    trellis_node_t *nodes_prev = nodes[1];
-    trellis_node_t *bnode;
-    for( int j = 1; j < 4; j++ )
-        nodes_cur[j].score = TRELLIS_SCORE_MAX;
-    nodes_cur[0].score = TRELLIS_SCORE_BIAS;
-    nodes_cur[0].level_idx = 0;
-    level_tree[0].abs_level = 0;
-    level_tree[0].next = 0;
-    ALIGNED_4( uint8_t level_state[16] );
-    memcpy( level_state, cabac_state, 10 );
-    level_state[12] = cabac_state[0]; // packed subset for copying into trellis_node_t
-    level_state[13] = cabac_state[4];
-    level_state[14] = cabac_state[8];
-    level_state[15] = cabac_state[9];
-
-    idx &= num_coefs == 64 ? 3 : 15;
-
-    // coefs are processed in reverse order, because that's how the abs value is coded.
-    // last_coef and significant_coef flags are normally coded in forward order, but
-    // we have to reverse them to match the levels.
-    // in 4x4 blocks, last_coef and significant_coef use a separate context for each
-    // position, so the order doesn't matter, and we don't even have to update their contexts.
-    // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
-    // cabac isn't too sensitive.
-    int i = last_nnz;
-#define TRELLIS_LOOP(ctx_hi)\
-    for( ; i >= b_ac; i-- )\
-    {\
-        /* skip 0s: this doesn't affect the output, but saves some unnecessary computation. */\
-        if( !quant_coefs[i] )\
-        {\
-            /* no need to calculate ssd of 0s: it's the same in all nodes.\
-             * no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
-             * subtracting from one score is equivalent to adding to the rest. */\
-            if( !ctx_hi )\
-            {\
-                int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
-                               b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
-                uint64_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )\
-                                   * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
-                nodes_cur[0].score -= cost_sig0;\
-            }\
-            for( int j = 1; j < (ctx_hi?8:4); j++ )\
-                SET_LEVEL( nodes_cur[j], nodes_cur[j], 0 );\
-            continue;\
-        }\
-\
-        int sign_coef = orig_coefs[zigzag[i]];\
-        int abs_coef = abs( sign_coef );\
-        int q = abs( quant_coefs[i] );\
-        int cost_siglast[3]; /* { zero, nonzero, nonzero-and-last } */\
-        XCHG( trellis_node_t*, nodes_cur, nodes_prev );\
-        for( int j = ctx_hi; j < 8; j++ )\
-            nodes_cur[j].score = TRELLIS_SCORE_MAX;\
-\
-        if( i < num_coefs-1 || ctx_hi )\
-        {\
-            int sigindex  = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
-                            b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
-            int lastindex = !dc && num_coefs == 64 ? x264_last_coeff_flag_offset_8x8[i] :\
-                            b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
-            cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );\
-            int cost_sig1   = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );\
-            cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;\
-            if( !ctx_hi )\
-                cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;\
-        }\
-        else\
-        {\
-            cost_siglast[0] = cost_siglast[1] = cost_siglast[2] = 0;\
-        }\
-\
-        /* there are a few cases where increasing the coeff magnitude helps,\
-         * but it's only around .003 dB, and skipping them ~doubles the speed of trellis.\
-         * could also try q-2: that sometimes helps, but also sometimes decimates blocks\
-         * that are better left coded, especially at QP > 40. */\
-        uint64_t ssd0[2], ssd1[2];\
-        for( int k = 0; k < 2; k++ )\
-        {\
-            int abs_level = q-1+k;\
-            int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);\
-            int d = abs_coef - unquant_abs_level;\
-            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */\
-            if( h->mb.i_psy_trellis && i && !dc && !b_chroma )\
-            {\
-                int orig_coef = (num_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];\
-                int predicted_coef = orig_coef - sign_coef;\
-                int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));\
-                int psy_weight = coef_weight1[zigzag[i]] * h->mb.i_psy_trellis;\
-                ssd1[k] = (uint64_t)d*d * coef_weight2[zigzag[i]] - psy_weight * psy_value;\
-            }\
-            else\
-            /* FIXME: for i16x16 dc is this weight optimal? */\
-                ssd1[k] = (uint64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\
-            ssd0[k] = ssd1[k];\
-            if( !i && !dc && !ctx_hi )\
-            {\
-                /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */\
-                d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);\
-                ssd0[k] = (uint64_t)d*d * coef_weight2[zigzag[i]];\
-            }\
-        }\
-\
-        /* argument passing imposes some significant overhead here. gcc's interprocedural register allocation isn't up to it. */\
-        switch( q )\
-        {\
-        case 1:\
-            ssd1[0] += (uint64_t)cost_siglast[0] * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
-            levels_used = trellis_coef0_##ctx_hi( ssd0[0]-ssd1[0], nodes_cur, nodes_prev, level_tree, levels_used );\
-            levels_used = trellis_coef1_##ctx_hi( ssd0[1]-ssd1[0], ssd1[1]-ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\
-            goto next##ctx_hi;\
-        case 2:\
-            levels_used = trellis_coef1_##ctx_hi( ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\
-            levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
-            goto next1;\
-        default:\
-            levels_used = trellis_coefn_##ctx_hi( q-1, ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
-            levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
-            goto next1;\
-        }\
-        next##ctx_hi:;\
-    }\
-    /* output levels from the best path through the trellis */\
-    bnode = &nodes_cur[ctx_hi];\
-    for( int j = ctx_hi+1; j < (ctx_hi?8:4); j++ )\
-        if( nodes_cur[j].score < bnode->score )\
-            bnode = &nodes_cur[j];
-
-    // keep 2 versions of the main quantization loop, depending on which subsets of the node_ctxs are live
-    // node_ctx 0..3, i.e. having not yet encountered any coefs that might be quantized to >1
-    TRELLIS_LOOP(0);
-
-    if( bnode == &nodes_cur[0] )
-    {
-        /* We only need to zero an empty 4x4 block. 8x8 can be
-           implicitly emptied via zero nnz, as can dc. */
-        if( num_coefs == 16 && !dc )
-            memset( dct, 0, 16 * sizeof(dctcoef) );
-        return 0;
-    }
-
-    if(0) // accessible only by goto, not fallthrough
-    {
-        // node_ctx 1..7 (ctx0 ruled out because we never try both level0 and level2+ on the same coef)
-        TRELLIS_LOOP(1);
-    }
-
-    int level = bnode->level_idx;
-    for( i = b_ac; i <= last_nnz; i++ )
-    {
-        dct[zigzag[i]] = SIGN(level_tree[level].abs_level, dct[zigzag[i]]);
-        level = level_tree[level].next;
-    }
-
-    return 1;
-}
-
-/* FIXME: This is a gigantic hack.  See below.
- *
- * CAVLC is much more difficult to trellis than CABAC.
- *
- * CABAC has only three states to track: significance map, last, and the
- * level state machine.
- * CAVLC, by comparison, has five: coeff_token (trailing + total),
- * total_zeroes, zero_run, and the level state machine.
- *
- * I know of no paper that has managed to design a close-to-optimal trellis
- * that covers all five of these and isn't exponential-time.  As a result, this
- * "trellis" isn't: it's just a QNS search.  Patches welcome for something better.
- * It's actually surprisingly fast, albeit not quite optimal.  It's pretty close
- * though; since CAVLC only has 2^16 possible rounding modes (assuming only two
- * roundings as options), a bruteforce search is feasible.  Testing shows
- * that this QNS is reasonably close to optimal in terms of compression.
- *
- * TODO:
- *  Don't bother changing large coefficients when it wouldn't affect bit cost
- *  (e.g. only affecting bypassed suffix bits).
- *  Don't re-run all parts of CAVLC bit cost calculation when not necessary.
- *  e.g. when changing a coefficient from one non-zero value to another in
- *  such a way that trailing ones and suffix length isn't affected. */
-static ALWAYS_INLINE
-int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
-                         const udctcoef *quant_mf, const int *unquant_mf,
-                         const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
-                         int b_chroma, int dc, int num_coefs, int idx, int b_8x8 )
-{
-    ALIGNED_16( dctcoef quant_coefs[2][16] );
-    ALIGNED_16( dctcoef coefs[16] ) = {0};
-    const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
-    const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
-    int delta_distortion[16];
-    int64_t score = 1ULL<<62;
-    int i, j;
-    const int f = 1<<15;
-    int nC = b_chroma && dc ? 3 + (num_coefs>>2)
-                            : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )];
-
-    /* Code for handling 8x8dct -> 4x4dct CAVLC munging.  Input/output use a different
-     * step/start/end than internal processing. */
-    int step = 1;
-    int start = b_ac;
-    int end = num_coefs - 1;
-    if( b_8x8 )
-    {
-        start = idx&3;
-        end = 60 + start;
-        step = 4;
-    }
-    idx &= 15;
-
-    lambda2 <<= LAMBDA_BITS;
-
-    /* Find last non-zero coefficient. */
-    for( i = end; i >= start; i -= step )
-        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
-            break;
-
-    if( i < start )
-        goto zeroblock;
-
-    /* Prepare for QNS search: calculate distortion caused by each DCT coefficient
-     * rounding to be searched.
-     *
-     * We only search two roundings (nearest and nearest-1) like in CABAC trellis,
-     * so we just store the difference in distortion between them. */
-    int last_nnz = b_8x8 ? i >> 2 : i;
-    int coef_mask = 0;
-    int round_mask = 0;
-    for( i = b_ac, j = start; i <= last_nnz; i++, j += step )
-    {
-        int coef = dct[zigzag[j]];
-        int abs_coef = abs(coef);
-        int sign = coef < 0 ? -1 : 1;
-        int nearest_quant = ( f + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
-        quant_coefs[1][i] = quant_coefs[0][i] = sign * nearest_quant;
-        coefs[i] = quant_coefs[1][i];
-        if( nearest_quant )
-        {
-            /* We initialize the trellis with a deadzone halfway between nearest rounding
-             * and always-round-down.  This gives much better results than initializing to either
-             * extreme.
-             * FIXME: should we initialize to the deadzones used by deadzone quant? */
-            int deadzone_quant = ( f/2 + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
-            int unquant1 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-0) + 128) >> 8);
-            int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8);
-            int d1 = abs_coef - unquant1;
-            int d0 = abs_coef - unquant0;
-            delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight2[zigzag[j]]);
-
-            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
-            if( h->mb.i_psy_trellis && j && !dc && !b_chroma )
-            {
-                int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]];
-                int predicted_coef = orig_coef - coef;
-                int psy_weight = coef_weight1[zigzag[j]];
-                int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign);
-                int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign);
-                delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight;
-            }
-
-            quant_coefs[0][i] = sign * (nearest_quant-1);
-            if( deadzone_quant != nearest_quant )
-                coefs[i] = quant_coefs[0][i];
-            else
-                round_mask |= 1 << i;
-        }
-        else
-            delta_distortion[i] = 0;
-        coef_mask |= (!!coefs[i]) << i;
-    }
-
-    /* Calculate the cost of the starting state. */
-    h->out.bs.i_bits_encoded = 0;
-    if( !coef_mask )
-        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
-    else
-        x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC );
-    score = (int64_t)h->out.bs.i_bits_encoded * lambda2;
-
-    /* QNS loop: pick the change that improves RD the most, apply it, repeat.
-     * coef_mask and round_mask are used to simplify tracking of nonzeroness
-     * and rounding modes chosen. */
-    while( 1 )
-    {
-        int64_t iter_score = score;
-        int iter_distortion_delta = 0;
-        int iter_coef = -1;
-        int iter_mask = coef_mask;
-        int iter_round = round_mask;
-        for( i = b_ac; i <= last_nnz; i++ )
-        {
-            if( !delta_distortion[i] )
-                continue;
-
-            /* Set up all the variables for this iteration. */
-            int cur_round = round_mask ^ (1 << i);
-            int round_change = (cur_round >> i)&1;
-            int old_coef = coefs[i];
-            int new_coef = quant_coefs[round_change][i];
-            int cur_mask = (coef_mask&~(1 << i))|(!!new_coef << i);
-            int cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1);
-            int64_t cur_score = cur_distortion_delta;
-            coefs[i] = new_coef;
-
-            /* Count up bits. */
-            h->out.bs.i_bits_encoded = 0;
-            if( !cur_mask )
-                bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
-            else
-                x264_cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC );
-            cur_score += (int64_t)h->out.bs.i_bits_encoded * lambda2;
-
-            coefs[i] = old_coef;
-            if( cur_score < iter_score )
-            {
-                iter_score = cur_score;
-                iter_coef = i;
-                iter_mask = cur_mask;
-                iter_round = cur_round;
-                iter_distortion_delta = cur_distortion_delta;
-            }
-        }
-        if( iter_coef >= 0 )
-        {
-            score = iter_score - iter_distortion_delta;
-            coef_mask = iter_mask;
-            round_mask = iter_round;
-            coefs[iter_coef] = quant_coefs[((round_mask >> iter_coef)&1)][iter_coef];
-            /* Don't try adjusting coefficients we've already adjusted.
-             * Testing suggests this doesn't hurt results -- and sometimes actually helps. */
-            delta_distortion[iter_coef] = 0;
-        }
-        else
-            break;
-    }
-
-    if( coef_mask )
-    {
-        for( i = b_ac, j = start; i < num_coefs; i++, j += step )
-            dct[zigzag[j]] = coefs[i];
-        return 1;
-    }
-
-zeroblock:
-    if( !dc )
-    {
-        if( b_8x8 )
-            for( i = start; i <= end; i+=step )
-                dct[zigzag[i]] = 0;
-        else
-            memset( dct, 0, 16*sizeof(dctcoef) );
-    }
-    return 0;
-}
-
-int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx )
-{
-    if( h->param.b_cabac )
-        return quant_trellis_cabac( h, dct,
-            h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp],
-            h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx );
-
-    return quant_trellis_cavlc( h, dct,
-        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED],
-        DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 );
-}
-
-static const uint8_t x264_zigzag_scan2x2[4] = { 0, 1, 2, 3 };
-static const uint8_t x264_zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 };
-
-int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx )
-{
-    const uint8_t *zigzag;
-    int num_coefs;
-    int quant_cat = CQM_4IC+1 - b_intra;
-
-    if( CHROMA_FORMAT == CHROMA_422 )
-    {
-        zigzag = x264_zigzag_scan2x4;
-        num_coefs = 8;
-    }
-    else
-    {
-        zigzag = x264_zigzag_scan2x2;
-        num_coefs = 4;
-    }
-
-    if( h->param.b_cabac )
-        return quant_trellis_cabac( h, dct,
-            h->quant4_mf[quant_cat][i_qp], h->quant4_bias0[quant_cat][i_qp],
-            h->unquant4_mf[quant_cat][i_qp], zigzag,
-            DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx );
-
-    return quant_trellis_cavlc( h, dct,
-        h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], zigzag,
-        DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 );
-}
-
-int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                            int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
-{
-    static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};
-    int b_ac = ctx_ac[ctx_block_cat];
-    if( h->param.b_cabac )
-        return quant_trellis_cabac( h, dct,
-            h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp],
-            h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx );
-
-    return quant_trellis_cavlc( h, dct,
-            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
-            x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx, 0 );
-}
-
-int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                            int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
-{
-    if( h->param.b_cabac )
-    {
-        return quant_trellis_cabac( h, dct,
-            h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias0[i_quant_cat][i_qp],
-            h->unquant8_mf[i_quant_cat][i_qp], x264_zigzag_scan8[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 64, idx );
-    }
-
-    /* 8x8 CAVLC is split into 4 4x4 blocks */
-    int nzaccum = 0;
-    for( int i = 0; i < 4; i++ )
-    {
-        int nz = quant_trellis_cavlc( h, dct,
-            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
-            x264_zigzag_scan8[MB_INTERLACED],
-            DCT_LUMA_4x4, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 16, idx*4+i, 1 );
-        /* Set up nonzero count for future calls */
-        h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
-        nzaccum |= nz;
-    }
-    STORE_8x8_NNZ( 0, idx, 0 );
-    return nzaccum;
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/set.c b/android/src/main/libenc/jni/libx264/encoder/set.c
deleted file mode 100755
index a3a550c..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/set.c
+++ /dev/null
@@ -1,850 +0,0 @@
-/*****************************************************************************
- * set: header writing
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "set.h"
-
-#define bs_write_ue bs_write_ue_big
-
-// Indexed by pic_struct values
-static const uint8_t num_clock_ts[10] = { 0, 1, 1, 1, 2, 2, 3, 3, 2, 3 };
-const static uint8_t avcintra_uuid[] = {0xF7, 0x49, 0x3E, 0xB3, 0xD4, 0x00, 0x47, 0x96, 0x86, 0x86, 0xC9, 0x70, 0x7B, 0x64, 0x37, 0x2A};
-
-static void transpose( uint8_t *buf, int w )
-{
-    for( int i = 0; i < w; i++ )
-        for( int j = 0; j < i; j++ )
-            XCHG( uint8_t, buf[w*i+j], buf[w*j+i] );
-}
-
-static void scaling_list_write( bs_t *s, x264_pps_t *pps, int idx )
-{
-    const int len = idx<4 ? 16 : 64;
-    const uint8_t *zigzag = idx<4 ? x264_zigzag_scan4[0] : x264_zigzag_scan8[0];
-    const uint8_t *list = pps->scaling_list[idx];
-    const uint8_t *def_list = (idx==CQM_4IC) ? pps->scaling_list[CQM_4IY]
-                            : (idx==CQM_4PC) ? pps->scaling_list[CQM_4PY]
-                            : (idx==CQM_8IC+4) ? pps->scaling_list[CQM_8IY+4]
-                            : (idx==CQM_8PC+4) ? pps->scaling_list[CQM_8PY+4]
-                            : x264_cqm_jvt[idx];
-    if( !memcmp( list, def_list, len ) )
-        bs_write1( s, 0 );   // scaling_list_present_flag
-    else if( !memcmp( list, x264_cqm_jvt[idx], len ) )
-    {
-        bs_write1( s, 1 );   // scaling_list_present_flag
-        bs_write_se( s, -8 ); // use jvt list
-    }
-    else
-    {
-        int run;
-        bs_write1( s, 1 );   // scaling_list_present_flag
-
-        // try run-length compression of trailing values
-        for( run = len; run > 1; run-- )
-            if( list[zigzag[run-1]] != list[zigzag[run-2]] )
-                break;
-        if( run < len && len - run < bs_size_se( (int8_t)-list[zigzag[run]] ) )
-            run = len;
-
-        for( int j = 0; j < run; j++ )
-            bs_write_se( s, (int8_t)(list[zigzag[j]] - (j>0 ? list[zigzag[j-1]] : 8)) ); // delta
-
-        if( run < len )
-            bs_write_se( s, (int8_t)-list[zigzag[run]] );
-    }
-}
-
-void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type )
-{
-    int i;
-
-    bs_realign( s );
-
-    for( i = 0; i <= payload_type-255; i += 255 )
-        bs_write( s, 8, 255 );
-    bs_write( s, 8, payload_type-i );
-
-    for( i = 0; i <= payload_size-255; i += 255 )
-        bs_write( s, 8, 255 );
-    bs_write( s, 8, payload_size-i );
-
-    for( i = 0; i < payload_size; i++ )
-        bs_write( s, 8, payload[i] );
-
-    bs_rbsp_trailing( s );
-    bs_flush( s );
-}
-
-void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
-{
-    int csp = param->i_csp & X264_CSP_MASK;
-
-    sps->i_id = i_id;
-    sps->i_mb_width = ( param->i_width + 15 ) / 16;
-    sps->i_mb_height= ( param->i_height + 15 ) / 16;
-    sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 :
-                               csp >= X264_CSP_I422 ? CHROMA_422 : CHROMA_420;
-
-    sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
-    if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 )
-        sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
-    else if( sps->i_chroma_format_idc == CHROMA_422 )
-        sps->i_profile_idc  = PROFILE_HIGH422;
-    else if( BIT_DEPTH > 8 )
-        sps->i_profile_idc  = PROFILE_HIGH10;
-    else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
-        sps->i_profile_idc  = PROFILE_HIGH;
-    else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 )
-        sps->i_profile_idc  = PROFILE_MAIN;
-    else
-        sps->i_profile_idc  = PROFILE_BASELINE;
-
-    sps->b_constraint_set0  = sps->i_profile_idc == PROFILE_BASELINE;
-    /* x264 doesn't support the features that are in Baseline and not in Main,
-     * namely arbitrary_slice_order and slice_groups. */
-    sps->b_constraint_set1  = sps->i_profile_idc <= PROFILE_MAIN;
-    /* Never set constraint_set2, it is not necessary and not used in real world. */
-    sps->b_constraint_set2  = 0;
-    sps->b_constraint_set3  = 0;
-
-    sps->i_level_idc = param->i_level_idc;
-    if( param->i_level_idc == 9 && ( sps->i_profile_idc == PROFILE_BASELINE || sps->i_profile_idc == PROFILE_MAIN ) )
-    {
-        sps->b_constraint_set3 = 1; /* level 1b with Baseline or Main profile is signalled via constraint_set3 */
-        sps->i_level_idc      = 11;
-    }
-    /* Intra profiles */
-    if( param->i_keyint_max == 1 && sps->i_profile_idc > PROFILE_HIGH )
-        sps->b_constraint_set3 = 1;
-
-    sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
-    /* extra slot with pyramid so that we don't have to override the
-     * order of forgetting old pictures */
-    sps->vui.i_max_dec_frame_buffering =
-    sps->i_num_ref_frames = X264_MIN(X264_REF_MAX, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
-                            param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
-    sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
-    if( param->i_keyint_max == 1 )
-    {
-        sps->i_num_ref_frames = 0;
-        sps->vui.i_max_dec_frame_buffering = 0;
-    }
-
-    /* number of refs + current frame */
-    int max_frame_num = sps->vui.i_max_dec_frame_buffering * (!!param->i_bframe_pyramid+1) + 1;
-    /* Intra refresh cannot write a recovery time greater than max frame num-1 */
-    if( param->b_intra_refresh )
-    {
-        int time_to_recovery = X264_MIN( sps->i_mb_width - 1, param->i_keyint_max ) + param->i_bframe - 1;
-        max_frame_num = X264_MAX( max_frame_num, time_to_recovery+1 );
-    }
-
-    sps->i_log2_max_frame_num = 4;
-    while( (1 << sps->i_log2_max_frame_num) <= max_frame_num )
-        sps->i_log2_max_frame_num++;
-
-    sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2;
-    if( sps->i_poc_type == 0 )
-    {
-        int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2;
-        sps->i_log2_max_poc_lsb = 4;
-        while( (1 << sps->i_log2_max_poc_lsb) <= max_delta_poc * 2 )
-            sps->i_log2_max_poc_lsb++;
-    }
-
-    sps->b_vui = 1;
-
-    sps->b_gaps_in_frame_num_value_allowed = 0;
-    sps->b_frame_mbs_only = !(param->b_interlaced || param->b_fake_interlaced);
-    if( !sps->b_frame_mbs_only )
-        sps->i_mb_height = ( sps->i_mb_height + 1 ) & ~1;
-    sps->b_mb_adaptive_frame_field = param->b_interlaced;
-    sps->b_direct8x8_inference = 1;
-
-    x264_sps_init_reconfigurable( sps, param );
-
-    sps->vui.b_overscan_info_present = param->vui.i_overscan > 0 && param->vui.i_overscan <= 2;
-    if( sps->vui.b_overscan_info_present )
-        sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );
-
-    sps->vui.b_signal_type_present = 0;
-    sps->vui.i_vidformat = ( param->vui.i_vidformat >= 0 && param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
-    sps->vui.b_fullrange = ( param->vui.b_fullrange >= 0 && param->vui.b_fullrange <= 1 ? param->vui.b_fullrange :
-                           ( csp >= X264_CSP_BGR ? 1 : 0 ) );
-    sps->vui.b_color_description_present = 0;
-
-    sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 12 ? param->vui.i_colorprim : 2 );
-    sps->vui.i_transfer  = ( param->vui.i_transfer  >= 0 && param->vui.i_transfer  <= 17 ? param->vui.i_transfer  : 2 );
-    sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 11 ? param->vui.i_colmatrix :
-                           ( csp >= X264_CSP_BGR ? 0 : 2 ) );
-    if( sps->vui.i_colorprim != 2 ||
-        sps->vui.i_transfer  != 2 ||
-        sps->vui.i_colmatrix != 2 )
-    {
-        sps->vui.b_color_description_present = 1;
-    }
-
-    if( sps->vui.i_vidformat != 5 ||
-        sps->vui.b_fullrange ||
-        sps->vui.b_color_description_present )
-    {
-        sps->vui.b_signal_type_present = 1;
-    }
-
-    /* FIXME: not sufficient for interlaced video */
-    sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5 &&
-                                         sps->i_chroma_format_idc == CHROMA_420;
-    if( sps->vui.b_chroma_loc_info_present )
-    {
-        sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc;
-        sps->vui.i_chroma_loc_bottom = param->vui.i_chroma_loc;
-    }
-
-    sps->vui.b_timing_info_present = param->i_timebase_num > 0 && param->i_timebase_den > 0;
-
-    if( sps->vui.b_timing_info_present )
-    {
-        sps->vui.i_num_units_in_tick = param->i_timebase_num;
-        sps->vui.i_time_scale = param->i_timebase_den * 2;
-        sps->vui.b_fixed_frame_rate = !param->b_vfr_input;
-    }
-
-    sps->vui.b_vcl_hrd_parameters_present = 0; // we don't support VCL HRD
-    sps->vui.b_nal_hrd_parameters_present = !!param->i_nal_hrd;
-    sps->vui.b_pic_struct_present = param->b_pic_struct;
-
-    // NOTE: HRD related parts of the SPS are initialised in x264_ratecontrol_init_reconfigurable
-
-    sps->vui.b_bitstream_restriction = param->i_keyint_max > 1;
-    if( sps->vui.b_bitstream_restriction )
-    {
-        sps->vui.b_motion_vectors_over_pic_boundaries = 1;
-        sps->vui.i_max_bytes_per_pic_denom = 0;
-        sps->vui.i_max_bits_per_mb_denom = 0;
-        sps->vui.i_log2_max_mv_length_horizontal =
-        sps->vui.i_log2_max_mv_length_vertical = (int)log2f( X264_MAX( 1, param->analyse.i_mv_range*4-1 ) ) + 1;
-    }
-}
-
-void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param )
-{
-    sps->crop.i_left   = param->crop_rect.i_left;
-    sps->crop.i_top    = param->crop_rect.i_top;
-    sps->crop.i_right  = param->crop_rect.i_right + sps->i_mb_width*16 - param->i_width;
-    sps->crop.i_bottom = (param->crop_rect.i_bottom + sps->i_mb_height*16 - param->i_height) >> !sps->b_frame_mbs_only;
-    sps->b_crop = sps->crop.i_left  || sps->crop.i_top ||
-                  sps->crop.i_right || sps->crop.i_bottom;
-
-    sps->vui.b_aspect_ratio_info_present = 0;
-    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
-    {
-        sps->vui.b_aspect_ratio_info_present = 1;
-        sps->vui.i_sar_width = param->vui.i_sar_width;
-        sps->vui.i_sar_height= param->vui.i_sar_height;
-    }
-}
-
-void x264_sps_write( bs_t *s, x264_sps_t *sps )
-{
-    bs_realign( s );
-    bs_write( s, 8, sps->i_profile_idc );
-    bs_write1( s, sps->b_constraint_set0 );
-    bs_write1( s, sps->b_constraint_set1 );
-    bs_write1( s, sps->b_constraint_set2 );
-    bs_write1( s, sps->b_constraint_set3 );
-
-    bs_write( s, 4, 0 );    /* reserved */
-
-    bs_write( s, 8, sps->i_level_idc );
-
-    bs_write_ue( s, sps->i_id );
-
-    if( sps->i_profile_idc >= PROFILE_HIGH )
-    {
-        bs_write_ue( s, sps->i_chroma_format_idc );
-        if( sps->i_chroma_format_idc == CHROMA_444 )
-            bs_write1( s, 0 ); // separate_colour_plane_flag
-        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
-        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
-        bs_write1( s, sps->b_qpprime_y_zero_transform_bypass );
-        bs_write1( s, 0 ); // seq_scaling_matrix_present_flag
-    }
-
-    bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
-    bs_write_ue( s, sps->i_poc_type );
-    if( sps->i_poc_type == 0 )
-        bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
-    bs_write_ue( s, sps->i_num_ref_frames );
-    bs_write1( s, sps->b_gaps_in_frame_num_value_allowed );
-    bs_write_ue( s, sps->i_mb_width - 1 );
-    bs_write_ue( s, (sps->i_mb_height >> !sps->b_frame_mbs_only) - 1);
-    bs_write1( s, sps->b_frame_mbs_only );
-    if( !sps->b_frame_mbs_only )
-        bs_write1( s, sps->b_mb_adaptive_frame_field );
-    bs_write1( s, sps->b_direct8x8_inference );
-
-    bs_write1( s, sps->b_crop );
-    if( sps->b_crop )
-    {
-        int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422;
-        int v_shift = sps->i_chroma_format_idc == CHROMA_420;
-        bs_write_ue( s, sps->crop.i_left   >> h_shift );
-        bs_write_ue( s, sps->crop.i_right  >> h_shift );
-        bs_write_ue( s, sps->crop.i_top    >> v_shift );
-        bs_write_ue( s, sps->crop.i_bottom >> v_shift );
-    }
-
-    bs_write1( s, sps->b_vui );
-    if( sps->b_vui )
-    {
-        bs_write1( s, sps->vui.b_aspect_ratio_info_present );
-        if( sps->vui.b_aspect_ratio_info_present )
-        {
-            int i;
-            static const struct { uint8_t w, h, sar; } sar[] =
-            {
-                // aspect_ratio_idc = 0 -> unspecified
-                {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
-                { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
-                { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
-                {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
-                // aspect_ratio_idc = [17..254] -> reserved
-                { 0, 0, 255 }
-            };
-            for( i = 0; sar[i].sar != 255; i++ )
-            {
-                if( sar[i].w == sps->vui.i_sar_width &&
-                    sar[i].h == sps->vui.i_sar_height )
-                    break;
-            }
-            bs_write( s, 8, sar[i].sar );
-            if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
-            {
-                bs_write( s, 16, sps->vui.i_sar_width );
-                bs_write( s, 16, sps->vui.i_sar_height );
-            }
-        }
-
-        bs_write1( s, sps->vui.b_overscan_info_present );
-        if( sps->vui.b_overscan_info_present )
-            bs_write1( s, sps->vui.b_overscan_info );
-
-        bs_write1( s, sps->vui.b_signal_type_present );
-        if( sps->vui.b_signal_type_present )
-        {
-            bs_write( s, 3, sps->vui.i_vidformat );
-            bs_write1( s, sps->vui.b_fullrange );
-            bs_write1( s, sps->vui.b_color_description_present );
-            if( sps->vui.b_color_description_present )
-            {
-                bs_write( s, 8, sps->vui.i_colorprim );
-                bs_write( s, 8, sps->vui.i_transfer );
-                bs_write( s, 8, sps->vui.i_colmatrix );
-            }
-        }
-
-        bs_write1( s, sps->vui.b_chroma_loc_info_present );
-        if( sps->vui.b_chroma_loc_info_present )
-        {
-            bs_write_ue( s, sps->vui.i_chroma_loc_top );
-            bs_write_ue( s, sps->vui.i_chroma_loc_bottom );
-        }
-
-        bs_write1( s, sps->vui.b_timing_info_present );
-        if( sps->vui.b_timing_info_present )
-        {
-            bs_write32( s, sps->vui.i_num_units_in_tick );
-            bs_write32( s, sps->vui.i_time_scale );
-            bs_write1( s, sps->vui.b_fixed_frame_rate );
-        }
-
-        bs_write1( s, sps->vui.b_nal_hrd_parameters_present );
-        if( sps->vui.b_nal_hrd_parameters_present )
-        {
-            bs_write_ue( s, sps->vui.hrd.i_cpb_cnt - 1 );
-            bs_write( s, 4, sps->vui.hrd.i_bit_rate_scale );
-            bs_write( s, 4, sps->vui.hrd.i_cpb_size_scale );
-
-            bs_write_ue( s, sps->vui.hrd.i_bit_rate_value - 1 );
-            bs_write_ue( s, sps->vui.hrd.i_cpb_size_value - 1 );
-
-            bs_write1( s, sps->vui.hrd.b_cbr_hrd );
-
-            bs_write( s, 5, sps->vui.hrd.i_initial_cpb_removal_delay_length - 1 );
-            bs_write( s, 5, sps->vui.hrd.i_cpb_removal_delay_length - 1 );
-            bs_write( s, 5, sps->vui.hrd.i_dpb_output_delay_length - 1 );
-            bs_write( s, 5, sps->vui.hrd.i_time_offset_length );
-        }
-
-        bs_write1( s, sps->vui.b_vcl_hrd_parameters_present );
-
-        if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
-            bs_write1( s, 0 );   /* low_delay_hrd_flag */
-
-        bs_write1( s, sps->vui.b_pic_struct_present );
-        bs_write1( s, sps->vui.b_bitstream_restriction );
-        if( sps->vui.b_bitstream_restriction )
-        {
-            bs_write1( s, sps->vui.b_motion_vectors_over_pic_boundaries );
-            bs_write_ue( s, sps->vui.i_max_bytes_per_pic_denom );
-            bs_write_ue( s, sps->vui.i_max_bits_per_mb_denom );
-            bs_write_ue( s, sps->vui.i_log2_max_mv_length_horizontal );
-            bs_write_ue( s, sps->vui.i_log2_max_mv_length_vertical );
-            bs_write_ue( s, sps->vui.i_num_reorder_frames );
-            bs_write_ue( s, sps->vui.i_max_dec_frame_buffering );
-        }
-    }
-
-    bs_rbsp_trailing( s );
-    bs_flush( s );
-}
-
-void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
-{
-    pps->i_id = i_id;
-    pps->i_sps_id = sps->i_id;
-    pps->b_cabac = param->b_cabac;
-
-    pps->b_pic_order = !param->i_avcintra_class && param->b_interlaced;
-    pps->i_num_slice_groups = 1;
-
-    pps->i_num_ref_idx_l0_default_active = param->i_frame_reference;
-    pps->i_num_ref_idx_l1_default_active = 1;
-
-    pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
-    pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0;
-
-    pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant );
-    pps->i_pic_init_qs = 26 + QP_BD_OFFSET;
-
-    pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
-    pps->b_deblocking_filter_control = 1;
-    pps->b_constrained_intra_pred = param->b_constrained_intra;
-    pps->b_redundant_pic_cnt = 0;
-
-    pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 1 : 0;
-
-    pps->i_cqm_preset = param->i_cqm_preset;
-
-    switch( pps->i_cqm_preset )
-    {
-    case X264_CQM_FLAT:
-        for( int i = 0; i < 8; i++ )
-            pps->scaling_list[i] = x264_cqm_flat16;
-        break;
-    case X264_CQM_JVT:
-        for( int i = 0; i < 8; i++ )
-            pps->scaling_list[i] = x264_cqm_jvt[i];
-        break;
-    case X264_CQM_CUSTOM:
-        /* match the transposed DCT & zigzag */
-        transpose( param->cqm_4iy, 4 );
-        transpose( param->cqm_4py, 4 );
-        transpose( param->cqm_4ic, 4 );
-        transpose( param->cqm_4pc, 4 );
-        transpose( param->cqm_8iy, 8 );
-        transpose( param->cqm_8py, 8 );
-        transpose( param->cqm_8ic, 8 );
-        transpose( param->cqm_8pc, 8 );
-        pps->scaling_list[CQM_4IY] = param->cqm_4iy;
-        pps->scaling_list[CQM_4PY] = param->cqm_4py;
-        pps->scaling_list[CQM_4IC] = param->cqm_4ic;
-        pps->scaling_list[CQM_4PC] = param->cqm_4pc;
-        pps->scaling_list[CQM_8IY+4] = param->cqm_8iy;
-        pps->scaling_list[CQM_8PY+4] = param->cqm_8py;
-        pps->scaling_list[CQM_8IC+4] = param->cqm_8ic;
-        pps->scaling_list[CQM_8PC+4] = param->cqm_8pc;
-        for( int i = 0; i < 8; i++ )
-            for( int j = 0; j < (i < 4 ? 16 : 64); j++ )
-                if( pps->scaling_list[i][j] == 0 )
-                    pps->scaling_list[i] = x264_cqm_jvt[i];
-        break;
-    }
-}
-
-void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps )
-{
-    bs_realign( s );
-    bs_write_ue( s, pps->i_id );
-    bs_write_ue( s, pps->i_sps_id );
-
-    bs_write1( s, pps->b_cabac );
-    bs_write1( s, pps->b_pic_order );
-    bs_write_ue( s, pps->i_num_slice_groups - 1 );
-
-    bs_write_ue( s, pps->i_num_ref_idx_l0_default_active - 1 );
-    bs_write_ue( s, pps->i_num_ref_idx_l1_default_active - 1 );
-    bs_write1( s, pps->b_weighted_pred );
-    bs_write( s, 2, pps->b_weighted_bipred );
-
-    bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET );
-    bs_write_se( s, pps->i_pic_init_qs - 26 - QP_BD_OFFSET );
-    bs_write_se( s, pps->i_chroma_qp_index_offset );
-
-    bs_write1( s, pps->b_deblocking_filter_control );
-    bs_write1( s, pps->b_constrained_intra_pred );
-    bs_write1( s, pps->b_redundant_pic_cnt );
-
-    if( pps->b_transform_8x8_mode || pps->i_cqm_preset != X264_CQM_FLAT )
-    {
-        bs_write1( s, pps->b_transform_8x8_mode );
-        bs_write1( s, (pps->i_cqm_preset != X264_CQM_FLAT) );
-        if( pps->i_cqm_preset != X264_CQM_FLAT )
-        {
-            scaling_list_write( s, pps, CQM_4IY );
-            scaling_list_write( s, pps, CQM_4IC );
-            bs_write1( s, 0 ); // Cr = Cb
-            scaling_list_write( s, pps, CQM_4PY );
-            scaling_list_write( s, pps, CQM_4PC );
-            bs_write1( s, 0 ); // Cr = Cb
-            if( pps->b_transform_8x8_mode )
-            {
-                scaling_list_write( s, pps, CQM_8IY+4 );
-                scaling_list_write( s, pps, CQM_8PY+4 );
-                if( sps->i_chroma_format_idc == CHROMA_444 )
-                {
-                    scaling_list_write( s, pps, CQM_8IC+4 );
-                    scaling_list_write( s, pps, CQM_8PC+4 );
-                    bs_write1( s, 0 ); // Cr = Cb
-                    bs_write1( s, 0 ); // Cr = Cb
-                }
-            }
-        }
-        bs_write_se( s, pps->i_chroma_qp_index_offset );
-    }
-
-    bs_rbsp_trailing( s );
-    bs_flush( s );
-}
-
-void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
-{
-    bs_t q;
-    ALIGNED_4( uint8_t tmp_buf[100] );
-    M32( tmp_buf ) = 0; // shut up gcc
-    bs_init( &q, tmp_buf, 100 );
-
-    bs_realign( &q );
-
-    bs_write_ue( &q, recovery_frame_cnt ); // recovery_frame_cnt
-    bs_write1( &q, 1 );   //exact_match_flag 1
-    bs_write1( &q, 0 );   //broken_link_flag 0
-    bs_write( &q, 2, 0 ); //changing_slice_group 0
-
-    bs_align_10( &q );
-    bs_flush( &q );
-
-    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_RECOVERY_POINT );
-}
-
-int x264_sei_version_write( x264_t *h, bs_t *s )
-{
-    // random ID number generated according to ISO-11578
-    static const uint8_t uuid[16] =
-    {
-        0xdc, 0x45, 0xe9, 0xbd, 0xe6, 0xd9, 0x48, 0xb7,
-        0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
-    };
-    char *opts = x264_param2string( &h->param, 0 );
-    char *payload;
-    int length;
-
-    if( !opts )
-        return -1;
-    CHECKED_MALLOC( payload, 200 + strlen( opts ) );
-
-    memcpy( payload, uuid, 16 );
-    sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
-             "Copy%s 2003-2016 - http://www.videolan.org/x264.html - options: %s",
-             X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts );
-    length = strlen(payload)+1;
-
-    x264_sei_write( s, (uint8_t *)payload, length, SEI_USER_DATA_UNREGISTERED );
-
-    x264_free( opts );
-    x264_free( payload );
-    return 0;
-fail:
-    x264_free( opts );
-    return -1;
-}
-
-void x264_sei_buffering_period_write( x264_t *h, bs_t *s )
-{
-    x264_sps_t *sps = h->sps;
-    bs_t q;
-    ALIGNED_4( uint8_t tmp_buf[100] );
-    M32( tmp_buf ) = 0; // shut up gcc
-    bs_init( &q, tmp_buf, 100 );
-
-    bs_realign( &q );
-    bs_write_ue( &q, sps->i_id );
-
-    if( sps->vui.b_nal_hrd_parameters_present )
-    {
-        bs_write( &q, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay );
-        bs_write( &q, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay_offset );
-    }
-
-    bs_align_10( &q );
-    bs_flush( &q );
-
-    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_BUFFERING_PERIOD );
-}
-
-void x264_sei_pic_timing_write( x264_t *h, bs_t *s )
-{
-    x264_sps_t *sps = h->sps;
-    bs_t q;
-    ALIGNED_4( uint8_t tmp_buf[100] );
-    M32( tmp_buf ) = 0; // shut up gcc
-    bs_init( &q, tmp_buf, 100 );
-
-    bs_realign( &q );
-
-    if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
-    {
-        bs_write( &q, sps->vui.hrd.i_cpb_removal_delay_length, h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset );
-        bs_write( &q, sps->vui.hrd.i_dpb_output_delay_length, h->fenc->i_dpb_output_delay );
-    }
-
-    if( sps->vui.b_pic_struct_present )
-    {
-        bs_write( &q, 4, h->fenc->i_pic_struct-1 ); // We use index 0 for "Auto"
-
-        // These clock timestamps are not standardised so we don't set them
-        // They could be time of origin, capture or alternative ideal display
-        for( int i = 0; i < num_clock_ts[h->fenc->i_pic_struct]; i++ )
-            bs_write1( &q, 0 ); // clock_timestamp_flag
-    }
-
-    bs_align_10( &q );
-    bs_flush( &q );
-
-    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_PIC_TIMING );
-}
-
-void x264_sei_frame_packing_write( x264_t *h, bs_t *s )
-{
-    int quincunx_sampling_flag = h->param.i_frame_packing == 0;
-    bs_t q;
-    ALIGNED_4( uint8_t tmp_buf[100] );
-    M32( tmp_buf ) = 0; // shut up gcc
-    bs_init( &q, tmp_buf, 100 );
-
-    bs_realign( &q );
-
-    bs_write_ue( &q, 0 );                         // frame_packing_arrangement_id
-    bs_write1( &q, 0 );                           // frame_packing_arrangement_cancel_flag
-    bs_write ( &q, 7, h->param.i_frame_packing ); // frame_packing_arrangement_type
-    bs_write1( &q, quincunx_sampling_flag );      // quincunx_sampling_flag
-
-    // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right
-    bs_write ( &q, 6, h->param.i_frame_packing != 6 ); // content_interpretation_type
-
-    bs_write1( &q, 0 );                           // spatial_flipping_flag
-    bs_write1( &q, 0 );                           // frame0_flipped_flag
-    bs_write1( &q, 0 );                           // field_views_flag
-    bs_write1( &q, h->param.i_frame_packing == 5 && !(h->fenc->i_frame&1) ); // current_frame_is_frame0_flag
-    bs_write1( &q, 0 );                           // frame0_self_contained_flag
-    bs_write1( &q, 0 );                           // frame1_self_contained_flag
-    if ( quincunx_sampling_flag == 0 && h->param.i_frame_packing != 5 )
-    {
-        bs_write( &q, 4, 0 );                     // frame0_grid_position_x
-        bs_write( &q, 4, 0 );                     // frame0_grid_position_y
-        bs_write( &q, 4, 0 );                     // frame1_grid_position_x
-        bs_write( &q, 4, 0 );                     // frame1_grid_position_y
-    }
-    bs_write( &q, 8, 0 );                         // frame_packing_arrangement_reserved_byte
-    // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output"
-    // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence
-    bs_write_ue( &q, h->param.i_frame_packing != 5 ); // frame_packing_arrangement_repetition_period
-    bs_write1( &q, 0 );                           // frame_packing_arrangement_extension_flag
-
-    bs_align_10( &q );
-    bs_flush( &q );
-
-    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_FRAME_PACKING );
-}
-
-void x264_filler_write( x264_t *h, bs_t *s, int filler )
-{
-    bs_realign( s );
-
-    for( int i = 0; i < filler; i++ )
-        bs_write( s, 8, 0xff );
-
-    bs_rbsp_trailing( s );
-    bs_flush( s );
-}
-
-void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s )
-{
-    x264_slice_header_t *sh = &h->sh_backup;
-    bs_t q;
-    ALIGNED_4( uint8_t tmp_buf[100] );
-    M32( tmp_buf ) = 0; // shut up gcc
-    bs_init( &q, tmp_buf, 100 );
-
-    bs_realign( &q );
-
-    /* We currently only use this for repeating B-refs, as required by Blu-ray. */
-    bs_write1( &q, 0 );                 //original_idr_flag
-    bs_write_ue( &q, sh->i_frame_num ); //original_frame_num
-    if( !h->sps->b_frame_mbs_only )
-        bs_write1( &q, 0 );             //original_field_pic_flag
-
-    bs_write1( &q, sh->i_mmco_command_count > 0 );
-    if( sh->i_mmco_command_count > 0 )
-    {
-        for( int i = 0; i < sh->i_mmco_command_count; i++ )
-        {
-            bs_write_ue( &q, 1 );
-            bs_write_ue( &q, sh->mmco[i].i_difference_of_pic_nums - 1 );
-        }
-        bs_write_ue( &q, 0 );
-    }
-
-    bs_align_10( &q );
-    bs_flush( &q );
-
-    x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_DEC_REF_PIC_MARKING );
-}
-
-int x264_sei_avcintra_umid_write( x264_t *h, bs_t *s )
-{
-    uint8_t data[512];
-    const char *msg = "UMID";
-    const int len = 497;
-
-    memset( data, 0xff, len );
-    memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) );
-    memcpy( data+16, msg, strlen(msg) );
-
-    data[20] = 0x13;
-    /* These bytes appear to be some sort of frame/seconds counter in certain applications,
-     * but others jump around, so leave them as zero for now */
-    data[22] = data[23] = data[25] = data[26] = 0;
-    data[28] = 0x14;
-    data[30] = data[31] = data[33] = data[34] = 0;
-    data[36] = 0x60;
-    data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */
-    data[60] = 0x62;
-    data[62] = data[63] = data[65] = data[66] = 0;
-    data[68] = 0x63;
-    data[70] = data[71] = data[73] = data[74] = 0;
-
-    x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
-
-    return 0;
-}
-
-int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len )
-{
-    uint8_t data[6000];
-    const char *msg = "VANC";
-    if( len > sizeof(data) )
-    {
-        x264_log( h, X264_LOG_ERROR, "AVC-Intra SEI is too large (%d)\n", len );
-        return -1;
-    }
-
-    memset( data, 0xff, len );
-    memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) );
-    memcpy( data+16, msg, strlen(msg) );
-
-    x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
-
-    return 0;
-}
-
-const x264_level_t x264_levels[] =
-{
-    { 10,    1485,    99,    396,     64,    175,  64, 64,  0, 2, 0, 0, 1 },
-    {  9,    1485,    99,    396,    128,    350,  64, 64,  0, 2, 0, 0, 1 }, /* "1b" */
-    { 11,    3000,   396,    900,    192,    500, 128, 64,  0, 2, 0, 0, 1 },
-    { 12,    6000,   396,   2376,    384,   1000, 128, 64,  0, 2, 0, 0, 1 },
-    { 13,   11880,   396,   2376,    768,   2000, 128, 64,  0, 2, 0, 0, 1 },
-    { 20,   11880,   396,   2376,   2000,   2000, 128, 64,  0, 2, 0, 0, 1 },
-    { 21,   19800,   792,   4752,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
-    { 22,   20250,  1620,   8100,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
-    { 30,   40500,  1620,   8100,  10000,  10000, 256, 32, 22, 2, 0, 1, 0 },
-    { 31,  108000,  3600,  18000,  14000,  14000, 512, 16, 60, 4, 1, 1, 0 },
-    { 32,  216000,  5120,  20480,  20000,  20000, 512, 16, 60, 4, 1, 1, 0 },
-    { 40,  245760,  8192,  32768,  20000,  25000, 512, 16, 60, 4, 1, 1, 0 },
-    { 41,  245760,  8192,  32768,  50000,  62500, 512, 16, 24, 2, 1, 1, 0 },
-    { 42,  522240,  8704,  34816,  50000,  62500, 512, 16, 24, 2, 1, 1, 1 },
-    { 50,  589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
-    { 51,  983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
-    { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
-    { 0 }
-};
-
-#define ERROR(...)\
-{\
-    if( verbose )\
-        x264_log( h, X264_LOG_WARNING, __VA_ARGS__ );\
-    ret = 1;\
-}
-
-int x264_validate_levels( x264_t *h, int verbose )
-{
-    int ret = 0;
-    int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
-    int dpb = mbs * h->sps->vui.i_max_dec_frame_buffering;
-    int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 :
-                     h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
-                     h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
-
-    const x264_level_t *l = x264_levels;
-    while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
-        l++;
-
-    if( l->frame_size < mbs
-        || l->frame_size*8 < h->sps->i_mb_width * h->sps->i_mb_width
-        || l->frame_size*8 < h->sps->i_mb_height * h->sps->i_mb_height )
-        ERROR( "frame MB size (%dx%d) > level limit (%d)\n",
-               h->sps->i_mb_width, h->sps->i_mb_height, l->frame_size );
-    if( dpb > l->dpb )
-        ERROR( "DPB size (%d frames, %d mbs) > level limit (%d frames, %d mbs)\n",
-                h->sps->vui.i_max_dec_frame_buffering, dpb, l->dpb / mbs, l->dpb );
-
-#define CHECK( name, limit, val ) \
-    if( (val) > (limit) ) \
-        ERROR( name " (%"PRId64") > level limit (%d)\n", (int64_t)(val), (limit) );
-
-    CHECK( "VBV bitrate", (l->bitrate * cbp_factor) / 4, h->param.rc.i_vbv_max_bitrate );
-    CHECK( "VBV buffer", (l->cpb * cbp_factor) / 4, h->param.rc.i_vbv_buffer_size );
-    CHECK( "MV range", l->mv_range, h->param.analyse.i_mv_range );
-    CHECK( "interlaced", !l->frame_only, h->param.b_interlaced );
-    CHECK( "fake interlaced", !l->frame_only, h->param.b_fake_interlaced );
-
-    if( h->param.i_fps_den > 0 )
-        CHECK( "MB rate", l->mbps, (int64_t)mbs * h->param.i_fps_num / h->param.i_fps_den );
-
-    /* TODO check the rest of the limits */
-    return ret;
-}
diff --git a/android/src/main/libenc/jni/libx264/encoder/set.h b/android/src/main/libenc/jni/libx264/encoder/set.h
deleted file mode 100755
index bb21086..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/set.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*****************************************************************************
- * set.h: header writing
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_ENCODER_SET_H
-#define X264_ENCODER_SET_H
-
-void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
-void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param );
-void x264_sps_write( bs_t *s, x264_sps_t *sps );
-void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
-void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps );
-void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
-int  x264_sei_version_write( x264_t *h, bs_t *s );
-int  x264_validate_levels( x264_t *h, int verbose );
-void x264_sei_buffering_period_write( x264_t *h, bs_t *s );
-void x264_sei_pic_timing_write( x264_t *h, bs_t *s );
-void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s );
-void x264_sei_frame_packing_write( x264_t *h, bs_t *s );
-int  x264_sei_avcintra_umid_write( x264_t *h, bs_t *s );
-int  x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len );
-void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type );
-void x264_filler_write( x264_t *h, bs_t *s, int filler );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/encoder/slicetype-cl.c b/android/src/main/libenc/jni/libx264/encoder/slicetype-cl.c
deleted file mode 100755
index 9ae8bd6..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/slicetype-cl.c
+++ /dev/null
@@ -1,780 +0,0 @@
-/*****************************************************************************
- * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
- *****************************************************************************
- * Copyright (C) 2012-2016 x264 project
- *
- * Authors: Steve Borho <sborho@multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-#include "me.h"
-
-#if HAVE_OPENCL
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
-
-/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
- * in the OpenCL headers shipped with NVIDIA drivers.  We need to be
- * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
-#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
-
-#define OCLCHECK( method, ... )\
-do\
-{\
-    if( h->opencl.b_fatal_error )\
-        return -1;\
-    status = ocl->method( __VA_ARGS__ );\
-    if( status != CL_SUCCESS ) {\
-        h->param.b_opencl = 0;\
-        h->opencl.b_fatal_error = 1;\
-        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
-        return -1;\
-    }\
-} while( 0 )
-
-void x264_opencl_flush( x264_t *h )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-
-    ocl->clFinish( h->opencl.queue );
-
-    /* Finish copies from the GPU by copying from the page-locked buffer to
-     * their final destination */
-    for( int i = 0; i < h->opencl.num_copies; i++ )
-        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
-    h->opencl.num_copies = 0;
-    h->opencl.pl_occupancy = 0;
-}
-
-static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
-{
-    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
-        x264_opencl_flush( h );
-    assert( bytes < PAGE_LOCKED_BUF_SIZE );
-    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
-    h->opencl.pl_occupancy += bytes;
-    return ptr;
-}
-
-int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
-{
-    if( fenc->b_intra_calculated )
-        return 0;
-    fenc->b_intra_calculated = 1;
-
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
-
-#define CREATEBUF( out, flags, size )\
-    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
-    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
-#define CREATEIMAGE( out, flags, pf, width, height )\
-    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
-    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
-
-    int mb_count = h->mb.i_mb_count;
-    cl_int status;
-
-    if( !h->opencl.lowres_mv_costs )
-    {
-        /* Allocate shared memory buffers */
-        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
-        int height = h->mb.i_mb_height * 8 * sizeof(pixel);
-
-        cl_image_format pixel_format;
-        pixel_format.image_channel_order = CL_R;
-        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
-        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
-
-        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
-        {
-            pixel_format.image_channel_order = CL_RGBA;
-            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
-            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
-            width >>= 1;
-            height >>= 1;
-        }
-
-        CREATEBUF( h->opencl.lowres_mv_costs,     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
-        CREATEBUF( h->opencl.lowres_costs[0],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
-        CREATEBUF( h->opencl.lowres_costs[1],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
-        CREATEBUF( h->opencl.mv_buffers[0],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
-        CREATEBUF( h->opencl.mv_buffers[1],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
-        CREATEBUF( h->opencl.mvp_buffer,          CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
-        CREATEBUF( h->opencl.frame_stats[0],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
-        CREATEBUF( h->opencl.frame_stats[1],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
-        CREATEBUF( h->opencl.row_satds[0],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
-        CREATEBUF( h->opencl.row_satds[1],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
-        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY,  luma_length );
-        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY,  luma_length );
-    }
-
-    if( !fenc->opencl.intra_cost )
-    {
-        /* Allocate per-frame buffers */
-        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
-        int height = h->mb.i_mb_height * 8 * sizeof(pixel);
-
-        cl_image_format pixel_format;
-        pixel_format.image_channel_order = CL_R;
-        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
-        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
-
-        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
-        {
-            pixel_format.image_channel_order = CL_RGBA;
-            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
-            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
-            width >>= 1;
-            height >>= 1;
-        }
-        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY,  mb_count * sizeof(int16_t) );
-        CREATEBUF( fenc->opencl.intra_cost,        CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
-        CREATEBUF( fenc->opencl.lowres_mvs0,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
-        CREATEBUF( fenc->opencl.lowres_mvs1,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
-        CREATEBUF( fenc->opencl.lowres_mv_costs0,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
-        CREATEBUF( fenc->opencl.lowres_mv_costs1,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
-    }
-#undef CREATEBUF
-#undef CREATEIMAGE
-
-    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
-
-    char *locked = x264_opencl_alloc_locked( h, luma_length );
-    memcpy( locked, fenc->plane[0], luma_length );
-    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue,  h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
-
-    size_t gdim[2];
-    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
-    {
-        int size = h->mb.i_mb_count * sizeof(int16_t);
-        locked = x264_opencl_alloc_locked( h, size );
-        memcpy( locked, fenc->i_inv_qscale_factor, size );
-        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    }
-    else
-    {
-        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
-        cl_uint arg = 0;
-        int16_t value = 256;
-        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
-        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
-        gdim[0] = h->mb.i_mb_count;
-        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
-    }
-
-    int stride = fenc->i_stride[0];
-    cl_uint arg = 0;
-    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
-    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
-    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
-    gdim[0] = 8 * h->mb.i_mb_width;
-    gdim[1] = 8 * h->mb.i_mb_height;
-    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
-
-    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
-    {
-        /* Workaround for AMD Southern Island:
-         *
-         * Alternate kernel instances.  No perf impact to this, so we do it for
-         * all GPUs.  It prevents the same kernel from being enqueued
-         * back-to-back, avoiding a dependency calculation bug in the driver.
-         */
-        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
-
-        arg = 0;
-        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
-        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
-        gdim[0] >>= 1;
-        gdim[1] >>= 1;
-        if( gdim[0] < 16 || gdim[1] < 16 )
-            break;
-        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
-    }
-
-    size_t ldim[2];
-    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
-    gdim[1] = 8*h->mb.i_mb_height;
-    ldim[0] = 32;
-    ldim[1] = 8;
-    arg = 0;
-
-    /* For presets slow, slower, and placebo, check all 10 intra modes that the
-     * C lookahead supports.  For faster presets, only check the most frequent 8
-     * modes
-     */
-    int slow = h->param.analyse.i_subpel_refine > 7;
-    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
-    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
-    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
-    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
-    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
-    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
-
-    gdim[0] = 256;
-    gdim[1] = h->mb.i_mb_height;
-    ldim[0] = 256;
-    ldim[1] = 1;
-    arg = 0;
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
-    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
-
-    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
-        x264_opencl_flush( h );
-
-    int size = h->mb.i_mb_count * sizeof(int16_t);
-    locked = x264_opencl_alloc_locked( h, size );
-    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].bytes = size;
-    h->opencl.num_copies++;
-
-    size = h->mb.i_mb_height * sizeof(int);
-    locked = x264_opencl_alloc_locked( h, size );
-    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].bytes = size;
-    h->opencl.num_copies++;
-
-    size = sizeof(int) * 4;
-    locked = x264_opencl_alloc_locked( h, size );
-    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
-    h->opencl.num_copies++;
-    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
-    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
-    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
-    h->opencl.num_copies++;
-
-    h->opencl.last_buf = !h->opencl.last_buf;
-    return 0;
-}
-
-/* This function was tested emprically on a number of AMD and NV GPUs.  Making a
- * function which returns perfect launch dimensions is impossible; some
- * applications will have self-tuning code to try many possible variables and
- * measure the runtime.  Here we simply make an educated guess based on what we
- * know GPUs typically prefer.  */
-static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    size_t max_work_group = 256;    /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
-    size_t preferred_multiple = 64;
-    cl_uint num_cus = 6;
-
-    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
-    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
-    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
-
-    ldims[0] = preferred_multiple;
-    ldims[1] = 8;
-
-    /* make ldims[1] an even divisor of gdims[1] */
-    while( gdims[1] & (ldims[1] - 1) )
-    {
-        ldims[0] <<= 1;
-        ldims[1] >>= 1;
-    }
-    /* make total ldims fit under the max work-group dimensions for the device */
-    while( ldims[0] * ldims[1] > max_work_group )
-    {
-        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
-            ldims[1] >>= 1;
-        else
-            ldims[0] >>= 1;
-    }
-
-    if( ldims[0] > gdims[0] )
-    {
-        /* remove preferred multiples until we're close to gdims[0] */
-        while( gdims[0] + preferred_multiple < ldims[0] )
-            ldims[0] -= preferred_multiple;
-        gdims[0] = ldims[0];
-    }
-    else
-    {
-        /* make gdims an even multiple of ldims */
-        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
-        gdims[0] *= ldims[0];
-    }
-
-    /* make ldims smaller to spread work across compute units */
-    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
-    {
-        if( ldims[0] > preferred_multiple )
-            ldims[0] >>= 1;
-        else if( ldims[1] > 1 )
-            ldims[1] >>= 1;
-        else
-            break;
-    }
-    /* for smaller GPUs, try not to abuse their texture cache */
-    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
-        ldims[0] = 32;
-}
-
-int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    x264_frame_t *fenc = frames[b];
-    x264_frame_t *fref = frames[ref];
-
-    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
-    cl_mem ref_luma_hpel;
-    cl_int status;
-
-    if( w && w->weightfn )
-    {
-        size_t gdims[2];
-
-        gdims[0] = 8 * h->mb.i_mb_width;
-        gdims[1] = 8 * h->mb.i_mb_height;
-
-        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
-        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
-        {
-            cl_uint arg = 0;
-            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
-            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
-            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
-            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
-            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
-            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
-
-            gdims[0] >>= 1;
-            gdims[1] >>= 1;
-            if( gdims[0] < 16 || gdims[1] < 16 )
-                break;
-        }
-
-        cl_uint arg = 0;
-        gdims[0] = 8 * h->mb.i_mb_width;
-        gdims[1] = 8 * h->mb.i_mb_height;
-
-        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
-        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
-        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
-        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
-        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
-        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
-
-        /* Use weighted reference planes for motion search */
-        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
-            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
-        ref_luma_hpel = h->opencl.weighted_luma_hpel;
-    }
-    else
-    {
-        /* Use unweighted reference planes for motion search */
-        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
-            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
-        ref_luma_hpel = fref->opencl.luma_hpel;
-    }
-
-    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
-    int b_first_iteration = 1;
-    int b_reverse_references = 1;
-    int A = 1;
-
-
-    int mb_per_group = 0;
-    int cost_local_size = 0;
-    int mvc_local_size = 0;
-    int mb_width;
-
-    size_t gdims[2];
-    size_t ldims[2];
-
-    /* scale 0 is 8x8 */
-    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
-    {
-        mb_width = h->mb.i_mb_width >> scale;
-        gdims[0] = mb_width;
-        gdims[1] = h->mb.i_mb_height >> scale;
-        if( gdims[0] < 2 || gdims[1] < 2 )
-            continue;
-        gdims[0] <<= 2;
-        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
-
-        mb_per_group = (ldims[0] >> 2) * ldims[1];
-        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
-        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
-        int scaled_me_range = h->param.analyse.i_me_range >> scale;
-        int b_shift_index = 1;
-
-        cl_uint arg = 0;
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
-        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
-
-        for( int iter = 0; iter < num_iterations[scale]; iter++ )
-        {
-            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
-
-            b_shift_index = 0;
-            b_first_iteration = 0;
-
-            /* alternate top-left vs bot-right MB references at lower scales, so
-             * motion field smooths more quickly.  */
-            if( scale > 2 )
-                b_reverse_references ^= 1;
-            else
-                b_reverse_references = 0;
-            A = !A;
-            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
-            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
-            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
-            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
-            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
-        }
-    }
-
-    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
-    cl_uint arg = 0;
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
-
-    if( b_islist1 )
-    {
-        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
-        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
-    }
-    else
-    {
-        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
-        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
-    }
-
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
-    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
-
-    if( h->opencl.b_device_AMD_SI )
-    {
-        /* workaround for AMD Southern Island driver scheduling bug (fixed in
-         * July 2012), perform meaningless small copy to add a data dependency */
-        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
-    }
-
-    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
-
-    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
-
-    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
-        x264_opencl_flush( h );
-
-    char *locked = x264_opencl_alloc_locked( h, mvlen );
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
-
-    if( b_islist1 )
-    {
-        int mvs_offset = mvlen * (ref - b - 1);
-        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
-        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
-    }
-    else
-    {
-        int mvs_offset = mvlen * (b - ref - 1);
-        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
-        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
-    }
-
-    h->opencl.num_copies++;
-
-    return 0;
-}
-
-int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
-{
-    x264_opencl_function_t *ocl = h->opencl.ocl;
-    cl_int status;
-    x264_frame_t *fenc = frames[b];
-    x264_frame_t *fref0 = frames[p0];
-    x264_frame_t *fref1 = frames[p1];
-
-    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
-
-    /* Tasks for this kernel:
-     * 1. Select least cost mode (intra, ref0, ref1)
-     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
-     * 2. if B frame, try bidir predictions.
-     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
-    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
-    size_t ldim_bidir[2];
-    size_t *ldims = NULL;
-    int cost_local_size = 4;
-    int satd_local_size = 4;
-    if( b < p1 )
-    {
-        /* For B frames, use 4 threads per MB for BIDIR checks */
-        ldims = ldim_bidir;
-        gdims[0] <<= 2;
-        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
-        int mb_per_group = (ldims[0] >> 2) * ldims[1];
-        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
-        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
-    }
-
-    cl_uint arg = 0;
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
-    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
-    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
-
-    /* Sum costs across rows, atomicAdd down frame */
-    size_t gdim[2] = { 256, h->mb.i_mb_height };
-    size_t ldim[2] = { 256, 1 };
-
-    arg = 0;
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
-    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
-    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
-
-    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
-        x264_opencl_flush( h );
-
-    int size =  h->mb.i_mb_count * sizeof(int16_t);
-    char *locked = x264_opencl_alloc_locked( h, size );
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
-    h->opencl.copies[h->opencl.num_copies].bytes = size;
-    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    h->opencl.num_copies++;
-
-    size =  h->mb.i_mb_height * sizeof(int);
-    locked = x264_opencl_alloc_locked( h, size );
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
-    h->opencl.copies[h->opencl.num_copies].bytes = size;
-    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    h->opencl.num_copies++;
-
-    size =  4 * sizeof(int);
-    locked = x264_opencl_alloc_locked( h, size );
-    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
-    h->opencl.last_buf = !h->opencl.last_buf;
-
-    h->opencl.copies[h->opencl.num_copies].src = locked;
-    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
-    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
-    h->opencl.num_copies++;
-    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
-    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
-    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
-    h->opencl.num_copies++;
-
-    if( b == p1 ) // P frames only
-    {
-        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
-        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
-        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
-        h->opencl.num_copies++;
-    }
-    return 0;
-}
-
-void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
-{
-    if( h->param.b_opencl )
-    {
-#ifdef _WIN32
-        /* Temporarily boost priority of this lookahead thread and the OpenCL
-         * driver's thread until the end of this function.  On AMD GPUs this
-         * greatly reduces the latency of enqueuing kernels and getting results
-         * on Windows. */
-        HANDLE id = GetCurrentThread();
-        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
-        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
-        x264_opencl_function_t *ocl = h->opencl.ocl;
-        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
-        if( status == CL_SUCCESS )
-        {
-            h->opencl.opencl_thread_pri = GetThreadPriority( id );
-            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
-        }
-#endif
-
-        /* precalculate intra and I frames */
-        for( int i = 0; i <= num_frames; i++ )
-            x264_opencl_lowres_init( h, frames[i], lambda );
-        x264_opencl_flush( h );
-
-        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
-        {
-            /* For trellis B-Adapt, precompute exhaustive motion searches */
-            for( int b = 0; b <= num_frames; b++ )
-            {
-                for( int j = 1; j < h->param.i_bframe; j++ )
-                {
-                    int p0 = b - j;
-                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
-                    {
-                        const x264_weight_t *w = x264_weight_none;
-
-                        if( h->param.analyse.i_weighted_pred )
-                        {
-                            x264_emms();
-                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
-                            w = frames[b]->weight[0];
-                        }
-                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
-                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
-                    }
-                    int p1 = b + j;
-                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
-                    {
-                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
-                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
-                    }
-                }
-            }
-
-            x264_opencl_flush( h );
-        }
-    }
-}
-
-
-void x264_opencl_slicetype_end( x264_t *h )
-{
-#ifdef _WIN32
-    if( h->param.b_opencl )
-    {
-        HANDLE id = GetCurrentThread();
-        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
-        x264_opencl_function_t *ocl = h->opencl.ocl;
-        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
-        if( status == CL_SUCCESS )
-            SetThreadPriority( id, h->opencl.opencl_thread_pri );
-    }
-#endif
-}
-
-int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
-{
-    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
-        return 0;
-    else
-    {
-        int do_search[2];
-        int dist_scale_factor = 128;
-        const x264_weight_t *w = x264_weight_none;
-
-        // avoid duplicating work
-        frames[b]->i_cost_est[b-p0][p1-b] = 0;
-
-        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
-        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
-        if( do_search[0] )
-        {
-            if( h->param.analyse.i_weighted_pred && b == p1 )
-            {
-                x264_emms();
-                x264_weights_analyse( h, frames[b], frames[p0], 1 );
-                w = frames[b]->weight[0];
-            }
-            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
-        }
-        if( do_search[1] )
-            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
-        if( b == p1 )
-            frames[b]->i_intra_mbs[b-p0] = 0;
-        if( p1 != p0 )
-            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
-
-        frames[b]->i_cost_est[b-p0][p1-b] = 0;
-        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
-
-        x264_opencl_lowres_init( h, frames[b], lambda );
-
-        if( do_search[0] )
-        {
-            x264_opencl_lowres_init( h, frames[p0], lambda );
-            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
-        }
-        if( do_search[1] )
-        {
-            x264_opencl_lowres_init( h, frames[p1], lambda );
-            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
-        }
-        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
-        return 1;
-    }
-}
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/encoder/slicetype.c b/android/src/main/libenc/jni/libx264/encoder/slicetype.c
deleted file mode 100755
index 73c0d67..0000000
--- a/android/src/main/libenc/jni/libx264/encoder/slicetype.c
+++ /dev/null
@@ -1,2032 +0,0 @@
-/*****************************************************************************
- * slicetype.c: lookahead analysis
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Fiona Glaser <fiona@x264.com>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Dylan Yudaken <dyudaken@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "macroblock.h"
-#include "me.h"
-
-// Indexed by pic_struct values
-static const uint8_t delta_tfi_divisor[10] = { 0, 2, 1, 1, 2, 2, 3, 3, 4, 6 };
-
-static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
-                                      x264_frame_t **frames, int p0, int p1, int b );
-
-void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
-
-#if HAVE_OPENCL
-int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
-int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
-int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
-int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
-void x264_opencl_flush( x264_t *h );
-void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda );
-void x264_opencl_slicetype_end( x264_t *h );
-#endif
-
-static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
-{
-    a->i_qp = X264_LOOKAHEAD_QP;
-    a->i_lambda = x264_lambda_tab[ a->i_qp ];
-    x264_mb_analyse_load_costs( h, a );
-    if( h->param.analyse.i_subpel_refine > 1 )
-    {
-        h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method );
-        h->mb.i_subpel_refine = 4;
-    }
-    else
-    {
-        h->mb.i_me_method = X264_ME_DIA;
-        h->mb.i_subpel_refine = 2;
-    }
-    h->mb.b_chroma_me = 0;
-}
-
-/* makes a non-h264 weight (i.e. fix7), into an h264 weight */
-static void x264_weight_get_h264( int weight_nonh264, int offset, x264_weight_t *w )
-{
-    w->i_offset = offset;
-    w->i_denom = 7;
-    w->i_scale = weight_nonh264;
-    while( w->i_denom > 0 && (w->i_scale > 127) )
-    {
-        w->i_denom--;
-        w->i_scale >>= 1;
-    }
-    w->i_scale = X264_MIN( w->i_scale, 127 );
-}
-
-static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dest )
-{
-    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
-    /* Note: this will never run during lookahead as weights_analyse is only called if no
-     * motion search has been done. */
-    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
-    {
-        int i_stride = fenc->i_stride_lowres;
-        int i_lines = fenc->i_lines_lowres;
-        int i_width = fenc->i_width_lowres;
-        int i_mb_xy = 0;
-        pixel *p = dest;
-
-        for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )
-            for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
-            {
-                int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
-                int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
-                h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride,
-                               mvx+(x<<2), mvy+(y<<2), 8, 8, x264_weight_none );
-            }
-        x264_emms();
-        return dest;
-    }
-    x264_emms();
-    return ref->lowres[0];
-}
-
-/* How data is organized for 4:2:0/4:2:2 chroma weightp:
- * [U: ref] [U: fenc]
- * [V: ref] [V: fenc]
- * fenc = ref + offset
- * v = u + stride * chroma height */
-
-static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
-{
-    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
-    int i_stride = fenc->i_stride[1];
-    int i_offset = i_stride / 2;
-    int i_lines = fenc->i_lines[1];
-    int i_width = fenc->i_width[1];
-    int v_shift = CHROMA_V_SHIFT;
-    int cw = 8*h->mb.i_mb_width;
-    int ch = 16*h->mb.i_mb_height >> v_shift;
-    int height = 16 >> v_shift;
-
-    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
-    {
-        x264_frame_expand_border_chroma( h, ref, 1 );
-        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride )
-            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 )
-            {
-                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
-                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
-                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */
-                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0];
-                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1];
-                h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height );
-            }
-    }
-    else
-        h->mc.plane_copy_deinterleave( dstu, i_stride, dstv, i_stride, ref->plane[1], i_stride, cw, ch );
-    h->mc.plane_copy_deinterleave( dstu+i_offset, i_stride, dstv+i_offset, i_stride, fenc->plane[1], i_stride, cw, ch );
-    x264_emms();
-}
-
-static NOINLINE pixel *x264_weight_cost_init_chroma444( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dst, int p )
-{
-    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
-    int i_stride = fenc->i_stride[p];
-    int i_lines = fenc->i_lines[p];
-    int i_width = fenc->i_width[p];
-
-    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
-    {
-        x264_frame_expand_border_chroma( h, ref, p );
-        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 16, pel_offset_y = y*i_stride )
-            for( int x = 0, pel_offset_x = 0; x < i_width; x += 16, mb_xy++, pel_offset_x += 16 )
-            {
-                pixel *pix = dst + pel_offset_y + pel_offset_x;
-                pixel *src = ref->plane[p] + pel_offset_y + pel_offset_x;
-                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0] / 2;
-                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1] / 2;
-                /* We don't want to calculate hpels for fenc frames, so we round the motion
-                 * vectors to fullpel here.  It's not too bad, I guess? */
-                h->mc.copy_16x16_unaligned( pix, i_stride, src+mvx+mvy*i_stride, i_stride, 16 );
-            }
-        x264_emms();
-        return dst;
-    }
-    x264_emms();
-    return ref->plane[p];
-}
-
-static int x264_weight_slice_header_cost( x264_t *h, x264_weight_t *w, int b_chroma )
-{
-    /* Add cost of weights in the slice header. */
-    int lambda = x264_lambda_tab[X264_LOOKAHEAD_QP];
-    /* 4 times higher, because chroma is analyzed at full resolution. */
-    if( b_chroma )
-        lambda *= 4;
-    int numslices;
-    if( h->param.i_slice_count )
-        numslices = h->param.i_slice_count;
-    else if( h->param.i_slice_max_mbs )
-        numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
-    else
-        numslices = 1;
-    /* FIXME: find a way to account for --slice-max-size?
-     * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
-     * Cut denom cost in half if chroma, since it's shared between the two chroma planes. */
-    int denom_cost = bs_size_ue( w[0].i_denom ) * (2 - b_chroma);
-    return lambda * numslices * ( 10 + denom_cost + 2 * (bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset )) );
-}
-
-static NOINLINE unsigned int x264_weight_cost_luma( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
-{
-    unsigned int cost = 0;
-    int i_stride = fenc->i_stride_lowres;
-    int i_lines = fenc->i_lines_lowres;
-    int i_width = fenc->i_width_lowres;
-    pixel *fenc_plane = fenc->lowres[0];
-    ALIGNED_ARRAY_16( pixel, buf,[8*8] );
-    int pixoff = 0;
-    int i_mb = 0;
-
-    if( w )
-    {
-        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
-            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
-            {
-                w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
-                int cmp = h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride );
-                cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
-            }
-        cost += x264_weight_slice_header_cost( h, w, 0 );
-    }
-    else
-        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
-            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
-            {
-                int cmp = h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride );
-                cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
-            }
-    x264_emms();
-    return cost;
-}
-
-static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w )
-{
-    unsigned int cost = 0;
-    int i_stride = fenc->i_stride[1];
-    int i_lines = fenc->i_lines[1];
-    int i_width = fenc->i_width[1];
-    pixel *src = ref + (i_stride >> 1);
-    ALIGNED_ARRAY_16( pixel, buf, [8*16] );
-    int pixoff = 0;
-    int height = 16 >> CHROMA_V_SHIFT;
-    if( w )
-    {
-        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
-            for( int x = 0; x < i_width; x += 8, pixoff += 8 )
-            {
-                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height );
-                /* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
-                 * But testing shows that for chroma the DC coefficient is by far the most
-                 * important part of the coding cost.  Thus a more useful chroma weight is
-                 * obtained by comparing each block's DC coefficient instead of the actual
-                 * pixels. */
-                cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height );
-            }
-        cost += x264_weight_slice_header_cost( h, w, 1 );
-    }
-    else
-        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
-            for( int x = 0; x < i_width; x += 8, pixoff += 8 )
-                cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height );
-    x264_emms();
-    return cost;
-}
-
-static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w, int p )
-{
-    unsigned int cost = 0;
-    int i_stride = fenc->i_stride[p];
-    int i_lines = fenc->i_lines[p];
-    int i_width = fenc->i_width[p];
-    pixel *src = fenc->plane[p];
-    ALIGNED_ARRAY_16( pixel, buf, [16*16] );
-    int pixoff = 0;
-    if( w )
-    {
-        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
-            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
-            {
-                w->weightfn[16>>2]( buf, 16, &ref[pixoff], i_stride, w, 16 );
-                cost += h->pixf.mbcmp[PIXEL_16x16]( buf, 16, &src[pixoff], i_stride );
-            }
-        cost += x264_weight_slice_header_cost( h, w, 1 );
-    }
-    else
-        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
-            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
-                cost += h->pixf.mbcmp[PIXEL_16x16]( &ref[pixoff], i_stride, &src[pixoff], i_stride );
-    x264_emms();
-    return cost;
-}
-
-void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
-{
-    int i_delta_index = fenc->i_frame - ref->i_frame - 1;
-    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
-    const float epsilon = 1.f/128.f;
-    x264_weight_t *weights = fenc->weight[0];
-    SET_WEIGHT( weights[0], 0, 1, 0, 0 );
-    SET_WEIGHT( weights[1], 0, 1, 0, 0 );
-    SET_WEIGHT( weights[2], 0, 1, 0, 0 );
-    int chroma_initted = 0;
-    float guess_scale[3];
-    float fenc_mean[3];
-    float ref_mean[3];
-    for( int plane = 0; plane <= 2*!b_lookahead; plane++ )
-    {
-        float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
-        float ref_var  =  ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
-        guess_scale[plane] = sqrtf( fenc_var / ref_var );
-        fenc_mean[plane] = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8));
-        ref_mean[plane]  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8));
-    }
-
-    int chroma_denom = 7;
-    if( !b_lookahead )
-    {
-        /* make sure both our scale factors fit */
-        while( chroma_denom > 0 )
-        {
-            float thresh = 127.f / (1<<chroma_denom);
-            if( guess_scale[1] < thresh && guess_scale[2] < thresh )
-                break;
-            chroma_denom--;
-        }
-    }
-
-    /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
-    for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
-    {
-        int minoff, minscale, mindenom;
-        unsigned int minscore, origscore;
-        int found;
-
-        //early termination
-        if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon )
-        {
-            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
-            continue;
-        }
-
-        if( plane )
-        {
-            weights[plane].i_denom = chroma_denom;
-            weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 );
-            if( weights[plane].i_scale > 127 )
-            {
-                weights[1].weightfn = weights[2].weightfn = NULL;
-                break;
-            }
-        }
-        else
-            x264_weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] );
-
-        found = 0;
-        mindenom = weights[plane].i_denom;
-        minscale = weights[plane].i_scale;
-        minoff = 0;
-
-        pixel *mcbuf;
-        if( !plane )
-        {
-            if( !fenc->b_intra_calculated )
-            {
-                x264_mb_analysis_t a;
-                x264_lowres_context_init( h, &a );
-                x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0 );
-            }
-            mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
-            origscore = minscore = x264_weight_cost_luma( h, fenc, mcbuf, NULL );
-        }
-        else
-        {
-            if( CHROMA444 )
-            {
-                mcbuf = x264_weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane );
-                origscore = minscore = x264_weight_cost_chroma444( h, fenc, mcbuf, NULL, plane );
-            }
-            else
-            {
-                pixel *dstu = h->mb.p_weight_buf[0];
-                pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
-                if( !chroma_initted++ )
-                    x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
-                mcbuf = plane == 1 ? dstu : dstv;
-                origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
-            }
-        }
-
-        if( !minscore )
-            continue;
-
-        /* Picked somewhat arbitrarily */
-        static const uint8_t weight_check_distance[][2] =
-        {
-            {0,0},{0,0},{0,1},{0,1},
-            {0,1},{0,1},{0,1},{1,1},
-            {1,1},{2,1},{2,1},{4,2}
-        };
-        int scale_dist =  b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0];
-        int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1];
-
-        int start_scale  = x264_clip3( minscale - scale_dist, 0, 127 );
-        int end_scale    = x264_clip3( minscale + scale_dist, 0, 127 );
-        for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ )
-        {
-            int cur_scale = i_scale;
-            int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead;
-            if( cur_offset < - 128 || cur_offset > 127 )
-            {
-                /* Rescale considering the constraints on cur_offset. We do it in this order
-                 * because scale has a much wider range than offset (because of denom), so
-                 * it should almost never need to be clamped. */
-                cur_offset = x264_clip3( cur_offset, -128, 127 );
-                cur_scale = (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f;
-                cur_scale = x264_clip3( cur_scale, 0, 127 );
-            }
-            int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 );
-            int end_offset   = x264_clip3( cur_offset + offset_dist, -128, 127 );
-            for( int i_off = start_offset; i_off <= end_offset; i_off++ )
-            {
-                SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off );
-                unsigned int s;
-                if( plane )
-                {
-                    if( CHROMA444 )
-                        s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane );
-                    else
-                        s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
-                }
-                else
-                    s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] );
-                COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 );
-
-                // Don't check any more offsets if the previous one had a lower cost than the current one
-                if( minoff == start_offset && i_off != start_offset )
-                    break;
-            }
-        }
-        x264_emms();
-
-        /* Use a smaller denominator if possible */
-        if( !plane )
-        {
-            while( mindenom > 0 && !(minscale&1) )
-            {
-                mindenom--;
-                minscale >>= 1;
-            }
-        }
-
-        /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
-        /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
-        if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )
-        {
-            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
-            continue;
-        }
-        else
-            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );
-
-        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
-            fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
-    }
-
-    /* Optimize and unify denominator */
-    if( weights[1].weightfn || weights[2].weightfn )
-    {
-        int denom = weights[1].weightfn ? weights[1].i_denom : weights[2].i_denom;
-        int both_weighted = weights[1].weightfn && weights[2].weightfn;
-        /* If only one plane is weighted, the other has an implicit scale of 1<<denom.
-         * With denom==7, this comes out to 128, which is invalid, so don't allow that. */
-        while( (!both_weighted && denom==7) ||
-               (denom > 0 && !(weights[1].weightfn && (weights[1].i_scale&1))
-                         && !(weights[2].weightfn && (weights[2].i_scale&1))) )
-        {
-            denom--;
-            for( int i = 1; i <= 2; i++ )
-                if( weights[i].weightfn )
-                {
-                    weights[i].i_scale >>= 1;
-                    weights[i].i_denom = denom;
-                }
-        }
-    }
-    for( int i = 1; i <= 2; i++ )
-        if( weights[i].weightfn )
-            h->mc.weight_cache( h, &weights[i] );
-
-    if( weights[0].weightfn && b_lookahead )
-    {
-        //scale lowres in lookahead for slicetype_frame_cost
-        pixel *src = ref->buffer_lowres[0];
-        pixel *dst = h->mb.p_weight_buf[0];
-        int width = ref->i_width_lowres + PADH*2;
-        int height = ref->i_lines_lowres + PADV*2;
-        x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
-                                 width, height, &weights[0] );
-        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
-    }
-}
-
-/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines
- * in multithreaded lookahead. */
-#define PAD_SIZE 32
-/* cost_est, cost_est_aq, intra_mbs, num rows */
-#define NUM_INTS 4
-#define COST_EST 0
-#define COST_EST_AQ 1
-#define INTRA_MBS 2
-#define NUM_ROWS 3
-#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start))
-
-/* Estimate the lowres (8x8) cost of the macroblock at h->mb.i_mb_x / h->mb.i_mb_y
- * of frames[b], predicted from frames[p0] (list 0) and, when b < p1, also from
- * frames[p1] (list 1).  p0 == p1 skips inter prediction entirely.
- *
- * do_search[l] selects, per list, whether a motion search is run here or whether
- * the MV and cost cached in fenc->lowres_mvs / fenc->lowres_mv_costs are reused.
- * w is the weight applied to list-0 prediction.
- *
- * Results are accumulated into output_inter / output_intra: the ROW_SATD slot is
- * always updated, while COST_EST, COST_EST_AQ and INTRA_MBS are only updated for
- * macroblocks that count toward the frame score.  The final cost together with
- * the list that produced it is stored in fenc->lowres_costs. */
-static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
-                                    x264_frame_t **frames, int p0, int p1, int b,
-                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w,
-                                    int *output_inter, int *output_intra )
-{
-    x264_frame_t *fref0 = frames[p0];
-    x264_frame_t *fref1 = frames[p1];
-    x264_frame_t *fenc  = frames[b];
-    const int b_bidir = (b < p1);
-    const int i_mb_x = h->mb.i_mb_x;
-    const int i_mb_y = h->mb.i_mb_y;
-    const int i_mb_stride = h->mb.i_mb_width;
-    const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
-    const int i_stride = fenc->i_stride_lowres;
-    const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride);
-    const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
-    int16_t (*fenc_mvs[2])[2] = { &fenc->lowres_mvs[0][b-p0-1][i_mb_xy], &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] };
-    int (*fenc_costs[2]) = { &fenc->lowres_mv_costs[0][b-p0-1][i_mb_xy], &fenc->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
-    /* Only interior macroblocks count toward the frame score, unless the frame is
-     * at most 2 macroblocks wide or tall, in which case every macroblock counts. */
-    int b_frame_score_mb = (i_mb_x > 0 && i_mb_x < h->mb.i_mb_width - 1 &&
-                            i_mb_y > 0 && i_mb_y < h->mb.i_mb_height - 1) ||
-                            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
-
-    ALIGNED_ARRAY_16( pixel, pix1,[9*FDEC_STRIDE] );
-    pixel *pix2 = pix1+8;
-    x264_me_t m[2];
-    int i_bcost = COST_MAX;
-    int list_used = 0;
-    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
-    int lowres_penalty = 4;
-
-    /* Copy the 8x8 lowres source block into the fenc buffer used by the compare functions. */
-    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
-    h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );
-
-    if( p0 == p1 )
-        goto lowres_intra_mb;
-
-    // no need for h->mb.mv_min[]
-    h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4;
-    h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
-    h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 );
-    h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 );
-    /* The vertical limits depend only on the row, so they are refreshed only on the
-     * two rightmost columns; x264_slicetype_slice_cost() walks each row from right
-     * to left, so one of those columns is always the first visited in a row. */
-    if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
-    {
-        h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4;
-        h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
-        h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 );
-        h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 );
-    }
-
-#define LOAD_HPELS_LUMA(dst, src) \
-    { \
-        (dst)[0] = &(src)[0][i_pel_offset]; \
-        (dst)[1] = &(src)[1][i_pel_offset]; \
-        (dst)[2] = &(src)[2][i_pel_offset]; \
-        (dst)[3] = &(src)[3][i_pel_offset]; \
-    }
-#define LOAD_WPELS_LUMA(dst,src) \
-    (dst) = &(src)[i_pel_offset];
-
-#define CLIP_MV( mv ) \
-    { \
-        mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
-        mv[1] = x264_clip3( mv[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); \
-    }
-/* TRY_BIDIR: build the bi-predicted 8x8 block for the MV pair (mv0, mv1), cost it
- * against the source block plus penalty * lambda, and offer the result to
- * i_bcost / list_used (as list 3) through COPY2_IF_LT. */
-#define TRY_BIDIR( mv0, mv1, penalty ) \
-    { \
-        int i_cost; \
-        if( h->param.analyse.i_subpel_refine <= 1 ) \
-        { \
-            int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
-            int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
-            pixel *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
-            pixel *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
-            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
-        } \
-        else \
-        { \
-            intptr_t stride1 = 16, stride2 = 16; \
-            pixel *src1, *src2; \
-            src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
-                                  (mv0)[0], (mv0)[1], 8, 8, w ); \
-            src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
-                                  (mv1)[0], (mv1)[1], 8, 8, w ); \
-            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
-        } \
-        i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \
-                           m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
-        COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
-    }
-
-    /* Motion-estimation context for list 0. */
-    m[0].i_pixel = PIXEL_8x8;
-    m[0].p_cost_mv = a->p_cost_mv;
-    m[0].i_stride[0] = i_stride;
-    m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
-    m[0].weight = w;
-    m[0].i_ref = 0;
-    LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres );
-    m[0].p_fref_w = m[0].p_fref[0];
-    if( w[0].weightfn )
-        LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );
-
-    if( b_bidir )
-    {
-        ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] );
-
-        /* Motion-estimation context for list 1; list 1 is never weighted here. */
-        m[1].i_pixel = PIXEL_8x8;
-        m[1].p_cost_mv = a->p_cost_mv;
-        m[1].i_stride[0] = i_stride;
-        m[1].p_fenc[0] = h->mb.pic.p_fenc[0];
-        m[1].i_ref = 0;
-        m[1].weight = x264_weight_none;
-        LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
-        m[1].p_fref_w = m[1].p_fref[0];
-
-        /* 0x7FFF in the first MV is the "not searched yet" marker (see the do_search
-         * setup in x264_slicetype_frame_cost).  When fref1's list-0 MVs towards p0
-         * exist, scale the co-located MV by dist_scale_factor to get a candidate pair. */
-        if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF )
-        {
-            int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
-            dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
-            dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
-            dmv[1][0] = dmv[0][0] - mvr[0];
-            dmv[1][1] = dmv[0][1] - mvr[1];
-            CLIP_MV( dmv[0] );
-            CLIP_MV( dmv[1] );
-            if( h->param.analyse.i_subpel_refine <= 1 )
-                M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */
-        }
-        else
-            M64( dmv ) = 0;
-
-        TRY_BIDIR( dmv[0], dmv[1], 0 );
-        /* If the candidate pair was non-zero, also try the zero/zero MV pair. */
-        if( M64( dmv ) )
-        {
-            int i_cost;
-            h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
-            i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
-            COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
-        }
-    }
-
-    for( int l = 0; l < 1 + b_bidir; l++ )
-    {
-        if( do_search[l] )
-        {
-            int i_mvc = 0;
-            int16_t (*fenc_mv)[2] = fenc_mvs[l];
-            ALIGNED_4( int16_t mvc[4][2] );
-
-            /* Reverse-order MV prediction. */
-            M32( mvc[0] ) = 0;
-            M32( mvc[2] ) = 0;
-#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
-            if( i_mb_x < h->mb.i_mb_width - 1 )
-                MVC( fenc_mv[1] );
-            if( i_mb_y < h->i_threadslice_end - 1 )
-            {
-                MVC( fenc_mv[i_mb_stride] );
-                if( i_mb_x > 0 )
-                    MVC( fenc_mv[i_mb_stride-1] );
-                if( i_mb_x < h->mb.i_mb_width - 1 )
-                    MVC( fenc_mv[i_mb_stride+1] );
-            }
-#undef MVC
-            if( i_mvc <= 1 )
-                CP32( m[l].mvp, mvc[0] );
-            else
-                x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
-
-            /* Fast skip for cases of near-zero residual.  Shortcut: don't bother except in the mv0 case,
-             * since anything else is likely to have enough residual to not trigger the skip. */
-            if( !M32( m[l].mvp ) )
-            {
-                m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
-                if( m[l].cost < 64 )
-                {
-                    M32( m[l].mv ) = 0;
-                    goto skip_motionest;
-                }
-            }
-
-            x264_me_search( h, &m[l], mvc, i_mvc );
-            m[l].cost -= a->p_cost_mv[0]; // remove mvcost from skip mbs
-            if( M32( m[l].mv ) )
-                m[l].cost += 5 * a->i_lambda;
-
-skip_motionest:
-            /* Cache the MV and cost so later calls with do_search[l] == 0 can reuse them. */
-            CP32( fenc_mvs[l], m[l].mv );
-            *fenc_costs[l] = m[l].cost;
-        }
-        else
-        {
-            CP32( m[l].mv, fenc_mvs[l] );
-            m[l].cost = *fenc_costs[l];
-        }
-        COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
-    }
-
-    if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
-        TRY_BIDIR( m[0].mv, m[1].mv, 5 );
-
-lowres_intra_mb:
-    /* The intra cost is computed once per frame and cached in fenc->i_intra_cost. */
-    if( !fenc->b_intra_calculated )
-    {
-        ALIGNED_ARRAY_16( pixel, edge,[36] );
-        pixel *pix = &pix1[8+FDEC_STRIDE];
-        pixel *src = &fenc->lowres[0][i_pel_offset];
-        const int intra_penalty = 5 * a->i_lambda;
-        int satds[3];
-        int pixoff = 4 / sizeof(pixel);
-
-        /* Avoid store forwarding stalls by writing larger chunks */
-        memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * sizeof(pixel) );
-        for( int i = -1; i < 8; i++ )
-            M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] );
-
-        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
-        int i_icost = X264_MIN3( satds[0], satds[1], satds[2] );
-
-        /* With higher subpel_refine settings, try additional intra predictors. */
-        if( h->param.analyse.i_subpel_refine > 1 )
-        {
-            h->predict_8x8c[I_PRED_CHROMA_P]( pix );
-            int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
-            i_icost = X264_MIN( i_icost, satd );
-            h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-            for( int i = 3; i < 9; i++ )
-            {
-                h->predict_8x8[i]( pix, edge );
-                satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
-                i_icost = X264_MIN( i_icost, satd );
-            }
-        }
-
-        i_icost = ((i_icost + intra_penalty) >> (BIT_DEPTH - 8)) + lowres_penalty;
-        fenc->i_intra_cost[i_mb_xy] = i_icost;
-        int i_icost_aq = i_icost;
-        if( h->param.rc.i_aq_mode )
-            i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        output_intra[ROW_SATD] += i_icost_aq;
-        if( b_frame_score_mb )
-        {
-            output_intra[COST_EST] += i_icost;
-            output_intra[COST_EST_AQ] += i_icost_aq;
-        }
-    }
-    i_bcost = (i_bcost >> (BIT_DEPTH - 8)) + lowres_penalty;
-
-    /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
-    /* FIXME: Should we still forbid them now that we cache intra scores? */
-    if( !b_bidir )
-    {
-        int i_icost = fenc->i_intra_cost[i_mb_xy];
-        int b_intra = i_icost < i_bcost;
-        if( b_intra )
-        {
-            i_bcost = i_icost;
-            list_used = 0;
-        }
-        if( b_frame_score_mb )
-            output_inter[INTRA_MBS] += b_intra;
-    }
-
-    /* In an I-frame, we've already added the results above in the intra section. */
-    if( p0 != p1 )
-    {
-        int i_bcost_aq = i_bcost;
-        if( h->param.rc.i_aq_mode )
-            i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        output_inter[ROW_SATD] += i_bcost_aq;
-        if( b_frame_score_mb )
-        {
-            /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
-            output_inter[COST_EST] += i_bcost;
-            output_inter[COST_EST_AQ] += i_bcost_aq;
-        }
-    }
-
-    /* Pack the cost (clamped to LOWRES_COST_MASK) and the list that produced it into one value. */
-    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
-}
-#undef TRY_BIDIR
-
-/* Macroblock count excluding a one-macroblock border when the frame is more than
- * 2 macroblocks wide and tall; otherwise the full macroblock count.
- * Expects a variable named `h` in scope at the point of use. */
-#define NUM_MBS\
-   (h->mb.i_mb_width > 2 && h->mb.i_mb_height > 2 ?\
-   (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
-    h->mb.i_mb_width * h->mb.i_mb_height)
-
-/* Argument bundle for x264_slicetype_slice_cost(), so that one slice's worth of
- * work can be handed to a thread pool through a single pointer.  Apart from h,
- * every field is forwarded as-is to x264_slicetype_mb_cost(). */
-typedef struct
-{
-    x264_t *h;                  /* encoder context whose i_threadslice_start/end bound the slice */
-    x264_mb_analysis_t *a;
-    x264_frame_t **frames;
-    int p0;                     /* index into frames of the list-0 reference */
-    int p1;                     /* index into frames of the list-1 reference */
-    int b;                      /* index into frames of the frame being costed */
-    int dist_scale_factor;
-    int *do_search;             /* two entries, one per reference list */
-    const x264_weight_t *w;
-    int *output_inter;          /* accumulator buffer for inter results */
-    int *output_intra;          /* accumulator buffer for intra results */
-} x264_slicetype_slice_t;
-
-/* Cost every macroblock of one thread slice.  The slice's row range is taken from
- * s->h->i_threadslice_start / i_threadslice_end; the remaining fields of *s are
- * passed through unchanged to x264_slicetype_mb_cost(). */
-static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s )
-{
-    x264_t *h = s->h;
-
-    /* Border macroblocks appear to lower the predictive quality of the frame score,
-     * yet they are required whenever a spatial cost distribution is needed. */
-    const int tiny_frame = h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
-    const int with_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || tiny_frame;
-
-    const int first_col = h->mb.i_mb_width - 2 + with_edges;
-    const int last_col = 1 - with_edges;
-    const int first_row = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + with_edges );
-    const int last_row = X264_MAX( h->i_threadslice_start, 1 - with_edges );
-
-    /* Rows and columns are visited in reverse order: the MVs found here serve as
-     * predictors in the main encode, and this order gives better MV prediction. */
-    h->mb.i_mb_y = first_row;
-    while( h->mb.i_mb_y >= last_row )
-    {
-        h->mb.i_mb_x = first_col;
-        while( h->mb.i_mb_x >= last_col )
-        {
-            x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
-                                    s->do_search, s->w, s->output_inter, s->output_intra );
-            h->mb.i_mb_x--;
-        }
-        h->mb.i_mb_y--;
-    }
-}
-
-/* Return the estimated cost of coding frames[b] with frames[p0] and frames[p1] as
- * its references (p0 == p1 makes x264_slicetype_mb_cost take its intra-only path).
- *
- * A result cached in fenc->i_cost_est is returned directly when present and, if
- * VBV is enabled, when the row SATDs have been filled in as well.  Otherwise the
- * lowres analysis is run -- through OpenCL when enabled, on the lookahead thread
- * pool when i_lookahead_threads > 1, or inline -- and the per-thread accumulators
- * are summed into the frame's cost fields. */
-static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
-                                      x264_frame_t **frames, int p0, int p1, int b )
-{
-    int i_score = 0;
-    int do_search[2];
-    const x264_weight_t *w = x264_weight_none;
-    x264_frame_t *fenc = frames[b];
-
-    /* Check whether we already evaluated this frame
-     * If we have tried this frame as P, then we have also tried
-     * the preceding frames as B. (is this still true?) */
-    /* Also check that we already calculated the row SATDs for the current frame. */
-    if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) )
-        i_score = fenc->i_cost_est[b-p0][p1-b];
-    else
-    {
-        int dist_scale_factor = 128;
-
-        /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
-        do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
-        do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
-        if( do_search[0] )
-        {
-            if( h->param.analyse.i_weighted_pred && b == p1 )
-            {
-                x264_emms();
-                x264_weights_analyse( h, fenc, frames[p0], 1 );
-                w = fenc->weight[0];
-            }
-            /* Clear the 0x7FFF "not searched" marker for this list. */
-            fenc->lowres_mvs[0][b-p0-1][0][0] = 0;
-        }
-        if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0;
-
-        /* Position of b between p0 and p1, scaled so that 256 would mean b == p1 (rounded). */
-        if( p1 != p0 )
-            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
-
-        /* Both accumulator areas live back to back in h->scratch_buffer2:
-         * first the inter outputs, then the intra outputs. */
-        int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads;
-        int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1];
-        int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1];
-        output_inter[0] = h->scratch_buffer2;
-        output_intra[0] = output_inter[0] + output_buf_size;
-
-#if HAVE_OPENCL
-        if( h->param.b_opencl )
-        {
-            x264_opencl_lowres_init(h, fenc, a->i_lambda );
-            if( do_search[0] )
-            {
-                x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
-                x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
-            }
-            if( do_search[1] )
-            {
-                x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
-                x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL );
-            }
-            if( b != p0 )
-                x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
-            x264_opencl_flush( h );
-
-            i_score = fenc->i_cost_est[b-p0][p1-b];
-        }
-        else
-#endif
-        {
-            if( h->param.i_lookahead_threads > 1 )
-            {
-                x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
-
-                for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-                {
-                    x264_t *t = h->lookahead_thread[i];
-
-                    /* FIXME move this somewhere else */
-                    t->mb.i_me_method = h->mb.i_me_method;
-                    t->mb.i_subpel_refine = h->mb.i_subpel_refine;
-                    t->mb.b_chroma_me = h->mb.b_chroma_me;
-
-                    s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
-                        output_inter[i], output_intra[i] };
-
-                    /* Split the rows between the threads, rounding to the nearest row. */
-                    t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
-                    t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
-
-                    int thread_height = t->i_threadslice_end - t->i_threadslice_start;
-                    int thread_output_size = thread_height + NUM_INTS;
-                    memset( output_inter[i], 0, thread_output_size * sizeof(int) );
-                    memset( output_intra[i], 0, thread_output_size * sizeof(int) );
-                    output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
-
-                    /* The next thread's buffers start PAD_SIZE ints after this thread's. */
-                    output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
-                    output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
-
-                    x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
-                }
-                for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-                    x264_threadpool_wait( h->lookaheadpool, &s[i] );
-            }
-            else
-            {
-                /* Single-threaded: one slice covering every row, run inline. */
-                h->i_threadslice_start = 0;
-                h->i_threadslice_end = h->mb.i_mb_height;
-                memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
-                memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
-                output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
-                x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
-                    output_inter[0], output_intra[0] };
-                x264_slicetype_slice_cost( &s );
-            }
-
-            /* Sum up accumulators */
-            if( b == p1 )
-                fenc->i_intra_mbs[b-p0] = 0;
-            if( !fenc->b_intra_calculated )
-            {
-                fenc->i_cost_est[0][0] = 0;
-                fenc->i_cost_est_aq[0][0] = 0;
-            }
-            fenc->i_cost_est[b-p0][p1-b] = 0;
-            fenc->i_cost_est_aq[b-p0][p1-b] = 0;
-
-            int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
-            int *row_satd_intra = fenc->i_row_satds[0][0];
-            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-            {
-                if( b == p1 )
-                    fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
-                if( !fenc->b_intra_calculated )
-                {
-                    fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
-                    fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
-                }
-
-                fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
-                fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
-
-                /* Row SATDs are only gathered when VBV is in use; each thread's rows are
-                 * appended after those of the previous thread. */
-                if( h->param.rc.i_vbv_buffer_size )
-                {
-                    int row_count = output_inter[i][NUM_ROWS];
-                    memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
-                    if( !fenc->b_intra_calculated )
-                        memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
-                    row_satd_inter += row_count;
-                    row_satd_intra += row_count;
-                }
-            }
-
-            i_score = fenc->i_cost_est[b-p0][p1-b];
-            /* For b != p1 the score is rescaled by 100 / (120 + i_bframe_bias); for
-             * b == p1 the intra costs computed during this pass are marked valid. */
-            if( b != p1 )
-                i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
-            else
-                fenc->b_intra_calculated = 1;
-
-            fenc->i_cost_est[b-p0][p1-b] = i_score;
-            x264_emms();
-        }
-    }
-
-    return i_score;
-}
-
-/* Recompute the frame cost of frames[b] from the per-macroblock lowres costs that
- * are already cached, applying the current QP offsets.  Used when MB-tree has
- * changed the quantizers, so that lookahead does not have to be run again.
- * Rewrites the frame's row SATDs for (b-p0, p1-b) and returns the frame score. */
-static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b )
-{
-    x264_frame_t *frame = frames[b];
-    int *row_satd = frame->i_row_satds[b-p0][p1-b];
-    uint16_t *mb_costs = frame->lowres_costs[b-p0][p1-b];
-    int frame_score = 0;
-
-    /* B-frames use the AQ-only offsets, other frame types the full offsets. */
-    float *qp_offset = frame->f_qp_offset;
-    if( IS_X264_TYPE_B(frame->i_type) )
-        qp_offset = frame->f_qp_offset_aq;
-
-    /* Frames at most 2 macroblocks wide or tall score every macroblock. */
-    const int score_all = h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
-
-    x264_emms();
-    h->mb.i_mb_y = h->mb.i_mb_height - 1;
-    while( h->mb.i_mb_y >= 0 )
-    {
-        const int inner_row = h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->mb.i_mb_height - 1;
-        int row_sum = 0;
-
-        h->mb.i_mb_x = h->mb.i_mb_width - 1;
-        while( h->mb.i_mb_x >= 0 )
-        {
-            const int inner_col = h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->mb.i_mb_width - 1;
-            int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
-            int i_mb_cost = mb_costs[i_mb_xy] & LOWRES_COST_MASK;
-            float qp_adj = qp_offset[i_mb_xy];
-            i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
-            row_sum += i_mb_cost;
-            /* Border macroblocks feed the row SATD but not the frame score. */
-            if( score_all || (inner_row && inner_col) )
-                frame_score += i_mb_cost;
-            h->mb.i_mb_x--;
-        }
-
-        row_satd[ h->mb.i_mb_y ] = row_sum;
-        h->mb.i_mb_y--;
-    }
-    return frame_score;
-}
-
-/* MB-tree gives up some precision in exchange for a larger value range. */
-#define MBTREE_PRECISION 0.5f
-
-/* Turn the accumulated propagate costs of `frame` into per-macroblock QP offsets:
- * f_qp_offset = f_qp_offset_aq - strength * log2_ratio for every macroblock whose
- * AQ-scaled intra cost is non-zero.  When ref0_distance is non-zero and the
- * matching f_weighted_cost_delta entry is positive, (1 - that entry) is added to
- * the log2 ratio. */
-static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
-{
-    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION );
-    float weightdelta = ( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
-                      ? (1.0 - frame->f_weighted_cost_delta[ref0_distance-1])
-                      : 0.0;
-
-    /* qcompress doubles as the strength control, because the two concepts
-     * are closely related. */
-    float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
-
-    const int mb_count = h->mb.i_mb_count;
-    int mb_index = 0;
-    while( mb_index < mb_count )
-    {
-        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8;
-        /* A zero intra cost leaves the existing offset untouched. */
-        if( intra_cost != 0 )
-        {
-            int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor + 128) >> 8;
-            float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
-            frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
-        }
-        mb_index++;
-    }
-}
-
-/* Propagate MB-tree costs from frames[b] into its references frames[p0] and,
- * when b != p1, frames[p1], one macroblock row at a time.  `referenced` says
- * whether frames[b] carries propagate costs of its own; if it does not, a single
- * zeroed row is used as the source for every row.  With VBV and lookahead both
- * enabled, a referenced frame is finished right away. */
-static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced )
-{
-    x264_frame_t *cur = frames[b];
-    uint16_t *cost_l0 = frames[p0]->i_propagate_cost;
-    uint16_t *cost_l1 = frames[p1]->i_propagate_cost;
-    int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
-    int weight_l0 = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
-    int weight_l1 = 64 - weight_l0;
-    int16_t (*mv_l0)[2] = cur->lowres_mvs[0][b-p0-1];
-    int16_t (*mv_l1)[2] = cur->lowres_mvs[1][p1-b-1];
-    int16_t *buf = h->scratch_buffer;
-    uint16_t *src_cost = cur->i_propagate_cost;
-    uint16_t *lowres_costs = cur->lowres_costs[b-p0][p1-b];
-
-    x264_emms();
-    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION;
-
-    /* An unreferenced frame always has zero source costs: clear one row and reuse it. */
-    if( !referenced )
-        memset( cur->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) );
-
-    h->mb.i_mb_y = 0;
-    while( h->mb.i_mb_y < h->mb.i_mb_height )
-    {
-        int row_start = h->mb.i_mb_y*h->mb.i_mb_stride;
-
-        h->mc.mbtree_propagate_cost( buf, src_cost,
-            cur->i_intra_cost+row_start, lowres_costs+row_start,
-            cur->i_inv_qscale_factor+row_start, &fps_factor, h->mb.i_mb_width );
-        /* Only a referenced frame advances through its own cost rows. */
-        if( referenced )
-            src_cost += h->mb.i_mb_width;
-
-        h->mc.mbtree_propagate_list( h, cost_l0, &mv_l0[row_start], buf, &lowres_costs[row_start],
-                                     weight_l0, h->mb.i_mb_y, h->mb.i_mb_width, 0 );
-        if( b != p1 )
-            h->mc.mbtree_propagate_list( h, cost_l1, &mv_l1[row_start], buf, &lowres_costs[row_start],
-                                         weight_l1, h->mb.i_mb_y, h->mb.i_mb_width, 1 );
-
-        h->mb.i_mb_y++;
-    }
-
-    if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced )
-        x264_macroblock_tree_finish( h, cur, average_duration, b == p1 ? b - p0 : 0 );
-}
-
-/* Run MB-tree over frames[0..num_frames].
- *
- * Starting from the last non-B frame, the frames are walked towards the start of
- * the array; for every group (previous non-B, B-frames in between, current non-B)
- * the frame costs are evaluated and the costs are propagated into the references.
- * The final x264_macroblock_tree_finish() call computes the QP offsets for the
- * earliest non-B frame reached.
- *
- * b_intra: when set, frames[0] is first costed as intra and the walk goes down to
- * index 0; otherwise the walk stops at index 1 (idx = !b_intra). */
-static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
-{
-    int idx = !b_intra;
-    int last_nonb, cur_nonb = 1;
-    int bframes = 0;
-
-    x264_emms();
-    /* Average duration over frames[0..num_frames], both ends included. */
-    float total_duration = 0.0;
-    for( int j = 0; j <= num_frames; j++ )
-        total_duration += frames[j]->f_duration;
-    float average_duration = total_duration / (num_frames + 1);
-
-    int i = num_frames;
-
-    if( b_intra )
-        x264_slicetype_frame_cost( h, a, frames, 0, 0, 0 );
-
-    /* Skip trailing B-frames to find the last non-B frame. */
-    while( i > 0 && IS_X264_TYPE_B( frames[i]->i_type ) )
-        i--;
-    last_nonb = i;
-
-    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
-     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
-     * lookahead=0, so that's what's currently implemented. */
-    if( !h->param.rc.i_lookahead )
-    {
-        if( b_intra )
-        {
-            memset( frames[0]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
-            memcpy( frames[0]->f_qp_offset, frames[0]->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
-            return;
-        }
-        XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
-        memset( frames[0]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
-    }
-    else
-    {
-        if( last_nonb < idx )
-            return;
-        memset( frames[last_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
-    }
-
-    /* Each pass handles one group: cur_nonb .. last_nonb and the B-frames between them. */
-    while( i-- > idx )
-    {
-        cur_nonb = i;
-        while( IS_X264_TYPE_B( frames[cur_nonb]->i_type ) && cur_nonb > 0 )
-            cur_nonb--;
-        if( cur_nonb < idx )
-            break;
-        x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, last_nonb );
-        memset( frames[cur_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
-        bframes = last_nonb - cur_nonb - 1;
-        if( h->param.i_bframe_pyramid && bframes > 1 )
-        {
-            /* With B-pyramid the middle B-frame is itself a reference: the B-frames on
-             * either side of it propagate into it, then it propagates into the two
-             * non-B frames. */
-            int middle = (bframes + 1)/2 + cur_nonb;
-            x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, middle );
-            memset( frames[middle]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
-            while( i > cur_nonb )
-            {
-                int p0 = i > middle ? middle : cur_nonb;
-                int p1 = i < middle ? middle : last_nonb;
-                if( i != middle )
-                {
-                    x264_slicetype_frame_cost( h, a, frames, p0, p1, i );
-                    x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 );
-                }
-                i--;
-            }
-            x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 );
-        }
-        else
-        {
-            while( i > cur_nonb )
-            {
-                x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i );
-                x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 );
-                i--;
-            }
-        }
-        x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 );
-        last_nonb = cur_nonb;
-    }
-
-    if( !h->param.rc.i_lookahead )
-    {
-        x264_slicetype_frame_cost( h, a, frames, 0, last_nonb, last_nonb );
-        x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
-        /* Undo the pointer swap done before the loop. */
-        XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
-    }
-
-    x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb );
-    /* Without VBV, x264_macroblock_tree_propagate() does not finish referenced frames
-     * itself, so the pyramid's middle B-frame of the last processed group is finished here. */
-    if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
-        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 );
-}
-
-/* Frame cost as used for VBV planning.  The regular frame cost is always
- * evaluated first.  Without AQ it is returned as-is; with AQ the result is either
- * recalculated from the current QP offsets (MB-tree on) or read from the cached
- * AQ-weighted estimate (MB-tree off). */
-static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
-{
-    int plain_cost = x264_slicetype_frame_cost( h, a, frames, p0, p1, b );
-
-    if( !h->param.rc.i_aq_mode )
-        return plain_cost;
-
-    return h->param.rc.b_mb_tree ? x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b )
-                                 : frames[b]->i_cost_est_aq[b-p0][p1-b];
-}
-
-/* Fill in the CPB/DPB timing fields of cur_frame and advance the running counters.
- *
- * cur_frame      frame whose i_cpb_delay, i_dpb_output_delay and i_cpb_duration are set
- * prev_frame     previously processed frame, or NULL; its i_cpb_duration is
- *                shortened when cur_frame's output delay had to be clamped
- * i_cpb_delay    running CPB delay; reset on keyframes (unless intra-refresh is
- *                in use) and then advanced by cur_frame->i_duration
- * i_coded_fields running count of coded fields, advanced by cur_frame->i_duration */
-static void x264_calculate_durations( x264_t *h, x264_frame_t *cur_frame, x264_frame_t *prev_frame, int64_t *i_cpb_delay, int64_t *i_coded_fields )
-{
-    cur_frame->i_cpb_delay = *i_cpb_delay;
-    cur_frame->i_dpb_output_delay = cur_frame->i_field_cnt - *i_coded_fields;
-
-    // add a correction term for frame reordering
-    cur_frame->i_dpb_output_delay += h->sps->vui.i_num_reorder_frames*2;
-
-    // fix possible negative dpb_output_delay because of pulldown changes and reordering
-    if( cur_frame->i_dpb_output_delay < 0 )
-    {
-        cur_frame->i_cpb_delay += cur_frame->i_dpb_output_delay;
-        /* Shorten the previous frame's CPB duration by the same (negative) amount.
-         * This must happen before the delay is clamped to zero below; done after
-         * the clamp it would always add 0 and the correction would be lost. */
-        if( prev_frame )
-            prev_frame->i_cpb_duration += cur_frame->i_dpb_output_delay;
-        cur_frame->i_dpb_output_delay = 0;
-    }
-
-    // don't reset cpb delay for IDR frames when using intra-refresh
-    if( cur_frame->b_keyframe && !h->param.b_intra_refresh )
-        *i_cpb_delay = 0;
-
-    *i_cpb_delay += cur_frame->i_duration;
-    *i_coded_fields += cur_frame->i_duration;
-    cur_frame->i_cpb_duration = cur_frame->i_duration;
-}
-
-/* Fill the planned-frame tables of one frame for VBV lookahead.
- *
- * The tables (i_planned_satd, i_planned_type, f_planned_cpb_duration) belong to
- * frames[next_nonb], where next_nonb is 0 for a keyframe and otherwise the first
- * non-B frame at index >= 1.  Entries are written in coded order: each non-B
- * frame first, then the B-frames that precede it in the array.  The type list is
- * terminated with X264_TYPE_AUTO. */
-static void x264_vbv_lookahead( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int keyframe )
-{
-    int last_nonb = 0, cur_nonb = 1, idx = 0;
-    x264_frame_t *prev_frame = NULL;
-    int prev_frame_idx = 0;
-    while( cur_nonb < num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) )
-        cur_nonb++;
-    int next_nonb = keyframe ? last_nonb : cur_nonb;
-
-    /* Restore the timing counters saved on this frame by an earlier call, if any. */
-    if( frames[cur_nonb]->i_coded_fields_lookahead >= 0 )
-    {
-        h->i_coded_fields_lookahead = frames[cur_nonb]->i_coded_fields_lookahead;
-        h->i_cpb_delay_lookahead = frames[cur_nonb]->i_cpb_delay_lookahead;
-    }
-
-    while( cur_nonb < num_frames )
-    {
-        /* P/I cost: This shouldn't include the cost of next_nonb */
-        if( next_nonb != cur_nonb )
-        {
-            int p0 = IS_X264_TYPE_I( frames[cur_nonb]->i_type ) ? cur_nonb : last_nonb;
-            frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, p0, cur_nonb, cur_nonb );
-            frames[next_nonb]->i_planned_type[idx] = frames[cur_nonb]->i_type;
-            /* Save the counters as they were before this frame was accounted for. */
-            frames[cur_nonb]->i_coded_fields_lookahead = h->i_coded_fields_lookahead;
-            frames[cur_nonb]->i_cpb_delay_lookahead = h->i_cpb_delay_lookahead;
-            x264_calculate_durations( h, frames[cur_nonb], prev_frame, &h->i_cpb_delay_lookahead, &h->i_coded_fields_lookahead );
-            /* Rewrite the previous entry from prev_frame's i_cpb_duration, which
-             * x264_calculate_durations() may adjust. */
-            if( prev_frame )
-            {
-                frames[next_nonb]->f_planned_cpb_duration[prev_frame_idx] = (double)prev_frame->i_cpb_duration *
-                                                                            h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-            }
-            frames[next_nonb]->f_planned_cpb_duration[idx] = (double)frames[cur_nonb]->i_cpb_duration *
-                                                             h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-            prev_frame = frames[cur_nonb];
-            prev_frame_idx = idx;
-            idx++;
-        }
-        /* Handle the B-frames: coded order */
-        for( int i = last_nonb+1; i < cur_nonb; i++, idx++ )
-        {
-            frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, last_nonb, cur_nonb, i );
-            frames[next_nonb]->i_planned_type[idx] = X264_TYPE_B;
-            frames[i]->i_coded_fields_lookahead = h->i_coded_fields_lookahead;
-            frames[i]->i_cpb_delay_lookahead = h->i_cpb_delay_lookahead;
-            x264_calculate_durations( h, frames[i], prev_frame, &h->i_cpb_delay_lookahead, &h->i_coded_fields_lookahead );
-            if( prev_frame )
-            {
-                frames[next_nonb]->f_planned_cpb_duration[prev_frame_idx] = (double)prev_frame->i_cpb_duration *
-                                                                            h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-            }
-            frames[next_nonb]->f_planned_cpb_duration[idx] = (double)frames[i]->i_cpb_duration *
-                                                             h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-            prev_frame = frames[i];
-            prev_frame_idx = idx;
-        }
-        /* Advance to the next non-B frame. */
-        last_nonb = cur_nonb;
-        cur_nonb++;
-        while( cur_nonb <= num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) )
-            cur_nonb++;
-    }
-    frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO;
-}
-
-static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
-{
-    int loc = 1;
-    int cost = 0;
-    int cur_nonb = 0;
-    path--; /* Since the 1st path element is really the second frame */
-    while( path[loc] )
-    {
-        int next_nonb = loc;
-        /* Find the location of the next non-B-frame. */
-        while( path[next_nonb] == 'B' )
-            next_nonb++;
-
-        /* Add the cost of the non-B-frame found above */
-        if( path[next_nonb] == 'P' )
-            cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_nonb );
-        else /* I-frame */
-            cost += x264_slicetype_frame_cost( h, a, frames, next_nonb, next_nonb, next_nonb );
-        /* Early terminate if the cost we have found is larger than the best path cost so far */
-        if( cost > threshold )
-            break;
-
-        if( h->param.i_bframe_pyramid && next_nonb - cur_nonb > 2 )
-        {
-            int middle = cur_nonb + (next_nonb - cur_nonb)/2;
-            cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, middle );
-            for( int next_b = loc; next_b < middle && cost < threshold; next_b++ )
-                cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, middle, next_b );
-            for( int next_b = middle+1; next_b < next_nonb && cost < threshold; next_b++ )
-                cost += x264_slicetype_frame_cost( h, a, frames, middle, next_nonb, next_b );
-        }
-        else
-            for( int next_b = loc; next_b < next_nonb && cost < threshold; next_b++ )
-                cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_b );
-
-        loc = next_nonb + 1;
-        cur_nonb = next_nonb;
-    }
-    return cost;
-}
-
-/* Viterbi/trellis slicetype decision algorithm. */
-/* Uses strings due to the fact that the speed of the control functions is
-   negligible compared to the cost of running slicetype_frame_cost, and because
-   it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, char (*best_paths)[X264_LOOKAHEAD_MAX+1] )
-{
-    char paths[2][X264_LOOKAHEAD_MAX+1];
-    int num_paths = X264_MIN( h->param.i_bframe+1, length );
-    int best_cost = COST_MAX;
-    int best_possible = 0;
-    int idx = 0;
-
-    /* Iterate over all currently possible paths */
-    for( int path = 0; path < num_paths; path++ )
-    {
-        /* Add suffixes to the current path */
-        int len = length - (path + 1);
-        memcpy( paths[idx], best_paths[len % (X264_BFRAME_MAX+1)], len );
-        memset( paths[idx]+len, 'B', path );
-        strcpy( paths[idx]+len+path, "P" );
-
-        int possible = 1;
-        for( int i = 1; i <= length; i++ )
-        {
-            int i_type = frames[i]->i_type;
-            if( i_type == X264_TYPE_AUTO )
-                continue;
-            if( IS_X264_TYPE_B( i_type ) )
-                possible = possible && (i < len || i == length || paths[idx][i-1] == 'B');
-            else
-            {
-                possible = possible && (i < len || paths[idx][i-1] != 'B');
-                paths[idx][i-1] = IS_X264_TYPE_I( i_type ) ? 'I' : 'P';
-            }
-        }
-
-        if( possible || !best_possible )
-        {
-            if( possible && !best_possible )
-                best_cost = COST_MAX;
-            /* Calculate the actual cost of the current path */
-            int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost );
-            if( cost < best_cost )
-            {
-                best_cost = cost;
-                best_possible = possible;
-                idx ^= 1;
-            }
-        }
-    }
-
-    /* Store the best path. */
-    memcpy( best_paths[length % (X264_BFRAME_MAX+1)], paths[idx^1], length );
-}
-
-static int scenecut_internal( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut )
-{
-    x264_frame_t *frame = frames[p1];
-
-    /* Don't do scenecuts on the right view of a frame-packed video. */
-    if( real_scenecut && h->param.i_frame_packing == 5 && (frame->i_frame&1) )
-        return 0;
-
-    x264_slicetype_frame_cost( h, a, frames, p0, p1, p1 );
-
-    int icost = frame->i_cost_est[0][0];
-    int pcost = frame->i_cost_est[p1-p0][0];
-    float f_bias;
-    int i_gop_size = frame->i_frame - h->lookahead->i_last_keyframe;
-    float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
-    /* magic numbers pulled out of thin air */
-    float f_thresh_min = f_thresh_max * 0.25;
-    int res;
-
-    if( h->param.i_keyint_min == h->param.i_keyint_max )
-        f_thresh_min = f_thresh_max;
-    if( i_gop_size <= h->param.i_keyint_min / 4 || h->param.b_intra_refresh )
-        f_bias = f_thresh_min / 4;
-    else if( i_gop_size <= h->param.i_keyint_min )
-        f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min;
-    else
-    {
-        f_bias = f_thresh_min
-                 + ( f_thresh_max - f_thresh_min )
-                 * ( i_gop_size - h->param.i_keyint_min )
-                 / ( h->param.i_keyint_max - h->param.i_keyint_min );
-    }
-
-    res = pcost >= (1.0 - f_bias) * icost;
-    if( res && real_scenecut )
-    {
-        int imb = frame->i_intra_mbs[p1-p0];
-        int pmb = NUM_MBS - imb;
-        x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
-                  frame->i_frame,
-                  icost, pcost, 1. - (double)pcost / icost,
-                  f_bias, i_gop_size, imb, pmb );
-    }
-    return res;
-}
-
-static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut, int num_frames, int i_max_search )
-{
-    /* Only do analysis during a normal scenecut check. */
-    if( real_scenecut && h->param.i_bframe )
-    {
-        int origmaxp1 = p0 + 1;
-        /* Look ahead to avoid coding short flashes as scenecuts. */
-        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
-            /* Don't analyse any more frames than the trellis would have covered. */
-            origmaxp1 += h->param.i_bframe;
-        else
-            origmaxp1++;
-        int maxp1 = X264_MIN( origmaxp1, num_frames );
-
-        /* Where A and B are scenes: AAAAAABBBAAAAAA
-         * If BBB is shorter than (maxp1-p0), it is detected as a flash
-         * and not considered a scenecut. */
-        for( int curp1 = p1; curp1 <= maxp1; curp1++ )
-            if( !scenecut_internal( h, a, frames, p0, curp1, 0 ) )
-                /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
-                for( int i = curp1; i > p0; i-- )
-                    frames[i]->b_scenecut = 0;
-
-        /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
-         * If each of BB ... EE are shorter than (maxp1-p0), they are
-         * detected as flashes and not considered scenecuts.
-         * Instead, the first F frame becomes a scenecut.
-         * If the video ends before F, no frame becomes a scenecut. */
-        for( int curp0 = p0; curp0 <= maxp1; curp0++ )
-            if( origmaxp1 > i_max_search || (curp0 < maxp1 && scenecut_internal( h, a, frames, curp0, maxp1, 0 )) )
-                /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
-                    frames[curp0]->b_scenecut = 0;
-    }
-
-    /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
-    if( !frames[p1]->b_scenecut )
-        return 0;
-    return scenecut_internal( h, a, frames, p0, p1, real_scenecut );
-}
-
-#define IS_X264_TYPE_AUTO_OR_I(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_I(x))
-#define IS_X264_TYPE_AUTO_OR_B(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_B(x))
-
-void x264_slicetype_analyse( x264_t *h, int intra_minigop )
-{
-    x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
-    int num_frames, orig_num_frames, keyint_limit, framecnt;
-    int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX );
-    int vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead;
-    /* For determinism we should limit the search to the number of frames lookahead has for sure
-     * in h->lookahead->next.list buffer, except at the end of stream.
-     * For normal calls with (intra_minigop == 0) that is h->lookahead->i_slicetype_length + 1 frames.
-     * And for I-frame calls (intra_minigop != 0) we already removed intra_minigop frames from there. */
-    if( h->param.b_deterministic )
-        i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + 1 - intra_minigop );
-    int keyframe = !!intra_minigop;
-
-    assert( h->frames.b_have_lowres );
-
-    if( !h->lookahead->last_nonb )
-        return;
-    frames[0] = h->lookahead->last_nonb;
-    for( framecnt = 0; framecnt < i_max_search; framecnt++ )
-        frames[framecnt+1] = h->lookahead->next.list[framecnt];
-
-    x264_lowres_context_init( h, &a );
-
-    if( !framecnt )
-    {
-        if( h->param.rc.b_mb_tree )
-            x264_macroblock_tree( h, &a, frames, 0, keyframe );
-        return;
-    }
-
-    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_keyframe - 1;
-    orig_num_frames = num_frames = h->param.b_intra_refresh ? framecnt : X264_MIN( framecnt, keyint_limit );
-
-    /* This is important psy-wise: if we have a non-scenecut keyframe,
-     * there will be significant visual artifacts if the frames just before
-     * go down in quality due to being referenced less, despite it being
-     * more RD-optimal. */
-    if( (h->param.analyse.b_psy && h->param.rc.b_mb_tree) || vbv_lookahead )
-        num_frames = framecnt;
-    else if( h->param.b_open_gop && num_frames < framecnt )
-        num_frames++;
-    else if( num_frames == 0 )
-    {
-        frames[1]->i_type = X264_TYPE_I;
-        return;
-    }
-
-    if( IS_X264_TYPE_AUTO_OR_I( frames[1]->i_type ) &&
-        h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) )
-    {
-        if( frames[1]->i_type == X264_TYPE_AUTO )
-            frames[1]->i_type = X264_TYPE_I;
-        return;
-    }
-
-#if HAVE_OPENCL
-    x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda );
-#endif
-
-    /* Replace forced keyframes with I/IDR-frames */
-    for( int j = 1; j <= num_frames; j++ )
-    {
-        if( frames[j]->i_type == X264_TYPE_KEYFRAME )
-            frames[j]->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR;
-    }
-
-    /* Close GOP at IDR-frames */
-    for( int j = 2; j <= num_frames; j++ )
-    {
-        if( frames[j]->i_type == X264_TYPE_IDR && IS_X264_TYPE_AUTO_OR_B( frames[j-1]->i_type ) )
-            frames[j-1]->i_type = X264_TYPE_P;
-    }
-
-    int num_analysed_frames = num_frames;
-    int reset_start;
-
-    if( h->param.i_bframe )
-    {
-        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
-        {
-            if( num_frames > 1 )
-            {
-                char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX+1] = {"","P"};
-                int best_path_index = num_frames % (X264_BFRAME_MAX+1);
-
-                /* Perform the frametype analysis. */
-                for( int j = 2; j <= num_frames; j++ )
-                    x264_slicetype_path( h, &a, frames, j, best_paths );
-
-                /* Load the results of the analysis into the frame types. */
-                for( int j = 1; j < num_frames; j++ )
-                {
-                    if( best_paths[best_path_index][j-1] != 'B' )
-                    {
-                        if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) )
-                            frames[j]->i_type = X264_TYPE_P;
-                    }
-                    else
-                    {
-                        if( frames[j]->i_type == X264_TYPE_AUTO )
-                            frames[j]->i_type = X264_TYPE_B;
-                    }
-                }
-            }
-        }
-        else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
-        {
-            int last_nonb = 0;
-            int num_bframes = h->param.i_bframe;
-            char path[X264_LOOKAHEAD_MAX+1];
-            for( int j = 1; j < num_frames; j++ )
-            {
-                if( j-1 > 0 && IS_X264_TYPE_B( frames[j-1]->i_type ) )
-                    num_bframes--;
-                else
-                {
-                    last_nonb = j-1;
-                    num_bframes = h->param.i_bframe;
-                }
-                if( !num_bframes )
-                {
-                    if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) )
-                        frames[j]->i_type = X264_TYPE_P;
-                    continue;
-                }
-
-                if( frames[j]->i_type != X264_TYPE_AUTO )
-                    continue;
-
-                if( IS_X264_TYPE_B( frames[j+1]->i_type ) )
-                {
-                    frames[j]->i_type = X264_TYPE_P;
-                    continue;
-                }
-
-                int bframes = j - last_nonb - 1;
-                memset( path, 'B', bframes );
-                strcpy( path+bframes, "PP" );
-                int cost_p = x264_slicetype_path_cost( h, &a, frames+last_nonb, path, COST_MAX );
-                strcpy( path+bframes, "BP" );
-                int cost_b = x264_slicetype_path_cost( h, &a, frames+last_nonb, path, cost_p );
-
-                if( cost_b < cost_p )
-                    frames[j]->i_type = X264_TYPE_B;
-                else
-                    frames[j]->i_type = X264_TYPE_P;
-            }
-        }
-        else
-        {
-            int num_bframes = h->param.i_bframe;
-            for( int j = 1; j < num_frames; j++ )
-            {
-                if( !num_bframes )
-                {
-                    if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) )
-                        frames[j]->i_type = X264_TYPE_P;
-                }
-                else if( frames[j]->i_type == X264_TYPE_AUTO )
-                {
-                    if( IS_X264_TYPE_B( frames[j+1]->i_type ) )
-                        frames[j]->i_type = X264_TYPE_P;
-                    else
-                        frames[j]->i_type = X264_TYPE_B;
-                }
-                if( IS_X264_TYPE_B( frames[j]->i_type ) )
-                    num_bframes--;
-                else
-                    num_bframes = h->param.i_bframe;
-            }
-        }
-        if( IS_X264_TYPE_AUTO_OR_B( frames[num_frames]->i_type ) )
-            frames[num_frames]->i_type = X264_TYPE_P;
-
-        int num_bframes = 0;
-        while( num_bframes < num_frames && IS_X264_TYPE_B( frames[num_bframes+1]->i_type ) )
-            num_bframes++;
-
-        /* Check scenecut on the first minigop. */
-        for( int j = 1; j < num_bframes+1; j++ )
-        {
-            if( frames[j]->i_forced_type == X264_TYPE_AUTO && IS_X264_TYPE_AUTO_OR_I( frames[j+1]->i_forced_type ) &&
-                h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) )
-            {
-                frames[j]->i_type = X264_TYPE_P;
-                num_analysed_frames = j;
-                break;
-            }
-        }
-
-        reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 );
-    }
-    else
-    {
-        for( int j = 1; j <= num_frames; j++ )
-            if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) )
-                frames[j]->i_type = X264_TYPE_P;
-        reset_start = !keyframe + 1;
-    }
-
-    /* Perform the actual macroblock tree analysis.
-     * Don't go farther than the maximum keyframe interval; this helps in short GOPs. */
-    if( h->param.rc.b_mb_tree )
-        x264_macroblock_tree( h, &a, frames, X264_MIN(num_frames, h->param.i_keyint_max), keyframe );
-
-    /* Enforce keyframe limit. */
-    if( !h->param.b_intra_refresh )
-    {
-        int last_keyframe = h->lookahead->i_last_keyframe;
-        int last_possible = 0;
-        for( int j = 1; j <= num_frames; j++ )
-        {
-            x264_frame_t *frm = frames[j];
-            int keyframe_dist = frm->i_frame - last_keyframe;
-
-            if( IS_X264_TYPE_AUTO_OR_I( frm->i_forced_type ) )
-            {
-                if( h->param.b_open_gop || !IS_X264_TYPE_B( frames[j-1]->i_forced_type ) )
-                    last_possible = j;
-            }
-            if( keyframe_dist >= h->param.i_keyint_max )
-            {
-                if( last_possible != 0 && last_possible != j )
-                {
-                    j = last_possible;
-                    frm = frames[j];
-                    keyframe_dist = frm->i_frame - last_keyframe;
-                }
-                last_possible = 0;
-                if( frm->i_type != X264_TYPE_IDR )
-                    frm->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR;
-            }
-            if( frm->i_type == X264_TYPE_I && keyframe_dist >= h->param.i_keyint_min )
-            {
-                if( h->param.b_open_gop )
-                {
-                    last_keyframe = frm->i_frame;
-                    if( h->param.b_bluray_compat )
-                    {
-                        // Use bluray order
-                        int bframes = 0;
-                        while( bframes < j-1 && IS_X264_TYPE_B( frames[j-1-bframes]->i_type ) )
-                            bframes++;
-                        last_keyframe -= bframes;
-                    }
-                }
-                else if( frm->i_forced_type != X264_TYPE_I )
-                    frm->i_type = X264_TYPE_IDR;
-            }
-            if( frm->i_type == X264_TYPE_IDR )
-            {
-                last_keyframe = frm->i_frame;
-                if( j > 1 && IS_X264_TYPE_B( frames[j-1]->i_type ) )
-                    frames[j-1]->i_type = X264_TYPE_P;
-            }
-        }
-    }
-
-    if( vbv_lookahead )
-        x264_vbv_lookahead( h, &a, frames, num_frames, keyframe );
-
-    /* Restore frametypes for all frames that haven't actually been decided yet. */
-    for( int j = reset_start; j <= num_frames; j++ )
-        frames[j]->i_type = frames[j]->i_forced_type;
-
-#if HAVE_OPENCL
-    x264_opencl_slicetype_end( h );
-#endif
-}
-
-void x264_slicetype_decide( x264_t *h )
-{
-    x264_frame_t *frames[X264_BFRAME_MAX+2];
-    x264_frame_t *frm;
-    int bframes;
-    int brefs;
-
-    if( !h->lookahead->next.i_size )
-        return;
-
-    int lookahead_size = h->lookahead->next.i_size;
-
-    for( int i = 0; i < h->lookahead->next.i_size; i++ )
-    {
-        if( h->param.b_vfr_input )
-        {
-            if( lookahead_size-- > 1 )
-                h->lookahead->next.list[i]->i_duration = 2 * (h->lookahead->next.list[i+1]->i_pts - h->lookahead->next.list[i]->i_pts);
-            else
-                h->lookahead->next.list[i]->i_duration = h->i_prev_duration;
-        }
-        else
-            h->lookahead->next.list[i]->i_duration = delta_tfi_divisor[h->lookahead->next.list[i]->i_pic_struct];
-        h->i_prev_duration = h->lookahead->next.list[i]->i_duration;
-        h->lookahead->next.list[i]->f_duration = (double)h->lookahead->next.list[i]->i_duration
-                                               * h->sps->vui.i_num_units_in_tick
-                                               / h->sps->vui.i_time_scale;
-
-        if( h->lookahead->next.list[i]->i_frame > h->i_disp_fields_last_frame && lookahead_size > 0 )
-        {
-            h->lookahead->next.list[i]->i_field_cnt = h->i_disp_fields;
-            h->i_disp_fields += h->lookahead->next.list[i]->i_duration;
-            h->i_disp_fields_last_frame = h->lookahead->next.list[i]->i_frame;
-        }
-        else if( lookahead_size == 0 )
-        {
-            h->lookahead->next.list[i]->i_field_cnt = h->i_disp_fields;
-            h->lookahead->next.list[i]->i_duration = h->i_prev_duration;
-        }
-    }
-
-    if( h->param.rc.b_stat_read )
-    {
-        /* Use the frame types from the first pass */
-        for( int i = 0; i < h->lookahead->next.i_size; i++ )
-            h->lookahead->next.list[i]->i_type =
-                x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame );
-    }
-    else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
-             || h->param.i_scenecut_threshold
-             || h->param.rc.b_mb_tree
-             || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) )
-        x264_slicetype_analyse( h, 0 );
-
-    for( bframes = 0, brefs = 0;; bframes++ )
-    {
-        frm = h->lookahead->next.list[bframes];
-
-        if( frm->i_forced_type != X264_TYPE_AUTO && frm->i_type != frm->i_forced_type &&
-            !(frm->i_forced_type == X264_TYPE_KEYFRAME && IS_X264_TYPE_I( frm->i_type )) )
-        {
-            x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d was changed to frame type (%d)\n",
-                      frm->i_forced_type, frm->i_frame, frm->i_type );
-        }
-
-        if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL &&
-            brefs == h->param.i_bframe_pyramid )
-        {
-            frm->i_type = X264_TYPE_B;
-            x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s \n",
-                      frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid] );
-        }
-        /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
-           smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
-        else if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL &&
-            brefs && h->param.i_frame_reference <= (brefs+3) )
-        {
-            frm->i_type = X264_TYPE_B;
-            x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s and %d reference frames\n",
-                      frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid], h->param.i_frame_reference );
-        }
-
-        if( frm->i_type == X264_TYPE_KEYFRAME )
-            frm->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR;
-
-        /* Limit GOP size */
-        if( (!h->param.b_intra_refresh || frm->i_frame == 0) && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_max )
-        {
-            if( frm->i_type == X264_TYPE_AUTO || frm->i_type == X264_TYPE_I )
-                frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
-            int warn = frm->i_type != X264_TYPE_IDR;
-            if( warn && h->param.b_open_gop )
-                warn &= frm->i_type != X264_TYPE_I;
-            if( warn )
-            {
-                x264_log( h, X264_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm->i_type, frm->i_frame );
-                frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
-            }
-        }
-        if( frm->i_type == X264_TYPE_I && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min )
-        {
-            if( h->param.b_open_gop )
-            {
-                h->lookahead->i_last_keyframe = frm->i_frame; // Use display order
-                if( h->param.b_bluray_compat )
-                    h->lookahead->i_last_keyframe -= bframes; // Use bluray order
-                frm->b_keyframe = 1;
-            }
-            else
-                frm->i_type = X264_TYPE_IDR;
-        }
-        if( frm->i_type == X264_TYPE_IDR )
-        {
-            /* Close GOP */
-            h->lookahead->i_last_keyframe = frm->i_frame;
-            frm->b_keyframe = 1;
-            if( bframes > 0 )
-            {
-                bframes--;
-                h->lookahead->next.list[bframes]->i_type = X264_TYPE_P;
-            }
-        }
-
-        if( bframes == h->param.i_bframe ||
-            !h->lookahead->next.list[bframes+1] )
-        {
-            if( IS_X264_TYPE_B( frm->i_type ) )
-                x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" );
-            if( frm->i_type == X264_TYPE_AUTO
-                || IS_X264_TYPE_B( frm->i_type ) )
-                frm->i_type = X264_TYPE_P;
-        }
-
-        if( frm->i_type == X264_TYPE_BREF )
-            brefs++;
-
-        if( frm->i_type == X264_TYPE_AUTO )
-            frm->i_type = X264_TYPE_B;
-
-        else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
-    }
-
-    if( bframes )
-        h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1;
-    h->lookahead->next.list[bframes]->i_bframes = bframes;
-
-    /* insert a bref into the sequence */
-    if( h->param.i_bframe_pyramid && bframes > 1 && !brefs )
-    {
-        h->lookahead->next.list[(bframes-1)/2]->i_type = X264_TYPE_BREF;
-        brefs++;
-    }
-
-    /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
-    if( h->param.rc.i_rc_method != X264_RC_CQP )
-    {
-        x264_mb_analysis_t a;
-        int p0, p1, b;
-        p1 = b = bframes + 1;
-
-        x264_lowres_context_init( h, &a );
-
-        frames[0] = h->lookahead->last_nonb;
-        memcpy( &frames[1], h->lookahead->next.list, (bframes+1) * sizeof(x264_frame_t*) );
-        if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) )
-            p0 = bframes + 1;
-        else // P
-            p0 = 0;
-
-        x264_slicetype_frame_cost( h, &a, frames, p0, p1, b );
-
-        if( (p0 != p1 || bframes) && h->param.rc.i_vbv_buffer_size )
-        {
-            /* We need the intra costs for row SATDs. */
-            x264_slicetype_frame_cost( h, &a, frames, b, b, b );
-
-            /* We need B-frame costs for row SATDs. */
-            p0 = 0;
-            for( b = 1; b <= bframes; b++ )
-            {
-                if( frames[b]->i_type == X264_TYPE_B )
-                    for( p1 = b; frames[p1]->i_type == X264_TYPE_B; )
-                        p1++;
-                else
-                    p1 = bframes + 1;
-                x264_slicetype_frame_cost( h, &a, frames, p0, p1, b );
-                if( frames[b]->i_type == X264_TYPE_BREF )
-                    p0 = b;
-            }
-        }
-    }
-
-    /* Analyse for weighted P frames */
-    if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
-        && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )
-    {
-        x264_emms();
-        x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
-    }
-
-    /* shift sequence to coded order.
-       use a small temporary list to avoid shifting the entire next buffer around */
-    int i_coded = h->lookahead->next.list[0]->i_frame;
-    if( bframes )
-    {
-        int idx_list[] = { brefs+1, 1 };
-        for( int i = 0; i < bframes; i++ )
-        {
-            int idx = idx_list[h->lookahead->next.list[i]->i_type == X264_TYPE_BREF]++;
-            frames[idx] = h->lookahead->next.list[i];
-            frames[idx]->i_reordered_pts = h->lookahead->next.list[idx]->i_pts;
-        }
-        frames[0] = h->lookahead->next.list[bframes];
-        frames[0]->i_reordered_pts = h->lookahead->next.list[0]->i_pts;
-        memcpy( h->lookahead->next.list, frames, (bframes+1) * sizeof(x264_frame_t*) );
-    }
-
-    for( int i = 0; i <= bframes; i++ )
-    {
-        h->lookahead->next.list[i]->i_coded = i_coded++;
-        if( i )
-        {
-            x264_calculate_durations( h, h->lookahead->next.list[i], h->lookahead->next.list[i-1], &h->i_cpb_delay, &h->i_coded_fields );
-            h->lookahead->next.list[0]->f_planned_cpb_duration[i-1] = (double)h->lookahead->next.list[i]->i_cpb_duration *
-                                                                      h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-        }
-        else
-            x264_calculate_durations( h, h->lookahead->next.list[i], NULL, &h->i_cpb_delay, &h->i_coded_fields );
-    }
-}
-
-int x264_rc_analyse_slice( x264_t *h )
-{
-    int p0 = 0, p1, b;
-    int cost;
-    x264_emms();
-
-    if( IS_X264_TYPE_I(h->fenc->i_type) )
-        p1 = b = 0;
-    else if( h->fenc->i_type == X264_TYPE_P )
-        p1 = b = h->fenc->i_bframes + 1;
-    else //B
-    {
-        p1 = (h->fref_nearest[1]->i_poc - h->fref_nearest[0]->i_poc)/2;
-        b  = (h->fenc->i_poc - h->fref_nearest[0]->i_poc)/2;
-    }
-    /* We don't need to assign p0/p1 since we are not performing any real analysis here. */
-    x264_frame_t **frames = &h->fenc - b;
-
-    /* cost should have been already calculated by x264_slicetype_decide */
-    cost = frames[b]->i_cost_est[b-p0][p1-b];
-    assert( cost >= 0 );
-
-    if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
-    {
-        cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
-        if( b && h->param.rc.i_vbv_buffer_size )
-            x264_slicetype_frame_cost_recalculate( h, frames, b, b, b );
-    }
-    /* In AQ, use the weighted score instead. */
-    else if( h->param.rc.i_aq_mode )
-        cost = frames[b]->i_cost_est_aq[b-p0][p1-b];
-
-    h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
-    h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
-    h->fdec->i_satd = cost;
-    memcpy( h->fdec->i_row_satd, h->fenc->i_row_satd, h->mb.i_mb_height * sizeof(int) );
-    if( !IS_X264_TYPE_I(h->fenc->i_type) )
-        memcpy( h->fdec->i_row_satds[0][0], h->fenc->i_row_satds[0][0], h->mb.i_mb_height * sizeof(int) );
-
-    if( h->param.b_intra_refresh && h->param.rc.i_vbv_buffer_size && h->fenc->i_type == X264_TYPE_P )
-    {
-        int ip_factor = 256 * h->param.rc.f_ip_factor; /* fix8 */
-        for( int y = 0; y < h->mb.i_mb_height; y++ )
-        {
-            int mb_xy = y * h->mb.i_mb_stride + h->fdec->i_pir_start_col;
-            for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
-            {
-                int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
-                int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK;
-                int diff = intra_cost - inter_cost;
-                if( h->param.rc.i_aq_mode )
-                    h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
-                else
-                    h->fdec->i_row_satd[y] += diff;
-                cost += diff;
-            }
-        }
-    }
-
-    return cost;
-}
diff --git a/android/src/main/libenc/jni/libx264/example.c b/android/src/main/libenc/jni/libx264/example.c
deleted file mode 100755
index 1cbbd93..0000000
--- a/android/src/main/libenc/jni/libx264/example.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*****************************************************************************
- * example.c: libx264 API usage example
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Anton Mitrofanov <BugMaster@narod.ru>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifdef _WIN32
-#include <io.h>       /* _setmode() */
-#include <fcntl.h>    /* _O_BINARY */
-#endif
-
-#include <stdint.h>
-#include <stdio.h>
-#include <x264.h>
-
-#define FAIL_IF_ERROR( cond, ... )\
-do\
-{\
-    if( cond )\
-    {\
-        fprintf( stderr, __VA_ARGS__ );\
-        goto fail;\
-    }\
-} while( 0 )
-
-int main( int argc, char **argv )
-{
-    int width, height;
-    x264_param_t param;
-    x264_picture_t pic;
-    x264_picture_t pic_out;
-    x264_t *h;
-    int i_frame = 0;
-    int i_frame_size;
-    x264_nal_t *nal;
-    int i_nal;
-
-#ifdef _WIN32
-    _setmode( _fileno( stdin ),  _O_BINARY );
-    _setmode( _fileno( stdout ), _O_BINARY );
-    _setmode( _fileno( stderr ), _O_BINARY );
-#endif
-
-    FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 <input.yuv >output.h264\n" );
-    FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" );
-
-    /* Get default params for preset/tuning */
-    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
-        goto fail;
-
-    /* Configure non-default params */
-    param.i_csp = X264_CSP_I420;
-    param.i_width  = width;
-    param.i_height = height;
-    param.b_vfr_input = 0;
-    param.b_repeat_headers = 1;
-    param.b_annexb = 1;
-
-    /* Apply profile restrictions. */
-    if( x264_param_apply_profile( &param, "high" ) < 0 )
-        goto fail;
-
-    if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 )
-        goto fail;
-#undef fail
-#define fail fail2
-
-    h = x264_encoder_open( &param );
-    if( !h )
-        goto fail;
-#undef fail
-#define fail fail3
-
-    int luma_size = width * height;
-    int chroma_size = luma_size / 4;
-    /* Encode frames */
-    for( ;; i_frame++ )
-    {
-        /* Read input frame */
-        if( fread( pic.img.plane[0], 1, luma_size, stdin ) != luma_size )
-            break;
-        if( fread( pic.img.plane[1], 1, chroma_size, stdin ) != chroma_size )
-            break;
-        if( fread( pic.img.plane[2], 1, chroma_size, stdin ) != chroma_size )
-            break;
-
-        pic.i_pts = i_frame;
-        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic, &pic_out );
-        if( i_frame_size < 0 )
-            goto fail;
-        else if( i_frame_size )
-        {
-            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
-                goto fail;
-        }
-    }
-    /* Flush delayed frames */
-    while( x264_encoder_delayed_frames( h ) )
-    {
-        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out );
-        if( i_frame_size < 0 )
-            goto fail;
-        else if( i_frame_size )
-        {
-            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
-                goto fail;
-        }
-    }
-
-    x264_encoder_close( h );
-    x264_picture_clean( &pic );
-    return 0;
-
-#undef fail
-fail3:
-    x264_encoder_close( h );
-fail2:
-    x264_picture_clean( &pic );
-fail:
-    return -1;
-}
diff --git a/android/src/main/libenc/jni/libx264/extras/avisynth_c.h b/android/src/main/libenc/jni/libx264/extras/avisynth_c.h
deleted file mode 100755
index 8159879..0000000
--- a/android/src/main/libenc/jni/libx264/extras/avisynth_c.h
+++ /dev/null
@@ -1,984 +0,0 @@
-// Avisynth C Interface Version 0.20
-// Copyright 2003 Kevin Atkinson
-
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
-// http://www.gnu.org/copyleft/gpl.html .
-//
-// As a special exception, I give you permission to link to the
-// Avisynth C interface with independent modules that communicate with
-// the Avisynth C interface solely through the interfaces defined in
-// avisynth_c.h, regardless of the license terms of these independent
-// modules, and to copy and distribute the resulting combined work
-// under terms of your choice, provided that every copy of the
-// combined work is accompanied by a complete copy of the source code
-// of the Avisynth C interface and Avisynth itself (with the version
-// used to produce the combined work), being distributed under the
-// terms of the GNU General Public License plus this exception.  An
-// independent module is a module which is not derived from or based
-// on Avisynth C Interface, such as 3rd-party filters, import and
-// export plugins, or graphical user interfaces.
-
-// NOTE: this is a partial update of the Avisynth C interface to recognize
-// new color spaces added in Avisynth 2.60. By no means is this document
-// completely Avisynth 2.60 compliant.
-
-#ifndef __AVISYNTH_C__
-#define __AVISYNTH_C__
-
-#ifdef __cplusplus
-#  define EXTERN_C extern "C"
-#else
-#  define EXTERN_C
-#endif
-
-#define AVSC_USE_STDCALL 1
-
-#ifndef AVSC_USE_STDCALL
-#  define AVSC_CC __cdecl
-#else
-#  define AVSC_CC __stdcall
-#endif
-
-#define AVSC_INLINE static __inline
-
-#ifdef AVISYNTH_C_EXPORTS
-#  define AVSC_EXPORT __declspec(dllexport)
-#  define AVSC_API(ret, name) EXTERN_C AVSC_EXPORT ret AVSC_CC name
-#else
-#  define AVSC_EXPORT __declspec(dllimport)
-#  ifndef AVSC_NO_DECLSPEC
-#    define AVSC_API(ret, name) EXTERN_C AVSC_EXPORT ret AVSC_CC name
-#  else
-#    define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func)
-#  endif
-#endif
-
-typedef unsigned char BYTE;
-#ifdef __GNUC__
-typedef long long int INT64;
-#else
-typedef __int64 INT64;
-#endif
-
-
-/////////////////////////////////////////////////////////////////////
-//
-// Constants
-//
-
-#ifndef __AVISYNTH_6_H__
-enum { AVISYNTH_INTERFACE_VERSION = 6 };
-#endif
-
-enum {AVS_SAMPLE_INT8  = 1<<0,
-      AVS_SAMPLE_INT16 = 1<<1,
-      AVS_SAMPLE_INT24 = 1<<2,
-      AVS_SAMPLE_INT32 = 1<<3,
-      AVS_SAMPLE_FLOAT = 1<<4};
-
-enum {AVS_PLANAR_Y=1<<0,
-      AVS_PLANAR_U=1<<1,
-      AVS_PLANAR_V=1<<2,
-      AVS_PLANAR_ALIGNED=1<<3,
-      AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_A=1<<4,
-      AVS_PLANAR_R=1<<5,
-      AVS_PLANAR_G=1<<6,
-      AVS_PLANAR_B=1<<7,
-      AVS_PLANAR_A_ALIGNED=AVS_PLANAR_A|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_R_ALIGNED=AVS_PLANAR_R|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_G_ALIGNED=AVS_PLANAR_G|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_B_ALIGNED=AVS_PLANAR_B|AVS_PLANAR_ALIGNED};
-
-  // Colorspace properties.
-enum {AVS_CS_BGR = 1<<28,
-      AVS_CS_YUV = 1<<29,
-      AVS_CS_INTERLEAVED = 1<<30,
-      AVS_CS_PLANAR = 1<<31,
-
-      AVS_CS_SHIFT_SUB_WIDTH   = 0,
-      AVS_CS_SHIFT_SUB_HEIGHT  = 8,
-      AVS_CS_SHIFT_SAMPLE_BITS = 16,
-
-      AVS_CS_SUB_WIDTH_MASK    = 7 << AVS_CS_SHIFT_SUB_WIDTH,
-      AVS_CS_SUB_WIDTH_1       = 3 << AVS_CS_SHIFT_SUB_WIDTH, // YV24
-      AVS_CS_SUB_WIDTH_2       = 0 << AVS_CS_SHIFT_SUB_WIDTH, // YV12, I420, YV16
-      AVS_CS_SUB_WIDTH_4       = 1 << AVS_CS_SHIFT_SUB_WIDTH, // YUV9, YV411
-
-      AVS_CS_VPLANEFIRST       = 1 << 3, // YV12, YV16, YV24, YV411, YUV9
-      AVS_CS_UPLANEFIRST       = 1 << 4, // I420
-
-      AVS_CS_SUB_HEIGHT_MASK   = 7 << AVS_CS_SHIFT_SUB_HEIGHT,
-      AVS_CS_SUB_HEIGHT_1      = 3 << AVS_CS_SHIFT_SUB_HEIGHT, // YV16, YV24, YV411
-      AVS_CS_SUB_HEIGHT_2      = 0 << AVS_CS_SHIFT_SUB_HEIGHT, // YV12, I420
-      AVS_CS_SUB_HEIGHT_4      = 1 << AVS_CS_SHIFT_SUB_HEIGHT, // YUV9
-
-      AVS_CS_SAMPLE_BITS_MASK  = 7 << AVS_CS_SHIFT_SAMPLE_BITS,
-      AVS_CS_SAMPLE_BITS_8     = 0 << AVS_CS_SHIFT_SAMPLE_BITS,
-      AVS_CS_SAMPLE_BITS_16    = 1 << AVS_CS_SHIFT_SAMPLE_BITS,
-      AVS_CS_SAMPLE_BITS_32    = 2 << AVS_CS_SHIFT_SAMPLE_BITS,
-
-      AVS_CS_PLANAR_MASK       = AVS_CS_PLANAR | AVS_CS_INTERLEAVED | AVS_CS_YUV | AVS_CS_BGR | AVS_CS_SAMPLE_BITS_MASK | AVS_CS_SUB_HEIGHT_MASK | AVS_CS_SUB_WIDTH_MASK,
-      AVS_CS_PLANAR_FILTER     = ~( AVS_CS_VPLANEFIRST | AVS_CS_UPLANEFIRST )};
-
-  // Specific colorformats
-enum {
-  AVS_CS_UNKNOWN = 0,
-  AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
-  AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
-  AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED,
-  //  AVS_CS_YV12  = 1<<3  Reserved
-  //  AVS_CS_I420  = 1<<4  Reserved
-  AVS_CS_RAW32 = 1<<5 | AVS_CS_INTERLEAVED,
-
-  AVS_CS_YV24  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_1,  // YVU 4:4:4 planar
-  AVS_CS_YV16  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_2,  // YVU 4:2:2 planar
-  AVS_CS_YV12  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2,  // YVU 4:2:0 planar
-  AVS_CS_I420  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_UPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2,  // YUV 4:2:0 planar
-  AVS_CS_IYUV  = AVS_CS_I420,
-  AVS_CS_YV411 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_4,  // YVU 4:1:1 planar
-  AVS_CS_YUV9  = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_4 | AVS_CS_SUB_WIDTH_4,  // YVU 4:1:0 planar
-  AVS_CS_Y8    = AVS_CS_PLANAR | AVS_CS_INTERLEAVED | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8                                              // Y   4:0:0 planar
-};
-
-enum {
-  AVS_IT_BFF = 1<<0,
-  AVS_IT_TFF = 1<<1,
-  AVS_IT_FIELDBASED = 1<<2};
-
-enum {
-  AVS_FILTER_TYPE=1,
-  AVS_FILTER_INPUT_COLORSPACE=2,
-  AVS_FILTER_OUTPUT_TYPE=9,
-  AVS_FILTER_NAME=4,
-  AVS_FILTER_AUTHOR=5,
-  AVS_FILTER_VERSION=6,
-  AVS_FILTER_ARGS=7,
-  AVS_FILTER_ARGS_INFO=8,
-  AVS_FILTER_ARGS_DESCRIPTION=10,
-  AVS_FILTER_DESCRIPTION=11};
-
-enum {  //SUBTYPES
-  AVS_FILTER_TYPE_AUDIO=1,
-  AVS_FILTER_TYPE_VIDEO=2,
-  AVS_FILTER_OUTPUT_TYPE_SAME=3,
-  AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4};
-
-enum {
-  // New 2.6 explicitly defined cache hints.
-  AVS_CACHE_NOTHING=10, // Do not cache video.
-  AVS_CACHE_WINDOW=11, // Hard protect upto X frames within a range of X from the current frame N.
-  AVS_CACHE_GENERIC=12, // LRU cache upto X frames.
-  AVS_CACHE_FORCE_GENERIC=13, // LRU cache upto X frames, override any previous CACHE_WINDOW.
-
-  AVS_CACHE_GET_POLICY=30, // Get the current policy.
-  AVS_CACHE_GET_WINDOW=31, // Get the current window h_span.
-  AVS_CACHE_GET_RANGE=32, // Get the current generic frame range.
-
-  AVS_CACHE_AUDIO=50, // Explicitly do cache audio, X byte cache.
-  AVS_CACHE_AUDIO_NOTHING=51, // Explicitly do not cache audio.
-  AVS_CACHE_AUDIO_NONE=52, // Audio cache off (auto mode), X byte intial cache.
-  AVS_CACHE_AUDIO_AUTO=53, // Audio cache on (auto mode), X byte intial cache.
-
-  AVS_CACHE_GET_AUDIO_POLICY=70, // Get the current audio policy.
-  AVS_CACHE_GET_AUDIO_SIZE=71, // Get the current audio cache size.
-
-  AVS_CACHE_PREFETCH_FRAME=100, // Queue request to prefetch frame N.
-  AVS_CACHE_PREFETCH_GO=101, // Action video prefetches.
-
-  AVS_CACHE_PREFETCH_AUDIO_BEGIN=120, // Begin queue request transaction to prefetch audio (take critical section).
-  AVS_CACHE_PREFETCH_AUDIO_STARTLO=121, // Set low 32 bits of start.
-  AVS_CACHE_PREFETCH_AUDIO_STARTHI=122, // Set high 32 bits of start.
-  AVS_CACHE_PREFETCH_AUDIO_COUNT=123, // Set low 32 bits of length.
-  AVS_CACHE_PREFETCH_AUDIO_COMMIT=124, // Enqueue request transaction to prefetch audio (release critical section).
-  AVS_CACHE_PREFETCH_AUDIO_GO=125, // Action audio prefetches.
-
-  AVS_CACHE_GETCHILD_CACHE_MODE=200, // Cache ask Child for desired video cache mode.
-  AVS_CACHE_GETCHILD_CACHE_SIZE=201, // Cache ask Child for desired video cache size.
-  AVS_CACHE_GETCHILD_AUDIO_MODE=202, // Cache ask Child for desired audio cache mode.
-  AVS_CACHE_GETCHILD_AUDIO_SIZE=203, // Cache ask Child for desired audio cache size.
-
-  AVS_CACHE_GETCHILD_COST=220, // Cache ask Child for estimated processing cost.
-    AVS_CACHE_COST_ZERO=221, // Child response of zero cost (ptr arithmetic only).
-    AVS_CACHE_COST_UNIT=222, // Child response of unit cost (less than or equal 1 full frame blit).
-    AVS_CACHE_COST_LOW=223, // Child response of light cost. (Fast)
-    AVS_CACHE_COST_MED=224, // Child response of medium cost. (Real time)
-    AVS_CACHE_COST_HI=225, // Child response of heavy cost. (Slow)
-
-  AVS_CACHE_GETCHILD_THREAD_MODE=240, // Cache ask Child for thread safetyness.
-    AVS_CACHE_THREAD_UNSAFE=241, // Only 1 thread allowed for all instances. 2.5 filters default!
-    AVS_CACHE_THREAD_CLASS=242, // Only 1 thread allowed for each instance. 2.6 filters default!
-    AVS_CACHE_THREAD_SAFE=243, //  Allow all threads in any instance.
-    AVS_CACHE_THREAD_OWN=244, // Safe but limit to 1 thread, internally threaded.
-
-  AVS_CACHE_GETCHILD_ACCESS_COST=260, // Cache ask Child for preferred access pattern.
-    AVS_CACHE_ACCESS_RAND=261, // Filter is access order agnostic.
-    AVS_CACHE_ACCESS_SEQ0=262, // Filter prefers sequential access (low cost)
-    AVS_CACHE_ACCESS_SEQ1=263, // Filter needs sequential access (high cost)
-  };
-
-#define AVS_FRAME_ALIGN 16
-
-typedef struct AVS_Clip AVS_Clip;
-typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment;
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_VideoInfo
-//
-
-// AVS_VideoInfo is layed out identicly to VideoInfo
-typedef struct AVS_VideoInfo {
-  int width, height;    // width=0 means no video
-  unsigned fps_numerator, fps_denominator;
-  int num_frames;
-
-  int pixel_type;
-
-  int audio_samples_per_second;   // 0 means no audio
-  int sample_type;
-  INT64 num_audio_samples;
-  int nchannels;
-
-  // Imagetype properties
-
-  int image_type;
-} AVS_VideoInfo;
-
-// useful functions of the above
-AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p)
-        { return (p->width!=0); }
-
-AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p)
-        { return (p->audio_samples_per_second!=0); }
-
-AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p)
-        { return !!(p->pixel_type&AVS_CS_BGR); }
-
-AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p)
-        { return ((p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24) && ((p->pixel_type & AVS_CS_SAMPLE_BITS_MASK) == AVS_CS_SAMPLE_BITS_8); }
-
-AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p)
-        { return ((p->pixel_type&AVS_CS_BGR32)==AVS_CS_BGR32) && ((p->pixel_type & AVS_CS_SAMPLE_BITS_MASK) == AVS_CS_SAMPLE_BITS_8); }
-
-AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p)
-        { return !!(p->pixel_type&AVS_CS_YUV ); }
-
-AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p)
-        { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; }
-
-AVSC_API(int, avs_is_yv24)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yv16)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yv12)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yv411)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_y8)(const AVS_VideoInfo * p);
-
-#ifdef AVSC_NO_DECLSPEC
-AVSC_INLINE int avs_is_yv24(const AVS_VideoInfo * p)
-        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV24  & AVS_CS_PLANAR_FILTER); }
-
-AVSC_INLINE int avs_is_yv16(const AVS_VideoInfo * p)
-        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV16  & AVS_CS_PLANAR_FILTER); }
-
-AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p)
-        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV12  & AVS_CS_PLANAR_FILTER); }
-
-AVSC_INLINE int avs_is_yv411(const AVS_VideoInfo * p)
-        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV411 & AVS_CS_PLANAR_FILTER); }
-
-AVSC_INLINE int avs_is_y8(const AVS_VideoInfo * p)
-        { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_Y8    & AVS_CS_PLANAR_FILTER); }
-#endif
-
-#if 1 // AviSynth+ extension
-AVSC_API(int, avs_is_rgb48)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_rgb64)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuv444p16)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuv422p16)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuv420p16)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_y16)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuv444ps)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuv422ps)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuv420ps)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_y32)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_444)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_422)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_420)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_y)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_yuva)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_planar_rgb)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_is_planar_rgba)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_num_components)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_component_size)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_bits_per_component)(const AVS_VideoInfo * p);
-#endif
-
-AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property)
-        { return ((p->image_type & property)==property ); }
-
-AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p)
-        { return !!(p->pixel_type & AVS_CS_PLANAR); }
-
-AVSC_API(int, avs_is_color_space)(const AVS_VideoInfo * p, int c_space);
-
-AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p)
-        { return !!(p->image_type & AVS_IT_FIELDBASED); }
-
-AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p)
-        { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); }
-
-AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p)
-        { return !!(p->image_type & AVS_IT_BFF); }
-
-AVSC_INLINE int avs_is_tff(const AVS_VideoInfo * p)
-        { return !!(p->image_type & AVS_IT_TFF); }
-
-AVSC_API(int, avs_get_plane_width_subsampling)(const AVS_VideoInfo * p, int plane);
-
-AVSC_API(int, avs_get_plane_height_subsampling)(const AVS_VideoInfo * p, int plane);
-
-
-AVSC_API(int, avs_bits_per_pixel)(const AVS_VideoInfo * p);
-
-AVSC_API(int, avs_bytes_from_pixels)(const AVS_VideoInfo * p, int pixels);
-
-AVSC_API(int, avs_row_size_p)(const AVS_VideoInfo * p, int plane);
-
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p)
-        { return avs_row_size_p(p, 0); }
-#endif
-
-AVSC_API(int, avs_bmp_size)(const AVS_VideoInfo * vi);
-
-AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p)
-        { return p->audio_samples_per_second; }
-
-
-AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p)
-{
-    switch (p->sample_type) {
-      case AVS_SAMPLE_INT8:  return sizeof(signed char);
-      case AVS_SAMPLE_INT16: return sizeof(signed short);
-      case AVS_SAMPLE_INT24: return 3;
-      case AVS_SAMPLE_INT32: return sizeof(signed int);
-      case AVS_SAMPLE_FLOAT: return sizeof(float);
-      default: return 0;
-    }
-}
-AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p)
-        { return p->nchannels*avs_bytes_per_channel_sample(p);}
-
-AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames)
-        { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); }
-
-AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples)
-        { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); }
-
-AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes)
-        { return bytes / avs_bytes_per_audio_sample(p); }
-
-AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples)
-        { return samples * avs_bytes_per_audio_sample(p); }
-
-AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p)
-        { return p->nchannels; }
-
-AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p)
-        { return p->sample_type;}
-
-// useful mutator
-AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property)
-        { p->image_type|=property; }
-
-AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property)
-        { p->image_type&=~property; }
-
-AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased)
-        { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; }
-
-AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator)
-{
-    unsigned x=numerator, y=denominator;
-    while (y) {   // find gcd
-      unsigned t = x%y; x = y; y = t;
-    }
-    p->fps_numerator = numerator/x;
-    p->fps_denominator = denominator/x;
-}
-
-AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y)
-{
-        return (x->pixel_type == y->pixel_type)
-                || (avs_is_yv12(x) && avs_is_yv12(y));
-}
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_VideoFrame
-//
-
-// VideoFrameBuffer holds information about a memory block which is used
-// for video data.  For efficiency, instances of this class are not deleted
-// when the refcount reaches zero; instead they're stored in a linked list
-// to be reused.  The instances are deleted when the corresponding AVS
-// file is closed.
-
-// AVS_VideoFrameBuffer is layed out identicly to VideoFrameBuffer
-// DO NOT USE THIS STRUCTURE DIRECTLY
-typedef struct AVS_VideoFrameBuffer {
-  BYTE * data;
-  int data_size;
-  // sequence_number is incremented every time the buffer is changed, so
-  // that stale views can tell they're no longer valid.
-  volatile long sequence_number;
-
-  volatile long refcount;
-} AVS_VideoFrameBuffer;
-
-// VideoFrame holds a "window" into a VideoFrameBuffer.
-
-// AVS_VideoFrame is layed out identicly to IVideoFrame
-// DO NOT USE THIS STRUCTURE DIRECTLY
-typedef struct AVS_VideoFrame {
-  volatile long refcount;
-  AVS_VideoFrameBuffer * vfb;
-  int offset, pitch, row_size, height, offsetU, offsetV, pitchUV;  // U&V offsets are from top of picture.
-  int row_sizeUV, heightUV;
-} AVS_VideoFrame;
-
-// Access functions for AVS_VideoFrame
-AVSC_API(int, avs_get_pitch_p)(const AVS_VideoFrame * p, int plane);
-
-#ifdef AVSC_NO_DECLSPEC
-AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) {
-        switch (plane) {
-          case AVS_PLANAR_U:
-          case AVS_PLANAR_V:
-            return p->pitchUV;
-        }
-        return p->pitch;
-}
-#endif
-
-AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) {
-        return avs_get_pitch_p(p, 0);}
-
-AVSC_API(int, avs_get_row_size_p)(const AVS_VideoFrame * p, int plane);
-
-AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) {
-        return p->row_size; }
-
-AVSC_API(int, avs_get_height_p)(const AVS_VideoFrame * p, int plane);
-
-AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) {
-        return p->height;}
-
-AVSC_API(const BYTE *, avs_get_read_ptr_p)(const AVS_VideoFrame * p, int plane);
-
-#ifdef AVSC_NO_DECLSPEC
-AVSC_INLINE const BYTE* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) {
-        switch (plane) {
-          case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
-          case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
-          default:           return p->vfb->data + p->offset;
-        }
-}
-#endif
-
-AVSC_INLINE const BYTE* avs_get_read_ptr(const AVS_VideoFrame * p) {
-        return avs_get_read_ptr_p(p, 0);}
-
-AVSC_API(int, avs_is_writable)(const AVS_VideoFrame * p);
-
-AVSC_API(BYTE *, avs_get_write_ptr_p)(const AVS_VideoFrame * p, int plane);
-
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE BYTE* avs_get_write_ptr(const AVS_VideoFrame * p) {
-        return avs_get_write_ptr_p(p, 0);}
-#endif
-
-AVSC_API(void, avs_release_video_frame)(AVS_VideoFrame *);
-// makes a shallow copy of a video frame
-AVSC_API(AVS_VideoFrame *, avs_copy_video_frame)(AVS_VideoFrame *);
-
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f)
-  {avs_release_video_frame(f);}
-AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f)
-  {return avs_copy_video_frame(f);}
-#endif
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_Value
-//
-
-// Treat AVS_Value as a fat pointer.  That is use avs_copy_value
-// and avs_release_value appropiaty as you would if AVS_Value was
-// a pointer.
-
-// To maintain source code compatibility with future versions of the
-// avisynth_c API don't use the AVS_Value directly.  Use the helper
-// functions below.
-
-// AVS_Value is layed out identicly to AVSValue
-typedef struct AVS_Value AVS_Value;
-struct AVS_Value {
-  short type;  // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong
-               // for some function e'rror
-  short array_size;
-  union {
-    void * clip; // do not use directly, use avs_take_clip
-    char boolean;
-    int integer;
-    float floating_pt;
-    const char * string;
-    const AVS_Value * array;
-  } d;
-};
-
-// AVS_Value should be initilized with avs_void.
-// Should also set to avs_void after the value is released
-// with avs_copy_value.  Consider it the equalvent of setting
-// a pointer to NULL
-static const AVS_Value avs_void = {'v'};
-
-AVSC_API(void, avs_copy_value)(AVS_Value * dest, AVS_Value src);
-AVSC_API(void, avs_release_value)(AVS_Value);
-
-AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; }
-AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; }
-AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; }
-AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; }
-AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; }
-AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; }
-AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; }
-AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; }
-
-AVSC_API(AVS_Clip *, avs_take_clip)(AVS_Value, AVS_ScriptEnvironment *);
-AVSC_API(void, avs_set_to_clip)(AVS_Value *, AVS_Clip *);
-
-AVSC_INLINE int avs_as_bool(AVS_Value v)
-        { return v.d.boolean; }
-AVSC_INLINE int avs_as_int(AVS_Value v)
-        { return v.d.integer; }
-AVSC_INLINE const char * avs_as_string(AVS_Value v)
-        { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; }
-AVSC_INLINE double avs_as_float(AVS_Value v)
-        { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; }
-AVSC_INLINE const char * avs_as_error(AVS_Value v)
-        { return avs_is_error(v) ? v.d.string : 0; }
-AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v)
-        { return v.d.array; }
-AVSC_INLINE int avs_array_size(AVS_Value v)
-        { return avs_is_array(v) ? v.array_size : 1; }
-AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index)
-        { return avs_is_array(v) ? v.d.array[index] : v; }
-
-// only use these functions on an AVS_Value that does not already have
-// an active value.  Remember, treat AVS_Value as a fat pointer.
-AVSC_INLINE AVS_Value avs_new_value_bool(int v0)
-        { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 0 : 1; return v; }
-AVSC_INLINE AVS_Value avs_new_value_int(int v0)
-        { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; }
-AVSC_INLINE AVS_Value avs_new_value_string(const char * v0)
-        { AVS_Value v; v.type = 's'; v.d.string = v0; return v; }
-AVSC_INLINE AVS_Value avs_new_value_float(float v0)
-        { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;}
-AVSC_INLINE AVS_Value avs_new_value_error(const char * v0)
-        { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; }
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0)
-        { AVS_Value v; avs_set_to_clip(&v, v0); return v; }
-#endif
-AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size)
-        { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = (short)size; return v; }
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_Clip
-//
-
-AVSC_API(void, avs_release_clip)(AVS_Clip *);
-AVSC_API(AVS_Clip *, avs_copy_clip)(AVS_Clip *);
-
-AVSC_API(const char *, avs_clip_get_error)(AVS_Clip *); // return 0 if no error
-
-AVSC_API(const AVS_VideoInfo *, avs_get_video_info)(AVS_Clip *);
-
-AVSC_API(int, avs_get_version)(AVS_Clip *);
-
-AVSC_API(AVS_VideoFrame *, avs_get_frame)(AVS_Clip *, int n);
-// The returned video frame must be released with avs_release_video_frame
-
-AVSC_API(int, avs_get_parity)(AVS_Clip *, int n);
-// return field parity if field_based, else parity of first field in frame
-
-AVSC_API(int, avs_get_audio)(AVS_Clip *, void * buf,
-                             INT64 start, INT64 count);
-// start and count are in samples
-
-AVSC_API(int, avs_set_cache_hints)(AVS_Clip *,
-                                   int cachehints, int frame_range);
-
-// This is the callback type used by avs_add_function
-typedef AVS_Value (AVSC_CC * AVS_ApplyFunc)
-                        (AVS_ScriptEnvironment *, AVS_Value args, void * user_data);
-
-typedef struct AVS_FilterInfo AVS_FilterInfo;
-struct AVS_FilterInfo
-{
-  // these members should not be modified outside of the AVS_ApplyFunc callback
-  AVS_Clip * child;
-  AVS_VideoInfo vi;
-  AVS_ScriptEnvironment * env;
-  AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n);
-  int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n);
-  int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf,
-                                  INT64 start, INT64 count);
-  int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints,
-                                        int frame_range);
-  void (AVSC_CC * free_filter)(AVS_FilterInfo *);
-
-  // Should be set when ever there is an error to report.
-  // It is cleared before any of the above methods are called
-  const char * error;
-  // this is to store whatever and may be modified at will
-  void * user_data;
-};
-
-// Create a new filter
-// fi is set to point to the AVS_FilterInfo so that you can
-//   modify it once it is initilized.
-// store_child should generally be set to true.  If it is not
-//    set than ALL methods (the function pointers) must be defined
-// If it is set than you do not need to worry about freeing the child
-//    clip.
-AVSC_API(AVS_Clip *, avs_new_c_filter)(AVS_ScriptEnvironment * e,
-                                       AVS_FilterInfo * * fi,
-                                       AVS_Value child, int store_child);
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_ScriptEnvironment
-//
-
-// For GetCPUFlags.  These are backwards-compatible with those in VirtualDub.
-enum {
-                                /* slowest CPU to support extension */
-  AVS_CPU_FORCE        = 0x01,   // N/A
-  AVS_CPU_FPU          = 0x02,   // 386/486DX
-  AVS_CPU_MMX          = 0x04,   // P55C, K6, PII
-  AVS_CPU_INTEGER_SSE  = 0x08,   // PIII, Athlon
-  AVS_CPU_SSE          = 0x10,   // PIII, Athlon XP/MP
-  AVS_CPU_SSE2         = 0x20,   // PIV, Hammer
-  AVS_CPU_3DNOW        = 0x40,   // K6-2
-  AVS_CPU_3DNOW_EXT    = 0x80,   // Athlon
-  AVS_CPU_X86_64       = 0xA0,   // Hammer (note: equiv. to 3DNow + SSE2,
-                                 // which only Hammer will have anyway)
-  AVS_CPUF_SSE3       = 0x100,   //  PIV+, K8 Venice
-  AVS_CPUF_SSSE3      = 0x200,   //  Core 2
-  AVS_CPUF_SSE4       = 0x400,   //  Penryn, Wolfdale, Yorkfield
-  AVS_CPUF_SSE4_1     = 0x400,
-//AVS_CPUF_AVX        = 0x800,   //  Sandy Bridge, Bulldozer
-  AVS_CPUF_SSE4_2    = 0x1000,   //  Nehalem
-//AVS_CPUF_AVX2      = 0x2000,   //  Haswell
-//AVS_CPUF_AVX512    = 0x4000,   //  Knights Landing
-};
-
-
-AVSC_API(const char *, avs_get_error)(AVS_ScriptEnvironment *); // return 0 if no error
-
-AVSC_API(long, avs_get_cpu_flags)(AVS_ScriptEnvironment *);
-AVSC_API(int, avs_check_version)(AVS_ScriptEnvironment *, int version);
-
-AVSC_API(char *, avs_save_string)(AVS_ScriptEnvironment *, const char* s, int length);
-AVSC_API(char *, avs_sprintf)(AVS_ScriptEnvironment *, const char * fmt, ...);
-
-AVSC_API(char *, avs_vsprintf)(AVS_ScriptEnvironment *, const char * fmt, void* val);
- // note: val is really a va_list; I hope everyone typedefs va_list to a pointer
-
-AVSC_API(int, avs_add_function)(AVS_ScriptEnvironment *,
-                                const char * name, const char * params,
-                                AVS_ApplyFunc apply, void * user_data);
-
-AVSC_API(int, avs_function_exists)(AVS_ScriptEnvironment *, const char * name);
-
-AVSC_API(AVS_Value, avs_invoke)(AVS_ScriptEnvironment *, const char * name,
-                               AVS_Value args, const char** arg_names);
-// The returned value must be be released with avs_release_value
-
-AVSC_API(AVS_Value, avs_get_var)(AVS_ScriptEnvironment *, const char* name);
-// The returned value must be be released with avs_release_value
-
-AVSC_API(int, avs_set_var)(AVS_ScriptEnvironment *, const char* name, AVS_Value val);
-
-AVSC_API(int, avs_set_global_var)(AVS_ScriptEnvironment *, const char* name, const AVS_Value val);
-
-//void avs_push_context(AVS_ScriptEnvironment *, int level=0);
-//void avs_pop_context(AVS_ScriptEnvironment *);
-
-AVSC_API(AVS_VideoFrame *, avs_new_video_frame_a)(AVS_ScriptEnvironment *,
-                                          const AVS_VideoInfo * vi, int align);
-// align should be at least 16
-
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE
-AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env,
-                                     const AVS_VideoInfo * vi)
-  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
-
-AVSC_INLINE
-AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env,
-                               const AVS_VideoInfo * vi)
-  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
-#endif
-
-
-AVSC_API(int, avs_make_writable)(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf);
-
-AVSC_API(void, avs_bit_blt)(AVS_ScriptEnvironment *, BYTE* dstp, int dst_pitch, const BYTE* srcp, int src_pitch, int row_size, int height);
-
-typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env);
-AVSC_API(void, avs_at_exit)(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data);
-
-AVSC_API(AVS_VideoFrame *, avs_subframe)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height);
-// The returned video frame must be be released
-
-AVSC_API(int, avs_set_memory_max)(AVS_ScriptEnvironment *, int mem);
-
-AVSC_API(int, avs_set_working_dir)(AVS_ScriptEnvironment *, const char * newdir);
-
-// avisynth.dll exports this; it's a way to use it as a library, without
-// writing an AVS script or without going through AVIFile.
-AVSC_API(AVS_ScriptEnvironment *, avs_create_script_environment)(int version);
-
-// this symbol is the entry point for the plugin and must
-// be defined
-AVSC_EXPORT
-const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env);
-
-
-AVSC_API(void, avs_delete_script_environment)(AVS_ScriptEnvironment *);
-
-
-AVSC_API(AVS_VideoFrame *, avs_subframe_planar)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV);
-// The returned video frame must be be released
-
-#ifdef AVSC_NO_DECLSPEC
-// use LoadLibrary and related functions to dynamically load Avisynth instead of declspec(dllimport)
-/*
-  The following functions needs to have been declared, probably from windows.h
-
-  void* malloc(size_t)
-  void free(void*);
-
-  HMODULE LoadLibrary(const char*);
-  void* GetProcAddress(HMODULE, const char*);
-  FreeLibrary(HMODULE);
-*/
-
-
-typedef struct AVS_Library AVS_Library;
-
-#define AVSC_DECLARE_FUNC(name) name##_func name
-
-struct AVS_Library {
-  HMODULE handle;
-
-  AVSC_DECLARE_FUNC(avs_add_function);
-  AVSC_DECLARE_FUNC(avs_at_exit);
-  AVSC_DECLARE_FUNC(avs_bit_blt);
-  AVSC_DECLARE_FUNC(avs_check_version);
-  AVSC_DECLARE_FUNC(avs_clip_get_error);
-  AVSC_DECLARE_FUNC(avs_copy_clip);
-  AVSC_DECLARE_FUNC(avs_copy_value);
-  AVSC_DECLARE_FUNC(avs_copy_video_frame);
-  AVSC_DECLARE_FUNC(avs_create_script_environment);
-  AVSC_DECLARE_FUNC(avs_delete_script_environment);
-  AVSC_DECLARE_FUNC(avs_function_exists);
-  AVSC_DECLARE_FUNC(avs_get_audio);
-  AVSC_DECLARE_FUNC(avs_get_cpu_flags);
-  AVSC_DECLARE_FUNC(avs_get_frame);
-  AVSC_DECLARE_FUNC(avs_get_parity);
-  AVSC_DECLARE_FUNC(avs_get_var);
-  AVSC_DECLARE_FUNC(avs_get_version);
-  AVSC_DECLARE_FUNC(avs_get_video_info);
-  AVSC_DECLARE_FUNC(avs_invoke);
-  AVSC_DECLARE_FUNC(avs_make_writable);
-  AVSC_DECLARE_FUNC(avs_new_c_filter);
-  AVSC_DECLARE_FUNC(avs_new_video_frame_a);
-  AVSC_DECLARE_FUNC(avs_release_clip);
-  AVSC_DECLARE_FUNC(avs_release_value);
-  AVSC_DECLARE_FUNC(avs_release_video_frame);
-  AVSC_DECLARE_FUNC(avs_save_string);
-  AVSC_DECLARE_FUNC(avs_set_cache_hints);
-  AVSC_DECLARE_FUNC(avs_set_global_var);
-  AVSC_DECLARE_FUNC(avs_set_memory_max);
-  AVSC_DECLARE_FUNC(avs_set_to_clip);
-  AVSC_DECLARE_FUNC(avs_set_var);
-  AVSC_DECLARE_FUNC(avs_set_working_dir);
-  AVSC_DECLARE_FUNC(avs_sprintf);
-  AVSC_DECLARE_FUNC(avs_subframe);
-  AVSC_DECLARE_FUNC(avs_subframe_planar);
-  AVSC_DECLARE_FUNC(avs_take_clip);
-  AVSC_DECLARE_FUNC(avs_vsprintf);
-
-  AVSC_DECLARE_FUNC(avs_get_error);
-  AVSC_DECLARE_FUNC(avs_is_yv24);
-  AVSC_DECLARE_FUNC(avs_is_yv16);
-  AVSC_DECLARE_FUNC(avs_is_yv12);
-  AVSC_DECLARE_FUNC(avs_is_yv411);
-  AVSC_DECLARE_FUNC(avs_is_y8);
-  AVSC_DECLARE_FUNC(avs_is_color_space);
-
-  AVSC_DECLARE_FUNC(avs_get_plane_width_subsampling);
-  AVSC_DECLARE_FUNC(avs_get_plane_height_subsampling);
-  AVSC_DECLARE_FUNC(avs_bits_per_pixel);
-  AVSC_DECLARE_FUNC(avs_bytes_from_pixels);
-  AVSC_DECLARE_FUNC(avs_row_size_p);
-  AVSC_DECLARE_FUNC(avs_bmp_size);
-  AVSC_DECLARE_FUNC(avs_get_pitch_p);
-  AVSC_DECLARE_FUNC(avs_get_row_size_p);
-  AVSC_DECLARE_FUNC(avs_get_height_p);
-  AVSC_DECLARE_FUNC(avs_get_read_ptr_p);
-  AVSC_DECLARE_FUNC(avs_is_writable);
-  AVSC_DECLARE_FUNC(avs_get_write_ptr_p);
-};
-
-#undef AVSC_DECLARE_FUNC
-
-
-AVSC_INLINE AVS_Library * avs_load_library() {
-  AVS_Library *library = (AVS_Library *)malloc(sizeof(AVS_Library));
-  if (library == NULL)
-    return NULL;
-  library->handle = LoadLibrary("avisynth");
-  if (library->handle == NULL)
-    goto fail;
-
-#define __AVSC_STRINGIFY(x) #x
-#define AVSC_STRINGIFY(x) __AVSC_STRINGIFY(x)
-#define AVSC_LOAD_FUNC(name) {\
-  library->name = (name##_func) GetProcAddress(library->handle, AVSC_STRINGIFY(name));\
-  if (library->name == NULL)\
-    goto fail;\
-}
-
-  AVSC_LOAD_FUNC(avs_add_function);
-  AVSC_LOAD_FUNC(avs_at_exit);
-  AVSC_LOAD_FUNC(avs_bit_blt);
-  AVSC_LOAD_FUNC(avs_check_version);
-  AVSC_LOAD_FUNC(avs_clip_get_error);
-  AVSC_LOAD_FUNC(avs_copy_clip);
-  AVSC_LOAD_FUNC(avs_copy_value);
-  AVSC_LOAD_FUNC(avs_copy_video_frame);
-  AVSC_LOAD_FUNC(avs_create_script_environment);
-  AVSC_LOAD_FUNC(avs_delete_script_environment);
-  AVSC_LOAD_FUNC(avs_function_exists);
-  AVSC_LOAD_FUNC(avs_get_audio);
-  AVSC_LOAD_FUNC(avs_get_cpu_flags);
-  AVSC_LOAD_FUNC(avs_get_frame);
-  AVSC_LOAD_FUNC(avs_get_parity);
-  AVSC_LOAD_FUNC(avs_get_var);
-  AVSC_LOAD_FUNC(avs_get_version);
-  AVSC_LOAD_FUNC(avs_get_video_info);
-  AVSC_LOAD_FUNC(avs_invoke);
-  AVSC_LOAD_FUNC(avs_make_writable);
-  AVSC_LOAD_FUNC(avs_new_c_filter);
-  AVSC_LOAD_FUNC(avs_new_video_frame_a);
-  AVSC_LOAD_FUNC(avs_release_clip);
-  AVSC_LOAD_FUNC(avs_release_value);
-  AVSC_LOAD_FUNC(avs_release_video_frame);
-  AVSC_LOAD_FUNC(avs_save_string);
-  AVSC_LOAD_FUNC(avs_set_cache_hints);
-  AVSC_LOAD_FUNC(avs_set_global_var);
-  AVSC_LOAD_FUNC(avs_set_memory_max);
-  AVSC_LOAD_FUNC(avs_set_to_clip);
-  AVSC_LOAD_FUNC(avs_set_var);
-  AVSC_LOAD_FUNC(avs_set_working_dir);
-  AVSC_LOAD_FUNC(avs_sprintf);
-  AVSC_LOAD_FUNC(avs_subframe);
-  AVSC_LOAD_FUNC(avs_subframe_planar);
-  AVSC_LOAD_FUNC(avs_take_clip);
-  AVSC_LOAD_FUNC(avs_vsprintf);
-
-  AVSC_LOAD_FUNC(avs_get_error);
-  AVSC_LOAD_FUNC(avs_is_yv24);
-  AVSC_LOAD_FUNC(avs_is_yv16);
-  AVSC_LOAD_FUNC(avs_is_yv12);
-  AVSC_LOAD_FUNC(avs_is_yv411);
-  AVSC_LOAD_FUNC(avs_is_y8);
-  AVSC_LOAD_FUNC(avs_is_color_space);
-
-  AVSC_LOAD_FUNC(avs_get_plane_width_subsampling);
-  AVSC_LOAD_FUNC(avs_get_plane_height_subsampling);
-  AVSC_LOAD_FUNC(avs_bits_per_pixel);
-  AVSC_LOAD_FUNC(avs_bytes_from_pixels);
-  AVSC_LOAD_FUNC(avs_row_size_p);
-  AVSC_LOAD_FUNC(avs_bmp_size);
-  AVSC_LOAD_FUNC(avs_get_pitch_p);
-  AVSC_LOAD_FUNC(avs_get_row_size_p);
-  AVSC_LOAD_FUNC(avs_get_height_p);
-  AVSC_LOAD_FUNC(avs_get_read_ptr_p);
-  AVSC_LOAD_FUNC(avs_is_writable);
-  AVSC_LOAD_FUNC(avs_get_write_ptr_p);
-
-#undef __AVSC_STRINGIFY
-#undef AVSC_STRINGIFY
-#undef AVSC_LOAD_FUNC
-
-  return library;
-
-fail:
-  free(library);
-  return NULL;
-}
-
-AVSC_INLINE void avs_free_library(AVS_Library *library) {
-  if (library == NULL)
-    return;
-  FreeLibrary(library->handle);
-  free(library);
-}
-#endif
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/extras/avxsynth_c.h b/android/src/main/libenc/jni/libx264/extras/avxsynth_c.h
deleted file mode 100755
index a899ca8..0000000
--- a/android/src/main/libenc/jni/libx264/extras/avxsynth_c.h
+++ /dev/null
@@ -1,725 +0,0 @@
-// Avisynth C Interface Version 0.20
-// Copyright 2003 Kevin Atkinson
-
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
-// http://www.gnu.org/copyleft/gpl.html .
-//
-// As a special exception, I give you permission to link to the
-// Avisynth C interface with independent modules that communicate with
-// the Avisynth C interface solely through the interfaces defined in
-// avisynth_c.h, regardless of the license terms of these independent
-// modules, and to copy and distribute the resulting combined work
-// under terms of your choice, provided that every copy of the
-// combined work is accompanied by a complete copy of the source code
-// of the Avisynth C interface and Avisynth itself (with the version
-// used to produce the combined work), being distributed under the
-// terms of the GNU General Public License plus this exception.  An
-// independent module is a module which is not derived from or based
-// on Avisynth C Interface, such as 3rd-party filters, import and
-// export plugins, or graphical user interfaces.
-
-#ifndef __AVXSYNTH_C__
-#define __AVXSYNTH_C__
-  
-#include <stdarg.h>
-#include <stdint.h>
-
-typedef int64_t INT64;
-#define __stdcall
-#define __declspec(x)
-
-#ifdef __cplusplus
-#  define EXTERN_C extern "C"
-#else
-#  define EXTERN_C
-#endif
-
-#define AVSC_USE_STDCALL 1
-
-#ifndef AVSC_USE_STDCALL
-#  define AVSC_CC __cdecl
-#else
-#  define AVSC_CC __stdcall
-#endif
-
-#define AVSC_INLINE static __inline
-
-#ifdef AVISYNTH_C_EXPORTS
-#  define AVSC_EXPORT EXTERN_C
-#  define AVSC_API(ret, name) EXTERN_C __declspec(dllexport) ret AVSC_CC name
-#else
-#  define AVSC_EXPORT EXTERN_C __declspec(dllexport)
-#  ifndef AVSC_NO_DECLSPEC
-#    define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name
-#  else
-#    define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func)
-#  endif
-#endif
-
-
-/////////////////////////////////////////////////////////////////////
-//
-// Constants
-//
-
-#ifndef __AVXSYNTH_H__
-enum { AVISYNTH_INTERFACE_VERSION = 3 };
-#endif
-
-enum {AVS_SAMPLE_INT8  = 1<<0,
-      AVS_SAMPLE_INT16 = 1<<1, 
-      AVS_SAMPLE_INT24 = 1<<2,
-      AVS_SAMPLE_INT32 = 1<<3,
-      AVS_SAMPLE_FLOAT = 1<<4};
-
-enum {AVS_PLANAR_Y=1<<0,
-      AVS_PLANAR_U=1<<1,
-      AVS_PLANAR_V=1<<2,
-      AVS_PLANAR_ALIGNED=1<<3,
-      AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED,
-      AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED};
-
-  // Colorspace properties.
-enum {AVS_CS_BGR = 1<<28,  
-      AVS_CS_YUV = 1<<29,
-      AVS_CS_INTERLEAVED = 1<<30,
-      AVS_CS_PLANAR = 1<<31};
-
-  // Specific colorformats
-enum {
-  AVS_CS_UNKNOWN = 0,
-  AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
-  AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
-  AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED,
-  AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR,  // y-v-u, planar
-  AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR,  // y-u-v, planar
-  AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR  // same as above
-};
-
-enum {
-  AVS_IT_BFF = 1<<0,
-  AVS_IT_TFF = 1<<1,
-  AVS_IT_FIELDBASED = 1<<2};
-
-enum {
-  AVS_FILTER_TYPE=1,
-  AVS_FILTER_INPUT_COLORSPACE=2,
-  AVS_FILTER_OUTPUT_TYPE=9,
-  AVS_FILTER_NAME=4,
-  AVS_FILTER_AUTHOR=5,
-  AVS_FILTER_VERSION=6,
-  AVS_FILTER_ARGS=7,
-  AVS_FILTER_ARGS_INFO=8,
-  AVS_FILTER_ARGS_DESCRIPTION=10,
-  AVS_FILTER_DESCRIPTION=11};
-
-enum {  //SUBTYPES
-  AVS_FILTER_TYPE_AUDIO=1,
-  AVS_FILTER_TYPE_VIDEO=2,
-  AVS_FILTER_OUTPUT_TYPE_SAME=3,
-  AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4};
-
-enum {
-  AVS_CACHE_NOTHING=0,
-  AVS_CACHE_RANGE=1,
-  AVS_CACHE_ALL=2,
-  AVS_CACHE_AUDIO=3,
-  AVS_CACHE_AUDIO_NONE=4,
-  AVS_CACHE_AUDIO_AUTO=5
-};
-
-#define AVS_FRAME_ALIGN 16 
-
-typedef struct AVS_Clip AVS_Clip;
-typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment;
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_VideoInfo
-//
-
-// AVS_VideoInfo is layed out identicly to VideoInfo
-typedef struct AVS_VideoInfo {
-  int width, height;    // width=0 means no video
-  unsigned fps_numerator, fps_denominator;
-  int num_frames;
-
-  int pixel_type;
-  
-  int audio_samples_per_second;   // 0 means no audio
-  int sample_type;
-  INT64 num_audio_samples;
-  int nchannels;
-
-  // Imagetype properties
-
-  int image_type;
-} AVS_VideoInfo;
-
-// useful functions of the above
-AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) 
-        { return (p->width!=0); }
-
-AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) 
-        { return (p->audio_samples_per_second!=0); }
-
-AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) 
-        { return !!(p->pixel_type&AVS_CS_BGR); }
-
-AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) 
-        { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties
-
-AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) 
-        { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; }
-
-AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) 
-        { return !!(p->pixel_type&AVS_CS_YUV ); }
-
-AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) 
-        { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; }  
-
-AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) 
-        { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); }
-
-AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space) 
-        { return ((p->pixel_type & c_space) == c_space); }
-
-AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property) 
-        { return ((p->pixel_type & property)==property ); }
-
-AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p) 
-        { return !!(p->pixel_type & AVS_CS_PLANAR); }
-        
-AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p) 
-        { return !!(p->image_type & AVS_IT_FIELDBASED); }
-
-AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p) 
-        { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); }
-
-AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p) 
-        { return !!(p->image_type & AVS_IT_BFF); }
-
-AVSC_INLINE int avs_is_tff(const AVS_VideoInfo * p) 
-        { return !!(p->image_type & AVS_IT_TFF); }
-
-AVSC_INLINE int avs_bits_per_pixel(const AVS_VideoInfo * p) 
-{ 
-  switch (p->pixel_type) {
-      case AVS_CS_BGR24: return 24;
-      case AVS_CS_BGR32: return 32;
-      case AVS_CS_YUY2:  return 16;
-      case AVS_CS_YV12:
-      case AVS_CS_I420:  return 12;
-      default:           return 0;
-    }
-}
-AVSC_INLINE int avs_bytes_from_pixels(const AVS_VideoInfo * p, int pixels) 
-        { return pixels * (avs_bits_per_pixel(p)>>3); }   // Will work on planar images, but will return only luma planes
-
-AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p) 
-        { return avs_bytes_from_pixels(p,p->width); }  // Also only returns first plane on planar images
-
-AVSC_INLINE int avs_bmp_size(const AVS_VideoInfo * vi)                
-        { if (avs_is_planar(vi)) {int p = vi->height * ((avs_row_size(vi)+3) & ~3); p+=p>>1; return p;  } return vi->height * ((avs_row_size(vi)+3) & ~3); }
-
-AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p) 
-        { return p->audio_samples_per_second; }
-
-
-AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p) 
-{
-    switch (p->sample_type) {
-      case AVS_SAMPLE_INT8:  return sizeof(signed char);
-      case AVS_SAMPLE_INT16: return sizeof(signed short);
-      case AVS_SAMPLE_INT24: return 3;
-      case AVS_SAMPLE_INT32: return sizeof(signed int);
-      case AVS_SAMPLE_FLOAT: return sizeof(float);
-      default: return 0;
-    }
-}
-AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p)   
-        { return p->nchannels*avs_bytes_per_channel_sample(p);}
-
-AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames) 
-        { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); }
-
-AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) 
-        { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); }
-
-AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes) 
-        { return bytes / avs_bytes_per_audio_sample(p); }
-
-AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) 
-        { return samples * avs_bytes_per_audio_sample(p); }
-
-AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p) 
-        { return p->nchannels; }
-
-AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p)
-        { return p->sample_type;}
-
-// useful mutator
-AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property)  
-        { p->image_type|=property; }
-
-AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property)  
-        { p->image_type&=~property; }
-
-AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased)  
-        { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; }
-
-AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator) 
-{
-    unsigned x=numerator, y=denominator;
-    while (y) {   // find gcd
-      unsigned t = x%y; x = y; y = t;
-    }
-    p->fps_numerator = numerator/x;
-    p->fps_denominator = denominator/x;
-}
-
-AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y)
-{
-        return (x->pixel_type == y->pixel_type)
-                || (avs_is_yv12(x) && avs_is_yv12(y));
-}
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_VideoFrame
-//
-
-// VideoFrameBuffer holds information about a memory block which is used
-// for video data.  For efficiency, instances of this class are not deleted
-// when the refcount reaches zero; instead they're stored in a linked list
-// to be reused.  The instances are deleted when the corresponding AVS
-// file is closed.
-
-// AVS_VideoFrameBuffer is layed out identicly to VideoFrameBuffer
-// DO NOT USE THIS STRUCTURE DIRECTLY
-typedef struct AVS_VideoFrameBuffer {
-  unsigned char * data;
-  int data_size;
-  // sequence_number is incremented every time the buffer is changed, so
-  // that stale views can tell they're no longer valid.
-  long sequence_number;
-
-  long refcount;
-} AVS_VideoFrameBuffer;
-
-// VideoFrame holds a "window" into a VideoFrameBuffer.
-
-// AVS_VideoFrame is layed out identicly to IVideoFrame
-// DO NOT USE THIS STRUCTURE DIRECTLY
-typedef struct AVS_VideoFrame {
-  int refcount;
-  AVS_VideoFrameBuffer * vfb;
-  int offset, pitch, row_size, height, offsetU, offsetV, pitchUV;  // U&V offsets are from top of picture.
-} AVS_VideoFrame;
-
-// Access functions for AVS_VideoFrame
-AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) {
-        return p->pitch;}
-
-AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) { 
-  switch (plane) {
-  case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV;}
-  return p->pitch;}
-
-AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) {
-        return p->row_size; }
-
-AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) { 
-        int r;
-    switch (plane) {
-    case AVS_PLANAR_U: case AVS_PLANAR_V: 
-                if (p->pitchUV) return p->row_size>>1; 
-                else            return 0;
-    case AVS_PLANAR_U_ALIGNED: case AVS_PLANAR_V_ALIGNED: 
-                if (p->pitchUV) { 
-                        r = ((p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)) )>>1; // Aligned rowsize
-                        if (r < p->pitchUV) 
-                                return r; 
-                        return p->row_size>>1; 
-                } else return 0;
-    case AVS_PLANAR_Y_ALIGNED:
-                r = (p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize
-                if (r <= p->pitch) 
-                        return r; 
-                return p->row_size;
-    }
-    return p->row_size;
-}
-
-AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) {
-        return p->height;}
-
-AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) {
-        switch (plane) {
-                case AVS_PLANAR_U: case AVS_PLANAR_V: 
-                        if (p->pitchUV) return p->height>>1;
-                        return 0;
-        }
-        return p->height;}
-
-AVSC_INLINE const unsigned char* avs_get_read_ptr(const AVS_VideoFrame * p) {
-        return p->vfb->data + p->offset;}
-
-AVSC_INLINE const unsigned char* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) 
-{
-        switch (plane) {
-                case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
-                case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
-                default:           return p->vfb->data + p->offset;}
-}
-
-AVSC_INLINE int avs_is_writable(const AVS_VideoFrame * p) {
-        return (p->refcount == 1 && p->vfb->refcount == 1);}
-
-AVSC_INLINE unsigned char* avs_get_write_ptr(const AVS_VideoFrame * p) 
-{
-        if (avs_is_writable(p)) {
-                ++p->vfb->sequence_number;
-                return p->vfb->data + p->offset;
-        } else
-                return 0;
-}
-
-AVSC_INLINE unsigned char* avs_get_write_ptr_p(const AVS_VideoFrame * p, int plane) 
-{
-        if (plane==AVS_PLANAR_Y && avs_is_writable(p)) {
-                ++p->vfb->sequence_number;
-                return p->vfb->data + p->offset;
-        } else if (plane==AVS_PLANAR_Y) {
-                return 0;
-        } else {
-                switch (plane) {
-                        case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
-                        case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
-                        default:       return p->vfb->data + p->offset;
-                }
-        }
-}
-
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(void, avs_release_video_frame)(AVS_VideoFrame *);
-// makes a shallow copy of a video frame
-AVSC_API(AVS_VideoFrame *, avs_copy_video_frame)(AVS_VideoFrame *);
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f)
-  {avs_release_video_frame(f);}
-AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f)
-  {return avs_copy_video_frame(f);}
-#endif
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_Value
-//
-
-// Treat AVS_Value as a fat pointer.  That is use avs_copy_value
-// and avs_release_value appropiaty as you would if AVS_Value was
-// a pointer.
-
-// To maintain source code compatibility with future versions of the
-// avisynth_c API don't use the AVS_Value directly.  Use the helper
-// functions below.
-
-// AVS_Value is layed out identicly to AVSValue
-typedef struct AVS_Value AVS_Value;
-struct AVS_Value {
-  short type;  // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong
-               // for some function e'rror
-  short array_size;
-  union {
-    void * clip; // do not use directly, use avs_take_clip
-    char boolean;
-    int integer;
-    INT64 integer64; // match addition of __int64 to avxplugin.h
-    float floating_pt;
-    const char * string;
-    const AVS_Value * array;
-  } d;
-};
-
-// AVS_Value should be initilized with avs_void.
-// Should also set to avs_void after the value is released
-// with avs_copy_value.  Consider it the equalvent of setting
-// a pointer to NULL
-static const AVS_Value avs_void = {'v'};
-
-AVSC_API(void, avs_copy_value)(AVS_Value * dest, AVS_Value src);
-AVSC_API(void, avs_release_value)(AVS_Value);
-
-AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; }
-AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; }
-AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; }
-AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; }
-AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; }
-AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; }
-AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; }
-AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; }
-
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(AVS_Clip *, avs_take_clip)(AVS_Value, AVS_ScriptEnvironment *);
-AVSC_API(void, avs_set_to_clip)(AVS_Value *, AVS_Clip *);
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-AVSC_INLINE int avs_as_bool(AVS_Value v) 
-        { return v.d.boolean; }   
-AVSC_INLINE int avs_as_int(AVS_Value v) 
-        { return v.d.integer; }   
-AVSC_INLINE const char * avs_as_string(AVS_Value v) 
-        { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; }
-AVSC_INLINE double avs_as_float(AVS_Value v) 
-        { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; }
-AVSC_INLINE const char * avs_as_error(AVS_Value v) 
-        { return avs_is_error(v) ? v.d.string : 0; }
-AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v)
-        { return v.d.array; }
-AVSC_INLINE int avs_array_size(AVS_Value v) 
-        { return avs_is_array(v) ? v.array_size : 1; }
-AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index) 
-        { return avs_is_array(v) ? v.d.array[index] : v; }
-
-// only use these functions on am AVS_Value that does not already have
-// an active value.  Remember, treat AVS_Value as a fat pointer.
-AVSC_INLINE AVS_Value avs_new_value_bool(int v0) 
-        { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 0 : 1; return v; }   
-AVSC_INLINE AVS_Value avs_new_value_int(int v0) 
-        { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; }   
-AVSC_INLINE AVS_Value avs_new_value_string(const char * v0) 
-        { AVS_Value v; v.type = 's'; v.d.string = v0; return v; }
-AVSC_INLINE AVS_Value avs_new_value_float(float v0) 
-        { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;}
-AVSC_INLINE AVS_Value avs_new_value_error(const char * v0) 
-        { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; }
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0)
-        { AVS_Value v; avs_set_to_clip(&v, v0); return v; }
-#endif
-AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size)
-        { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = size; return v; }
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_Clip
-//
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(void, avs_release_clip)(AVS_Clip *);
-AVSC_API(AVS_Clip *, avs_copy_clip)(AVS_Clip *);
-
-AVSC_API(const char *, avs_clip_get_error)(AVS_Clip *); // return 0 if no error
-
-AVSC_API(const AVS_VideoInfo *, avs_get_video_info)(AVS_Clip *);
-
-AVSC_API(int, avs_get_version)(AVS_Clip *);
- 
-AVSC_API(AVS_VideoFrame *, avs_get_frame)(AVS_Clip *, int n);
-// The returned video frame must be released with avs_release_video_frame
-
-AVSC_API(int, avs_get_parity)(AVS_Clip *, int n); 
-// return field parity if field_based, else parity of first field in frame
-
-AVSC_API(int, avs_get_audio)(AVS_Clip *, void * buf, 
-                                  INT64 start, INT64 count); 
-// start and count are in samples
-
-AVSC_API(int, avs_set_cache_hints)(AVS_Clip *, 
-                                        int cachehints, size_t frame_range);
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-// This is the callback type used by avs_add_function
-typedef AVS_Value (AVSC_CC * AVS_ApplyFunc)
-                        (AVS_ScriptEnvironment *, AVS_Value args, void * user_data);
-
-typedef struct AVS_FilterInfo AVS_FilterInfo;
-struct AVS_FilterInfo
-{
-  // these members should not be modified outside of the AVS_ApplyFunc callback
-  AVS_Clip * child;
-  AVS_VideoInfo vi;
-  AVS_ScriptEnvironment * env;
-  AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n);
-  int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n);
-  int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf, 
-				  INT64 start, INT64 count);
-  int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints, 
-					int frame_range);
-  void (AVSC_CC * free_filter)(AVS_FilterInfo *);
-  
-  // Should be set when ever there is an error to report.
-  // It is cleared before any of the above methods are called
-  const char * error;
-  // this is to store whatever and may be modified at will
-  void * user_data;
-};
-
-// Create a new filter
-// fi is set to point to the AVS_FilterInfo so that you can
-//   modify it once it is initilized.
-// store_child should generally be set to true.  If it is not
-//    set than ALL methods (the function pointers) must be defined
-// If it is set than you do not need to worry about freeing the child
-//    clip.
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(AVS_Clip *, avs_new_c_filter)(AVS_ScriptEnvironment * e,
-                                      AVS_FilterInfo * * fi,
-                                      AVS_Value child, int store_child);
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-
-/////////////////////////////////////////////////////////////////////
-//
-// AVS_ScriptEnvironment
-//
-
-// For GetCPUFlags.  These are backwards-compatible with those in VirtualDub.
-enum {                    
-                                /* slowest CPU to support extension */
-  AVS_CPU_FORCE        = 0x01,   // N/A
-  AVS_CPU_FPU          = 0x02,   // 386/486DX
-  AVS_CPU_MMX          = 0x04,   // P55C, K6, PII
-  AVS_CPU_INTEGER_SSE  = 0x08,   // PIII, Athlon
-  AVS_CPU_SSE          = 0x10,   // PIII, Athlon XP/MP
-  AVS_CPU_SSE2         = 0x20,   // PIV, Hammer
-  AVS_CPU_3DNOW        = 0x40,   // K6-2
-  AVS_CPU_3DNOW_EXT    = 0x80,   // Athlon
-  AVS_CPU_X86_64       = 0xA0,   // Hammer (note: equiv. to 3DNow + SSE2, 
-                                 // which only Hammer will have anyway)
-};
-
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(const char *, avs_get_error)(AVS_ScriptEnvironment *); // return 0 if no error
-
-AVSC_API(long, avs_get_cpu_flags)(AVS_ScriptEnvironment *);
-AVSC_API(int, avs_check_version)(AVS_ScriptEnvironment *, int version);
-
-AVSC_API(char *, avs_save_string)(AVS_ScriptEnvironment *, const char* s, int length);
-AVSC_API(char *, avs_sprintf)(AVS_ScriptEnvironment *, const char * fmt, ...);
-
-AVSC_API(char *, avs_vsprintf)(AVS_ScriptEnvironment *, const char * fmt, va_list val);
- // note: val is really a va_list; I hope everyone typedefs va_list to a pointer
-
-AVSC_API(int, avs_add_function)(AVS_ScriptEnvironment *, 
-				     const char * name, const char * params, 
-				     AVS_ApplyFunc apply, void * user_data);
-
-AVSC_API(int, avs_function_exists)(AVS_ScriptEnvironment *, const char * name);
-
-AVSC_API(AVS_Value, avs_invoke)(AVS_ScriptEnvironment *, const char * name, 
-                               AVS_Value args, const char** arg_names);
-// The returned value must be be released with avs_release_value
-
-AVSC_API(AVS_Value, avs_get_var)(AVS_ScriptEnvironment *, const char* name);
-// The returned value must be be released with avs_release_value
-
-AVSC_API(int, avs_set_var)(AVS_ScriptEnvironment *, const char* name, AVS_Value val);
-
-AVSC_API(int, avs_set_global_var)(AVS_ScriptEnvironment *, const char* name, const AVS_Value val);
-
-//void avs_push_context(AVS_ScriptEnvironment *, int level=0);
-//void avs_pop_context(AVS_ScriptEnvironment *);
-
-AVSC_API(AVS_VideoFrame *, avs_new_video_frame_a)(AVS_ScriptEnvironment *, 
-                                          const AVS_VideoInfo * vi, int align);
-// align should be at least 16
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-#ifndef AVSC_NO_DECLSPEC
-AVSC_INLINE 
-AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env, 
-                                     const AVS_VideoInfo * vi)
-  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
-
-AVSC_INLINE 
-AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env, 
-                               const AVS_VideoInfo * vi)
-  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
-#endif
-
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(int, avs_make_writable)(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf);
-
-AVSC_API(void, avs_bit_blt)(AVS_ScriptEnvironment *, unsigned char* dstp, int dst_pitch, const unsigned char* srcp, int src_pitch, int row_size, int height);
-
-typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env);
-AVSC_API(void, avs_at_exit)(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data);
-
-AVSC_API(AVS_VideoFrame *, avs_subframe)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height);
-// The returned video frame must be be released
-
-AVSC_API(int, avs_set_memory_max)(AVS_ScriptEnvironment *, int mem);
-
-AVSC_API(int, avs_set_working_dir)(AVS_ScriptEnvironment *, const char * newdir);
-
-// avisynth.dll exports this; it's a way to use it as a library, without
-// writing an AVS script or without going through AVIFile.
-AVSC_API(AVS_ScriptEnvironment *, avs_create_script_environment)(int version);
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-// this symbol is the entry point for the plugin and must
-// be defined
-AVSC_EXPORT
-const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env);
-
-
-#if defined __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-AVSC_API(void, avs_delete_script_environment)(AVS_ScriptEnvironment *);
-
-
-AVSC_API(AVS_VideoFrame *, avs_subframe_planar)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV);
-// The returned video frame must be be released
-#if defined __cplusplus
-}
-#endif // __cplusplus
-
-#endif //__AVXSYNTH_C__
diff --git a/android/src/main/libenc/jni/libx264/extras/cl.h b/android/src/main/libenc/jni/libx264/extras/cl.h
deleted file mode 100755
index f543257..0000000
--- a/android/src/main/libenc/jni/libx264/extras/cl.h
+++ /dev/null
@@ -1,1209 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- ******************************************************************************/
-
-#ifndef __OPENCL_CL_H
-#define __OPENCL_CL_H
-
-#include "cl_platform.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/******************************************************************************/
-
-typedef struct _cl_platform_id *    cl_platform_id;
-typedef struct _cl_device_id *      cl_device_id;
-typedef struct _cl_context *        cl_context;
-typedef struct _cl_command_queue *  cl_command_queue;
-typedef struct _cl_mem *            cl_mem;
-typedef struct _cl_program *        cl_program;
-typedef struct _cl_kernel *         cl_kernel;
-typedef struct _cl_event *          cl_event;
-typedef struct _cl_sampler *        cl_sampler;
-
-typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
-typedef cl_ulong            cl_bitfield;
-typedef cl_bitfield         cl_device_type;
-typedef cl_uint             cl_platform_info;
-typedef cl_uint             cl_device_info;
-typedef cl_bitfield         cl_device_fp_config;
-typedef cl_uint             cl_device_mem_cache_type;
-typedef cl_uint             cl_device_local_mem_type;
-typedef cl_bitfield         cl_device_exec_capabilities;
-typedef cl_bitfield         cl_command_queue_properties;
-typedef intptr_t            cl_device_partition_property;
-typedef cl_bitfield         cl_device_affinity_domain;
-
-typedef intptr_t            cl_context_properties;
-typedef cl_uint             cl_context_info;
-typedef cl_uint             cl_command_queue_info;
-typedef cl_uint             cl_channel_order;
-typedef cl_uint             cl_channel_type;
-typedef cl_bitfield         cl_mem_flags;
-typedef cl_uint             cl_mem_object_type;
-typedef cl_uint             cl_mem_info;
-typedef cl_bitfield         cl_mem_migration_flags;
-typedef cl_uint             cl_image_info;
-typedef cl_uint             cl_buffer_create_type;
-typedef cl_uint             cl_addressing_mode;
-typedef cl_uint             cl_filter_mode;
-typedef cl_uint             cl_sampler_info;
-typedef cl_bitfield         cl_map_flags;
-typedef cl_uint             cl_program_info;
-typedef cl_uint             cl_program_build_info;
-typedef cl_uint             cl_program_binary_type;
-typedef cl_int              cl_build_status;
-typedef cl_uint             cl_kernel_info;
-typedef cl_uint             cl_kernel_arg_info;
-typedef cl_uint             cl_kernel_arg_address_qualifier;
-typedef cl_uint             cl_kernel_arg_access_qualifier;
-typedef cl_bitfield         cl_kernel_arg_type_qualifier;
-typedef cl_uint             cl_kernel_work_group_info;
-typedef cl_uint             cl_event_info;
-typedef cl_uint             cl_command_type;
-typedef cl_uint             cl_profiling_info;
-
-
-typedef struct _cl_image_format {
-    cl_channel_order        image_channel_order;
-    cl_channel_type         image_channel_data_type;
-} cl_image_format;
-
-typedef struct _cl_image_desc {
-    cl_mem_object_type      image_type;
-    size_t                  image_width;
-    size_t                  image_height;
-    size_t                  image_depth;
-    size_t                  image_array_size;
-    size_t                  image_row_pitch;
-    size_t                  image_slice_pitch;
-    cl_uint                 num_mip_levels;
-    cl_uint                 num_samples;
-    cl_mem                  buffer;
-} cl_image_desc;
-
-typedef struct _cl_buffer_region {
-    size_t                  origin;
-    size_t                  size;
-} cl_buffer_region;
-
-
-/******************************************************************************/
-
-/* Error Codes */
-#define CL_SUCCESS                                  0
-#define CL_DEVICE_NOT_FOUND                         -1
-#define CL_DEVICE_NOT_AVAILABLE                     -2
-#define CL_COMPILER_NOT_AVAILABLE                   -3
-#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
-#define CL_OUT_OF_RESOURCES                         -5
-#define CL_OUT_OF_HOST_MEMORY                       -6
-#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
-#define CL_MEM_COPY_OVERLAP                         -8
-#define CL_IMAGE_FORMAT_MISMATCH                    -9
-#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
-#define CL_BUILD_PROGRAM_FAILURE                    -11
-#define CL_MAP_FAILURE                              -12
-#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
-#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
-#define CL_COMPILE_PROGRAM_FAILURE                  -15
-#define CL_LINKER_NOT_AVAILABLE                     -16
-#define CL_LINK_PROGRAM_FAILURE                     -17
-#define CL_DEVICE_PARTITION_FAILED                  -18
-#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
-
-#define CL_INVALID_VALUE                            -30
-#define CL_INVALID_DEVICE_TYPE                      -31
-#define CL_INVALID_PLATFORM                         -32
-#define CL_INVALID_DEVICE                           -33
-#define CL_INVALID_CONTEXT                          -34
-#define CL_INVALID_QUEUE_PROPERTIES                 -35
-#define CL_INVALID_COMMAND_QUEUE                    -36
-#define CL_INVALID_HOST_PTR                         -37
-#define CL_INVALID_MEM_OBJECT                       -38
-#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
-#define CL_INVALID_IMAGE_SIZE                       -40
-#define CL_INVALID_SAMPLER                          -41
-#define CL_INVALID_BINARY                           -42
-#define CL_INVALID_BUILD_OPTIONS                    -43
-#define CL_INVALID_PROGRAM                          -44
-#define CL_INVALID_PROGRAM_EXECUTABLE               -45
-#define CL_INVALID_KERNEL_NAME                      -46
-#define CL_INVALID_KERNEL_DEFINITION                -47
-#define CL_INVALID_KERNEL                           -48
-#define CL_INVALID_ARG_INDEX                        -49
-#define CL_INVALID_ARG_VALUE                        -50
-#define CL_INVALID_ARG_SIZE                         -51
-#define CL_INVALID_KERNEL_ARGS                      -52
-#define CL_INVALID_WORK_DIMENSION                   -53
-#define CL_INVALID_WORK_GROUP_SIZE                  -54
-#define CL_INVALID_WORK_ITEM_SIZE                   -55
-#define CL_INVALID_GLOBAL_OFFSET                    -56
-#define CL_INVALID_EVENT_WAIT_LIST                  -57
-#define CL_INVALID_EVENT                            -58
-#define CL_INVALID_OPERATION                        -59
-#define CL_INVALID_GL_OBJECT                        -60
-#define CL_INVALID_BUFFER_SIZE                      -61
-#define CL_INVALID_MIP_LEVEL                        -62
-#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
-#define CL_INVALID_PROPERTY                         -64
-#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
-#define CL_INVALID_COMPILER_OPTIONS                 -66
-#define CL_INVALID_LINKER_OPTIONS                   -67
-#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
-
-/* OpenCL Version */
-#define CL_VERSION_1_0                              1
-#define CL_VERSION_1_1                              1
-#define CL_VERSION_1_2                              1
-
-/* cl_bool */
-#define CL_FALSE                                    0
-#define CL_TRUE                                     1
-#define CL_BLOCKING                                 CL_TRUE
-#define CL_NON_BLOCKING                             CL_FALSE
-
-/* cl_platform_info */
-#define CL_PLATFORM_PROFILE                         0x0900
-#define CL_PLATFORM_VERSION                         0x0901
-#define CL_PLATFORM_NAME                            0x0902
-#define CL_PLATFORM_VENDOR                          0x0903
-#define CL_PLATFORM_EXTENSIONS                      0x0904
-
-/* cl_device_type - bitfield */
-#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
-#define CL_DEVICE_TYPE_CPU                          (1 << 1)
-#define CL_DEVICE_TYPE_GPU                          (1 << 2)
-#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
-#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
-#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
-
-/* cl_device_info */
-#define CL_DEVICE_TYPE                              0x1000
-#define CL_DEVICE_VENDOR_ID                         0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
-#define CL_DEVICE_ADDRESS_BITS                      0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
-#define CL_DEVICE_MAX_SAMPLERS                      0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
-#define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
-#define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
-#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
-#define CL_DEVICE_AVAILABLE                         0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
-#define CL_DEVICE_NAME                              0x102B
-#define CL_DEVICE_VENDOR                            0x102C
-#define CL_DRIVER_VERSION                           0x102D
-#define CL_DEVICE_PROFILE                           0x102E
-#define CL_DEVICE_VERSION                           0x102F
-#define CL_DEVICE_EXTENSIONS                        0x1030
-#define CL_DEVICE_PLATFORM                          0x1031
-#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
-/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
-#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
-#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
-#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
-#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
-#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
-#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
-#define CL_DEVICE_PARENT_DEVICE                     0x1042
-#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
-#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
-#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
-#define CL_DEVICE_PARTITION_TYPE                    0x1046
-#define CL_DEVICE_REFERENCE_COUNT                   0x1047
-#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
-#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
-#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
-#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
-
-/* cl_device_fp_config - bitfield */
-#define CL_FP_DENORM                                (1 << 0)
-#define CL_FP_INF_NAN                               (1 << 1)
-#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
-#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
-#define CL_FP_ROUND_TO_INF                          (1 << 4)
-#define CL_FP_FMA                                   (1 << 5)
-#define CL_FP_SOFT_FLOAT                            (1 << 6)
-#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
-
-/* cl_device_mem_cache_type */
-#define CL_NONE                                     0x0
-#define CL_READ_ONLY_CACHE                          0x1
-#define CL_READ_WRITE_CACHE                         0x2
-
-/* cl_device_local_mem_type */
-#define CL_LOCAL                                    0x1
-#define CL_GLOBAL                                   0x2
-
-/* cl_device_exec_capabilities - bitfield */
-#define CL_EXEC_KERNEL                              (1 << 0)
-#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
-
-/* cl_command_queue_properties - bitfield */
-#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
-#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
-
-/* cl_context_info  */
-#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
-#define CL_CONTEXT_DEVICES                          0x1081
-#define CL_CONTEXT_PROPERTIES                       0x1082
-#define CL_CONTEXT_NUM_DEVICES                      0x1083
-
-/* cl_context_properties */
-#define CL_CONTEXT_PLATFORM                         0x1084
-#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
-
-/* cl_device_partition_property */
-#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
-#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
-#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
-#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
-
-/* cl_device_affinity_domain */
-#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
-#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
-#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
-#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
-#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
-#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
-
-/* cl_command_queue_info */
-#define CL_QUEUE_CONTEXT                            0x1090
-#define CL_QUEUE_DEVICE                             0x1091
-#define CL_QUEUE_REFERENCE_COUNT                    0x1092
-#define CL_QUEUE_PROPERTIES                         0x1093
-
-/* cl_mem_flags - bitfield */
-#define CL_MEM_READ_WRITE                           (1 << 0)
-#define CL_MEM_WRITE_ONLY                           (1 << 1)
-#define CL_MEM_READ_ONLY                            (1 << 2)
-#define CL_MEM_USE_HOST_PTR                         (1 << 3)
-#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
-#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
-// reserved                                         (1 << 6)
-#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
-#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
-#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
-
-/* cl_mem_migration_flags - bitfield */
-#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
-#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
-
-/* cl_channel_order */
-#define CL_R                                        0x10B0
-#define CL_A                                        0x10B1
-#define CL_RG                                       0x10B2
-#define CL_RA                                       0x10B3
-#define CL_RGB                                      0x10B4
-#define CL_RGBA                                     0x10B5
-#define CL_BGRA                                     0x10B6
-#define CL_ARGB                                     0x10B7
-#define CL_INTENSITY                                0x10B8
-#define CL_LUMINANCE                                0x10B9
-#define CL_Rx                                       0x10BA
-#define CL_RGx                                      0x10BB
-#define CL_RGBx                                     0x10BC
-#define CL_DEPTH                                    0x10BD
-#define CL_DEPTH_STENCIL                            0x10BE
-
-/* cl_channel_type */
-#define CL_SNORM_INT8                               0x10D0
-#define CL_SNORM_INT16                              0x10D1
-#define CL_UNORM_INT8                               0x10D2
-#define CL_UNORM_INT16                              0x10D3
-#define CL_UNORM_SHORT_565                          0x10D4
-#define CL_UNORM_SHORT_555                          0x10D5
-#define CL_UNORM_INT_101010                         0x10D6
-#define CL_SIGNED_INT8                              0x10D7
-#define CL_SIGNED_INT16                             0x10D8
-#define CL_SIGNED_INT32                             0x10D9
-#define CL_UNSIGNED_INT8                            0x10DA
-#define CL_UNSIGNED_INT16                           0x10DB
-#define CL_UNSIGNED_INT32                           0x10DC
-#define CL_HALF_FLOAT                               0x10DD
-#define CL_FLOAT                                    0x10DE
-#define CL_UNORM_INT24                              0x10DF
-
-/* cl_mem_object_type */
-#define CL_MEM_OBJECT_BUFFER                        0x10F0
-#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
-#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
-#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
-#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
-#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
-#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
-
-/* cl_mem_info */
-#define CL_MEM_TYPE                                 0x1100
-#define CL_MEM_FLAGS                                0x1101
-#define CL_MEM_SIZE                                 0x1102
-#define CL_MEM_HOST_PTR                             0x1103
-#define CL_MEM_MAP_COUNT                            0x1104
-#define CL_MEM_REFERENCE_COUNT                      0x1105
-#define CL_MEM_CONTEXT                              0x1106
-#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
-#define CL_MEM_OFFSET                               0x1108
-
-/* cl_image_info */
-#define CL_IMAGE_FORMAT                             0x1110
-#define CL_IMAGE_ELEMENT_SIZE                       0x1111
-#define CL_IMAGE_ROW_PITCH                          0x1112
-#define CL_IMAGE_SLICE_PITCH                        0x1113
-#define CL_IMAGE_WIDTH                              0x1114
-#define CL_IMAGE_HEIGHT                             0x1115
-#define CL_IMAGE_DEPTH                              0x1116
-#define CL_IMAGE_ARRAY_SIZE                         0x1117
-#define CL_IMAGE_BUFFER                             0x1118
-#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
-#define CL_IMAGE_NUM_SAMPLES                        0x111A
-
-/* cl_addressing_mode */
-#define CL_ADDRESS_NONE                             0x1130
-#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
-#define CL_ADDRESS_CLAMP                            0x1132
-#define CL_ADDRESS_REPEAT                           0x1133
-#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
-
-/* cl_filter_mode */
-#define CL_FILTER_NEAREST                           0x1140
-#define CL_FILTER_LINEAR                            0x1141
-
-/* cl_sampler_info */
-#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
-#define CL_SAMPLER_CONTEXT                          0x1151
-#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
-#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
-#define CL_SAMPLER_FILTER_MODE                      0x1154
-
-/* cl_map_flags - bitfield */
-#define CL_MAP_READ                                 (1 << 0)
-#define CL_MAP_WRITE                                (1 << 1)
-#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
-
-/* cl_program_info */
-#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
-#define CL_PROGRAM_CONTEXT                          0x1161
-#define CL_PROGRAM_NUM_DEVICES                      0x1162
-#define CL_PROGRAM_DEVICES                          0x1163
-#define CL_PROGRAM_SOURCE                           0x1164
-#define CL_PROGRAM_BINARY_SIZES                     0x1165
-#define CL_PROGRAM_BINARIES                         0x1166
-#define CL_PROGRAM_NUM_KERNELS                      0x1167
-#define CL_PROGRAM_KERNEL_NAMES                     0x1168
-
-/* cl_program_build_info */
-#define CL_PROGRAM_BUILD_STATUS                     0x1181
-#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
-#define CL_PROGRAM_BUILD_LOG                        0x1183
-#define CL_PROGRAM_BINARY_TYPE                      0x1184
-
-/* cl_program_binary_type */
-#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
-#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
-#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
-#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
-
-/* cl_build_status */
-#define CL_BUILD_SUCCESS                            0
-#define CL_BUILD_NONE                               -1
-#define CL_BUILD_ERROR                              -2
-#define CL_BUILD_IN_PROGRESS                        -3
-
-/* cl_kernel_info */
-#define CL_KERNEL_FUNCTION_NAME                     0x1190
-#define CL_KERNEL_NUM_ARGS                          0x1191
-#define CL_KERNEL_REFERENCE_COUNT                   0x1192
-#define CL_KERNEL_CONTEXT                           0x1193
-#define CL_KERNEL_PROGRAM                           0x1194
-#define CL_KERNEL_ATTRIBUTES                        0x1195
-
-/* cl_kernel_arg_info */
-#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
-#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
-#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
-#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
-#define CL_KERNEL_ARG_NAME                          0x119A
-
-/* cl_kernel_arg_address_qualifier */
-#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
-#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
-#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
-#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
-
-/* cl_kernel_arg_access_qualifier */
-#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
-#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
-#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
-#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
-
-/* cl_kernel_arg_type_qualifer */
-#define CL_KERNEL_ARG_TYPE_NONE                     0
-#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
-#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
-#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
-
-/* cl_kernel_work_group_info */
-#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
-#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
-#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
-#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
-#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
-#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
-
-/* cl_event_info  */
-#define CL_EVENT_COMMAND_QUEUE                      0x11D0
-#define CL_EVENT_COMMAND_TYPE                       0x11D1
-#define CL_EVENT_REFERENCE_COUNT                    0x11D2
-#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
-#define CL_EVENT_CONTEXT                            0x11D4
-
-/* cl_command_type */
-#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
-#define CL_COMMAND_TASK                             0x11F1
-#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
-#define CL_COMMAND_READ_BUFFER                      0x11F3
-#define CL_COMMAND_WRITE_BUFFER                     0x11F4
-#define CL_COMMAND_COPY_BUFFER                      0x11F5
-#define CL_COMMAND_READ_IMAGE                       0x11F6
-#define CL_COMMAND_WRITE_IMAGE                      0x11F7
-#define CL_COMMAND_COPY_IMAGE                       0x11F8
-#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
-#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
-#define CL_COMMAND_MAP_BUFFER                       0x11FB
-#define CL_COMMAND_MAP_IMAGE                        0x11FC
-#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
-#define CL_COMMAND_MARKER                           0x11FE
-#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
-#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
-#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
-#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
-#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
-#define CL_COMMAND_USER                             0x1204
-#define CL_COMMAND_BARRIER                          0x1205
-#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
-#define CL_COMMAND_FILL_BUFFER                      0x1207
-#define CL_COMMAND_FILL_IMAGE                       0x1208
-
-/* command execution status */
-#define CL_COMPLETE                                 0x0
-#define CL_RUNNING                                  0x1
-#define CL_SUBMITTED                                0x2
-#define CL_QUEUED                                   0x3
-
-/* cl_buffer_create_type  */
-#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
-
-/* cl_profiling_info  */
-#define CL_PROFILING_COMMAND_QUEUED                 0x1280
-#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
-#define CL_PROFILING_COMMAND_START                  0x1282
-#define CL_PROFILING_COMMAND_END                    0x1283
-
-/********************************************************************************************************/
-
-/* Platform API */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetPlatformIDs(cl_uint          /* num_entries */,
-                 cl_platform_id * /* platforms */,
-                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetPlatformInfo(cl_platform_id   /* platform */,
-                  cl_platform_info /* param_name */,
-                  size_t           /* param_value_size */,
-                  void *           /* param_value */,
-                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Device APIs */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetDeviceIDs(cl_platform_id   /* platform */,
-               cl_device_type   /* device_type */,
-               cl_uint          /* num_entries */,
-               cl_device_id *   /* devices */,
-               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetDeviceInfo(cl_device_id    /* device */,
-                cl_device_info  /* param_name */,
-                size_t          /* param_value_size */,
-                void *          /* param_value */,
-                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clCreateSubDevices(cl_device_id                         /* in_device */,
-                   const cl_device_partition_property * /* properties */,
-                   cl_uint                              /* num_devices */,
-                   cl_device_id *                       /* out_devices */,
-                   cl_uint *                            /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
-
-/* Context APIs  */
-extern CL_API_ENTRY cl_context CL_API_CALL
-clCreateContext(const cl_context_properties * /* properties */,
-                cl_uint                 /* num_devices */,
-                const cl_device_id *    /* devices */,
-                void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
-                void *                  /* user_data */,
-                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_context CL_API_CALL
-clCreateContextFromType(const cl_context_properties * /* properties */,
-                        cl_device_type          /* device_type */,
-                        void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
-                        void *                  /* user_data */,
-                        cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetContextInfo(cl_context         /* context */,
-                 cl_context_info    /* param_name */,
-                 size_t             /* param_value_size */,
-                 void *             /* param_value */,
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Command Queue APIs */
-extern CL_API_ENTRY cl_command_queue CL_API_CALL
-clCreateCommandQueue(cl_context                     /* context */,
-                     cl_device_id                   /* device */,
-                     cl_command_queue_properties    /* properties */,
-                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
-                      cl_command_queue_info /* param_name */,
-                      size_t                /* param_value_size */,
-                      void *                /* param_value */,
-                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Memory Object APIs */
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateBuffer(cl_context   /* context */,
-               cl_mem_flags /* flags */,
-               size_t       /* size */,
-               void *       /* host_ptr */,
-               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateSubBuffer(cl_mem                   /* buffer */,
-                  cl_mem_flags             /* flags */,
-                  cl_buffer_create_type    /* buffer_create_type */,
-                  const void *             /* buffer_create_info */,
-                  cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateImage(cl_context              /* context */,
-              cl_mem_flags            /* flags */,
-              const cl_image_format * /* image_format */,
-              const cl_image_desc *   /* image_desc */,
-              void *                  /* host_ptr */,
-              cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetSupportedImageFormats(cl_context           /* context */,
-                           cl_mem_flags         /* flags */,
-                           cl_mem_object_type   /* image_type */,
-                           cl_uint              /* num_entries */,
-                           cl_image_format *    /* image_formats */,
-                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetMemObjectInfo(cl_mem           /* memobj */,
-                   cl_mem_info      /* param_name */,
-                   size_t           /* param_value_size */,
-                   void *           /* param_value */,
-                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetImageInfo(cl_mem           /* image */,
-               cl_image_info    /* param_name */,
-               size_t           /* param_value_size */,
-               void *           /* param_value */,
-               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetMemObjectDestructorCallback(  cl_mem /* memobj */,
-                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
-                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;
-
-/* Sampler APIs */
-extern CL_API_ENTRY cl_sampler CL_API_CALL
-clCreateSampler(cl_context          /* context */,
-                cl_bool             /* normalized_coords */,
-                cl_addressing_mode  /* addressing_mode */,
-                cl_filter_mode      /* filter_mode */,
-                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetSamplerInfo(cl_sampler         /* sampler */,
-                 cl_sampler_info    /* param_name */,
-                 size_t             /* param_value_size */,
-                 void *             /* param_value */,
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Program Object APIs  */
-extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithSource(cl_context        /* context */,
-                          cl_uint           /* count */,
-                          const char **     /* strings */,
-                          const size_t *    /* lengths */,
-                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithBinary(cl_context                     /* context */,
-                          cl_uint                        /* num_devices */,
-                          const cl_device_id *           /* device_list */,
-                          const size_t *                 /* lengths */,
-                          const unsigned char **         /* binaries */,
-                          cl_int *                       /* binary_status */,
-                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithBuiltInKernels(cl_context            /* context */,
-                                  cl_uint               /* num_devices */,
-                                  const cl_device_id *  /* device_list */,
-                                  const char *          /* kernel_names */,
-                                  cl_int *              /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clBuildProgram(cl_program           /* program */,
-               cl_uint              /* num_devices */,
-               const cl_device_id * /* device_list */,
-               const char *         /* options */,
-               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
-               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clCompileProgram(cl_program           /* program */,
-                 cl_uint              /* num_devices */,
-                 const cl_device_id * /* device_list */,
-                 const char *         /* options */,
-                 cl_uint              /* num_input_headers */,
-                 const cl_program *   /* input_headers */,
-                 const char **        /* header_include_names */,
-                 void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
-                 void *               /* user_data */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_program CL_API_CALL
-clLinkProgram(cl_context           /* context */,
-              cl_uint              /* num_devices */,
-              const cl_device_id * /* device_list */,
-              const char *         /* options */,
-              cl_uint              /* num_input_programs */,
-              const cl_program *   /* input_programs */,
-              void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
-              void *               /* user_data */,
-              cl_int *             /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
-
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetProgramInfo(cl_program         /* program */,
-                 cl_program_info    /* param_name */,
-                 size_t             /* param_value_size */,
-                 void *             /* param_value */,
-                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetProgramBuildInfo(cl_program            /* program */,
-                      cl_device_id          /* device */,
-                      cl_program_build_info /* param_name */,
-                      size_t                /* param_value_size */,
-                      void *                /* param_value */,
-                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Kernel Object APIs */
-extern CL_API_ENTRY cl_kernel CL_API_CALL
-clCreateKernel(cl_program      /* program */,
-               const char *    /* kernel_name */,
-               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clCreateKernelsInProgram(cl_program     /* program */,
-                         cl_uint        /* num_kernels */,
-                         cl_kernel *    /* kernels */,
-                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainKernel(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetKernelArg(cl_kernel    /* kernel */,
-               cl_uint      /* arg_index */,
-               size_t       /* arg_size */,
-               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetKernelInfo(cl_kernel       /* kernel */,
-                cl_kernel_info  /* param_name */,
-                size_t          /* param_value_size */,
-                void *          /* param_value */,
-                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetKernelArgInfo(cl_kernel       /* kernel */,
-                   cl_uint         /* arg_indx */,
-                   cl_kernel_arg_info  /* param_name */,
-                   size_t          /* param_value_size */,
-                   void *          /* param_value */,
-                   size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
-                         cl_device_id               /* device */,
-                         cl_kernel_work_group_info  /* param_name */,
-                         size_t                     /* param_value_size */,
-                         void *                     /* param_value */,
-                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Event Object APIs */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clWaitForEvents(cl_uint             /* num_events */,
-                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetEventInfo(cl_event         /* event */,
-               cl_event_info    /* param_name */,
-               size_t           /* param_value_size */,
-               void *           /* param_value */,
-               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_event CL_API_CALL
-clCreateUserEvent(cl_context    /* context */,
-                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetUserEventStatus(cl_event   /* event */,
-                     cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetEventCallback( cl_event    /* event */,
-                    cl_int      /* command_exec_callback_type */,
-                    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
-                    void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
-
-/* Profiling APIs */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clGetEventProfilingInfo(cl_event            /* event */,
-                        cl_profiling_info   /* param_name */,
-                        size_t              /* param_value_size */,
-                        void *              /* param_value */,
-                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Flush and Finish APIs */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-/* Enqueued Commands APIs */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
-                    cl_mem              /* buffer */,
-                    cl_bool             /* blocking_read */,
-                    size_t              /* offset */,
-                    size_t              /* size */,
-                    void *              /* ptr */,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
-                        cl_mem              /* buffer */,
-                        cl_bool             /* blocking_read */,
-                        const size_t *      /* buffer_offset */,
-                        const size_t *      /* host_offset */,
-                        const size_t *      /* region */,
-                        size_t              /* buffer_row_pitch */,
-                        size_t              /* buffer_slice_pitch */,
-                        size_t              /* host_row_pitch */,
-                        size_t              /* host_slice_pitch */,
-                        void *              /* ptr */,
-                        cl_uint             /* num_events_in_wait_list */,
-                        const cl_event *    /* event_wait_list */,
-                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWriteBuffer(cl_command_queue   /* command_queue */,
-                     cl_mem             /* buffer */,
-                     cl_bool            /* blocking_write */,
-                     size_t             /* offset */,
-                     size_t             /* size */,
-                     const void *       /* ptr */,
-                     cl_uint            /* num_events_in_wait_list */,
-                     const cl_event *   /* event_wait_list */,
-                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
-                         cl_mem              /* buffer */,
-                         cl_bool             /* blocking_write */,
-                         const size_t *      /* buffer_offset */,
-                         const size_t *      /* host_offset */,
-                         const size_t *      /* region */,
-                         size_t              /* buffer_row_pitch */,
-                         size_t              /* buffer_slice_pitch */,
-                         size_t              /* host_row_pitch */,
-                         size_t              /* host_slice_pitch */,
-                         const void *        /* ptr */,
-                         cl_uint             /* num_events_in_wait_list */,
-                         const cl_event *    /* event_wait_list */,
-                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueFillBuffer(cl_command_queue   /* command_queue */,
-                    cl_mem             /* buffer */,
-                    const void *       /* pattern */,
-                    size_t             /* pattern_size */,
-                    size_t             /* offset */,
-                    size_t             /* size */,
-                    cl_uint            /* num_events_in_wait_list */,
-                    const cl_event *   /* event_wait_list */,
-                    cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBuffer(cl_command_queue    /* command_queue */,
-                    cl_mem              /* src_buffer */,
-                    cl_mem              /* dst_buffer */,
-                    size_t              /* src_offset */,
-                    size_t              /* dst_offset */,
-                    size_t              /* size */,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */,
-                        cl_mem              /* src_buffer */,
-                        cl_mem              /* dst_buffer */,
-                        const size_t *      /* src_origin */,
-                        const size_t *      /* dst_origin */,
-                        const size_t *      /* region */,
-                        size_t              /* src_row_pitch */,
-                        size_t              /* src_slice_pitch */,
-                        size_t              /* dst_row_pitch */,
-                        size_t              /* dst_slice_pitch */,
-                        cl_uint             /* num_events_in_wait_list */,
-                        const cl_event *    /* event_wait_list */,
-                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueReadImage(cl_command_queue     /* command_queue */,
-                   cl_mem               /* image */,
-                   cl_bool              /* blocking_read */,
-                   const size_t *       /* origin[3] */,
-                   const size_t *       /* region[3] */,
-                   size_t               /* row_pitch */,
-                   size_t               /* slice_pitch */,
-                   void *               /* ptr */,
-                   cl_uint              /* num_events_in_wait_list */,
-                   const cl_event *     /* event_wait_list */,
-                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWriteImage(cl_command_queue    /* command_queue */,
-                    cl_mem              /* image */,
-                    cl_bool             /* blocking_write */,
-                    const size_t *      /* origin[3] */,
-                    const size_t *      /* region[3] */,
-                    size_t              /* input_row_pitch */,
-                    size_t              /* input_slice_pitch */,
-                    const void *        /* ptr */,
-                    cl_uint             /* num_events_in_wait_list */,
-                    const cl_event *    /* event_wait_list */,
-                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueFillImage(cl_command_queue   /* command_queue */,
-                   cl_mem             /* image */,
-                   const void *       /* fill_color */,
-                   const size_t *     /* origin[3] */,
-                   const size_t *     /* region[3] */,
-                   cl_uint            /* num_events_in_wait_list */,
-                   const cl_event *   /* event_wait_list */,
-                   cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyImage(cl_command_queue     /* command_queue */,
-                   cl_mem               /* src_image */,
-                   cl_mem               /* dst_image */,
-                   const size_t *       /* src_origin[3] */,
-                   const size_t *       /* dst_origin[3] */,
-                   const size_t *       /* region[3] */,
-                   cl_uint              /* num_events_in_wait_list */,
-                   const cl_event *     /* event_wait_list */,
-                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
-                           cl_mem           /* src_image */,
-                           cl_mem           /* dst_buffer */,
-                           const size_t *   /* src_origin[3] */,
-                           const size_t *   /* region[3] */,
-                           size_t           /* dst_offset */,
-                           cl_uint          /* num_events_in_wait_list */,
-                           const cl_event * /* event_wait_list */,
-                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
-                           cl_mem           /* src_buffer */,
-                           cl_mem           /* dst_image */,
-                           size_t           /* src_offset */,
-                           const size_t *   /* dst_origin[3] */,
-                           const size_t *   /* region[3] */,
-                           cl_uint          /* num_events_in_wait_list */,
-                           const cl_event * /* event_wait_list */,
-                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY void * CL_API_CALL
-clEnqueueMapBuffer(cl_command_queue /* command_queue */,
-                   cl_mem           /* buffer */,
-                   cl_bool          /* blocking_map */,
-                   cl_map_flags     /* map_flags */,
-                   size_t           /* offset */,
-                   size_t           /* size */,
-                   cl_uint          /* num_events_in_wait_list */,
-                   const cl_event * /* event_wait_list */,
-                   cl_event *       /* event */,
-                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY void * CL_API_CALL
-clEnqueueMapImage(cl_command_queue  /* command_queue */,
-                  cl_mem            /* image */,
-                  cl_bool           /* blocking_map */,
-                  cl_map_flags      /* map_flags */,
-                  const size_t *    /* origin[3] */,
-                  const size_t *    /* region[3] */,
-                  size_t *          /* image_row_pitch */,
-                  size_t *          /* image_slice_pitch */,
-                  cl_uint           /* num_events_in_wait_list */,
-                  const cl_event *  /* event_wait_list */,
-                  cl_event *        /* event */,
-                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
-                        cl_mem           /* memobj */,
-                        void *           /* mapped_ptr */,
-                        cl_uint          /* num_events_in_wait_list */,
-                        const cl_event *  /* event_wait_list */,
-                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMigrateMemObjects(cl_command_queue       /* command_queue */,
-                           cl_uint                /* num_mem_objects */,
-                           const cl_mem *         /* mem_objects */,
-                           cl_mem_migration_flags /* flags */,
-                           cl_uint                /* num_events_in_wait_list */,
-                           const cl_event *       /* event_wait_list */,
-                           cl_event *             /* event */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
-                       cl_kernel        /* kernel */,
-                       cl_uint          /* work_dim */,
-                       const size_t *   /* global_work_offset */,
-                       const size_t *   /* global_work_size */,
-                       const size_t *   /* local_work_size */,
-                       cl_uint          /* num_events_in_wait_list */,
-                       const cl_event * /* event_wait_list */,
-                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueTask(cl_command_queue  /* command_queue */,
-              cl_kernel         /* kernel */,
-              cl_uint           /* num_events_in_wait_list */,
-              const cl_event *  /* event_wait_list */,
-              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
-                      void (CL_CALLBACK * /*user_func*/)(void *),
-                      void *            /* args */,
-                      size_t            /* cb_args */,
-                      cl_uint           /* num_mem_objects */,
-                      const cl_mem *    /* mem_list */,
-                      const void **     /* args_mem_loc */,
-                      cl_uint           /* num_events_in_wait_list */,
-                      const cl_event *  /* event_wait_list */,
-                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
-                            cl_uint           /* num_events_in_wait_list */,
-                            const cl_event *  /* event_wait_list */,
-                            cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
-                             cl_uint           /* num_events_in_wait_list */,
-                             const cl_event *  /* event_wait_list */,
-                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
-
-
-/* Extension function access
- *
- * Returns the extension function address for the given function name,
- * or NULL if a valid function can not be found.  The client must
- * check to make sure the address is not NULL, before using or
- * calling the returned function address.
- */
-extern CL_API_ENTRY void * CL_API_CALL
-clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
-                                         const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
-
-
-// Deprecated OpenCL 1.1 APIs
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
-clCreateImage2D(cl_context              /* context */,
-                cl_mem_flags            /* flags */,
-                const cl_image_format * /* image_format */,
-                size_t                  /* image_width */,
-                size_t                  /* image_height */,
-                size_t                  /* image_row_pitch */,
-                void *                  /* host_ptr */,
-                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
-clCreateImage3D(cl_context              /* context */,
-                cl_mem_flags            /* flags */,
-                const cl_image_format * /* image_format */,
-                size_t                  /* image_width */,
-                size_t                  /* image_height */,
-                size_t                  /* image_depth */,
-                size_t                  /* image_row_pitch */,
-                size_t                  /* image_slice_pitch */,
-                void *                  /* host_ptr */,
-                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
-clEnqueueMarker(cl_command_queue    /* command_queue */,
-                cl_event *          /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
-clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
-                        cl_uint          /* num_events */,
-                        const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
-clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
-clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
-clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  /* __OPENCL_CL_H */
diff --git a/android/src/main/libenc/jni/libx264/extras/cl_platform.h b/android/src/main/libenc/jni/libx264/extras/cl_platform.h
deleted file mode 100755
index 7b06e09..0000000
--- a/android/src/main/libenc/jni/libx264/extras/cl_platform.h
+++ /dev/null
@@ -1,1268 +0,0 @@
-/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- **********************************************************************************/
-
-/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
-
-#ifndef __CL_PLATFORM_H
-#define __CL_PLATFORM_H
-
-#ifdef __APPLE__
-    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
-    #include <AvailabilityMacros.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if defined(_WIN32)
-    #define CL_API_ENTRY
-    #define CL_API_CALL     __stdcall
-    #define CL_CALLBACK     __stdcall
-#else
-    #define CL_API_ENTRY
-    #define CL_API_CALL
-    #define CL_CALLBACK
-#endif
-
-#ifdef __APPLE__
-    #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
-    #ifndef UNAVAILABLE_ATTRIBUTE
-        #define UNAVAILABLE_ATTRIBUTE
-    #endif
-    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
-        #define CL_API_SUFFIX__VERSION_1_0              AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
-        #define CL_EXT_SUFFIX__VERSION_1_0              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
-    #else
-        #define CL_API_SUFFIX__VERSION_1_0              UNAVAILABLE_ATTRIBUTE
-        #define CL_EXT_SUFFIX__VERSION_1_0              CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE
-    #endif
-    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
-        #define CL_API_SUFFIX__VERSION_1_1              AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
-        #define GCL_API_SUFFIX__VERSION_1_1             AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
-        #define CL_EXT_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
-        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
-    #else
-        #define CL_API_SUFFIX__VERSION_1_1              UNAVAILABLE_ATTRIBUTE
-        #define GCL_API_SUFFIX__VERSION_1_1             UNAVAILABLE_ATTRIBUTE
-        #define CL_EXT_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE
-        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATE    CL_EXT_SUFFIX__VERSION_1_0
-    #endif
-    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
-        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
-        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
-        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
-        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
-    #else
-        #define CL_API_SUFFIX__VERSION_1_2              UNAVAILABLE_ATTRIBUTE
-        #define GCL_API_SUFFIX__VERSION_1_2             UNAVAILABLE_ATTRIBUTE
-        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE
-        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXT_SUFFIX__VERSION_1_1
-    #endif
-#else
-    #define CL_EXTENSION_WEAK_LINK
-    #define CL_API_SUFFIX__VERSION_1_0
-    #define CL_EXT_SUFFIX__VERSION_1_0
-    #define CL_API_SUFFIX__VERSION_1_1
-    #define CL_EXT_SUFFIX__VERSION_1_1
-    #define CL_API_SUFFIX__VERSION_1_2
-    #define CL_EXT_SUFFIX__VERSION_1_2
-
-    #ifdef __GNUC__
-        #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
-        #else
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
-        #endif
-
-        #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-        #else
-            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-        #endif
-    #elif _WIN32
-        #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
-        #else
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
-        #endif
-
-        #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-        #else
-            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
-        #endif
-    #else
-        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
-        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
-
-        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-    #endif
-#endif
-
-#if (defined (_WIN32) && defined(_MSC_VER))
-
-/* scalar types  */
-typedef signed   __int8         cl_char;
-typedef unsigned __int8         cl_uchar;
-typedef signed   __int16        cl_short;
-typedef unsigned __int16        cl_ushort;
-typedef signed   __int32        cl_int;
-typedef unsigned __int32        cl_uint;
-typedef signed   __int64        cl_long;
-typedef unsigned __int64        cl_ulong;
-
-typedef unsigned __int16        cl_half;
-typedef float                   cl_float;
-typedef double                  cl_double;
-
-/* Macro names and corresponding values defined by OpenCL */
-#define CL_CHAR_BIT         8
-#define CL_SCHAR_MAX        127
-#define CL_SCHAR_MIN        (-127-1)
-#define CL_CHAR_MAX         CL_SCHAR_MAX
-#define CL_CHAR_MIN         CL_SCHAR_MIN
-#define CL_UCHAR_MAX        255
-#define CL_SHRT_MAX         32767
-#define CL_SHRT_MIN         (-32767-1)
-#define CL_USHRT_MAX        65535
-#define CL_INT_MAX          2147483647
-#define CL_INT_MIN          (-2147483647-1)
-#define CL_UINT_MAX         0xffffffffU
-#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
-#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
-#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
-
-#define CL_FLT_DIG          6
-#define CL_FLT_MANT_DIG     24
-#define CL_FLT_MAX_10_EXP   +38
-#define CL_FLT_MAX_EXP      +128
-#define CL_FLT_MIN_10_EXP   -37
-#define CL_FLT_MIN_EXP      -125
-#define CL_FLT_RADIX        2
-#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
-#define CL_FLT_MIN          1.175494350822287507969e-38f
-#define CL_FLT_EPSILON      0x1.0p-23f
-
-#define CL_DBL_DIG          15
-#define CL_DBL_MANT_DIG     53
-#define CL_DBL_MAX_10_EXP   +308
-#define CL_DBL_MAX_EXP      +1024
-#define CL_DBL_MIN_10_EXP   -307
-#define CL_DBL_MIN_EXP      -1021
-#define CL_DBL_RADIX        2
-#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
-#define CL_DBL_MIN          2.225073858507201383090e-308
-#define CL_DBL_EPSILON      2.220446049250313080847e-16
-
-#define  CL_M_E             2.718281828459045090796
-#define  CL_M_LOG2E         1.442695040888963387005
-#define  CL_M_LOG10E        0.434294481903251816668
-#define  CL_M_LN2           0.693147180559945286227
-#define  CL_M_LN10          2.302585092994045901094
-#define  CL_M_PI            3.141592653589793115998
-#define  CL_M_PI_2          1.570796326794896557999
-#define  CL_M_PI_4          0.785398163397448278999
-#define  CL_M_1_PI          0.318309886183790691216
-#define  CL_M_2_PI          0.636619772367581382433
-#define  CL_M_2_SQRTPI      1.128379167095512558561
-#define  CL_M_SQRT2         1.414213562373095145475
-#define  CL_M_SQRT1_2       0.707106781186547572737
-
-#define  CL_M_E_F           2.71828174591064f
-#define  CL_M_LOG2E_F       1.44269502162933f
-#define  CL_M_LOG10E_F      0.43429449200630f
-#define  CL_M_LN2_F         0.69314718246460f
-#define  CL_M_LN10_F        2.30258512496948f
-#define  CL_M_PI_F          3.14159274101257f
-#define  CL_M_PI_2_F        1.57079637050629f
-#define  CL_M_PI_4_F        0.78539818525314f
-#define  CL_M_1_PI_F        0.31830987334251f
-#define  CL_M_2_PI_F        0.63661974668503f
-#define  CL_M_2_SQRTPI_F    1.12837922573090f
-#define  CL_M_SQRT2_F       1.41421353816986f
-#define  CL_M_SQRT1_2_F     0.70710676908493f
-
-#define CL_NAN              (CL_INFINITY - CL_INFINITY)
-#define CL_HUGE_VALF        ((cl_float) 1e50)
-#define CL_HUGE_VAL         ((cl_double) 1e500)
-#define CL_MAXFLOAT         CL_FLT_MAX
-#define CL_INFINITY         CL_HUGE_VALF
-
-#else
-
-#include <stdint.h>
-
-/* scalar types  */
-typedef int8_t          cl_char;
-typedef uint8_t         cl_uchar;
-typedef int16_t         cl_short    __attribute__((aligned(2)));
-typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
-typedef int32_t         cl_int      __attribute__((aligned(4)));
-typedef uint32_t        cl_uint     __attribute__((aligned(4)));
-typedef int64_t         cl_long     __attribute__((aligned(8)));
-typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
-
-typedef uint16_t        cl_half     __attribute__((aligned(2)));
-typedef float           cl_float    __attribute__((aligned(4)));
-typedef double          cl_double   __attribute__((aligned(8)));
-
-/* Macro names and corresponding values defined by OpenCL */
-#define CL_CHAR_BIT         8
-#define CL_SCHAR_MAX        127
-#define CL_SCHAR_MIN        (-127-1)
-#define CL_CHAR_MAX         CL_SCHAR_MAX
-#define CL_CHAR_MIN         CL_SCHAR_MIN
-#define CL_UCHAR_MAX        255
-#define CL_SHRT_MAX         32767
-#define CL_SHRT_MIN         (-32767-1)
-#define CL_USHRT_MAX        65535
-#define CL_INT_MAX          2147483647
-#define CL_INT_MIN          (-2147483647-1)
-#define CL_UINT_MAX         0xffffffffU
-#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
-#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
-#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
-
-#define CL_FLT_DIG          6
-#define CL_FLT_MANT_DIG     24
-#define CL_FLT_MAX_10_EXP   +38
-#define CL_FLT_MAX_EXP      +128
-#define CL_FLT_MIN_10_EXP   -37
-#define CL_FLT_MIN_EXP      -125
-#define CL_FLT_RADIX        2
-#define CL_FLT_MAX          0x1.fffffep127f
-#define CL_FLT_MIN          0x1.0p-126f
-#define CL_FLT_EPSILON      0x1.0p-23f
-
-#define CL_DBL_DIG          15
-#define CL_DBL_MANT_DIG     53
-#define CL_DBL_MAX_10_EXP   +308
-#define CL_DBL_MAX_EXP      +1024
-#define CL_DBL_MIN_10_EXP   -307
-#define CL_DBL_MIN_EXP      -1021
-#define CL_DBL_RADIX        2
-#define CL_DBL_MAX          0x1.fffffffffffffp1023
-#define CL_DBL_MIN          0x1.0p-1022
-#define CL_DBL_EPSILON      0x1.0p-52
-
-#define  CL_M_E             2.718281828459045090796
-#define  CL_M_LOG2E         1.442695040888963387005
-#define  CL_M_LOG10E        0.434294481903251816668
-#define  CL_M_LN2           0.693147180559945286227
-#define  CL_M_LN10          2.302585092994045901094
-#define  CL_M_PI            3.141592653589793115998
-#define  CL_M_PI_2          1.570796326794896557999
-#define  CL_M_PI_4          0.785398163397448278999
-#define  CL_M_1_PI          0.318309886183790691216
-#define  CL_M_2_PI          0.636619772367581382433
-#define  CL_M_2_SQRTPI      1.128379167095512558561
-#define  CL_M_SQRT2         1.414213562373095145475
-#define  CL_M_SQRT1_2       0.707106781186547572737
-
-#define  CL_M_E_F           2.71828174591064f
-#define  CL_M_LOG2E_F       1.44269502162933f
-#define  CL_M_LOG10E_F      0.43429449200630f
-#define  CL_M_LN2_F         0.69314718246460f
-#define  CL_M_LN10_F        2.30258512496948f
-#define  CL_M_PI_F          3.14159274101257f
-#define  CL_M_PI_2_F        1.57079637050629f
-#define  CL_M_PI_4_F        0.78539818525314f
-#define  CL_M_1_PI_F        0.31830987334251f
-#define  CL_M_2_PI_F        0.63661974668503f
-#define  CL_M_2_SQRTPI_F    1.12837922573090f
-#define  CL_M_SQRT2_F       1.41421353816986f
-#define  CL_M_SQRT1_2_F     0.70710676908493f
-
-#if defined( __GNUC__ )
-   #define CL_HUGE_VALF     __builtin_huge_valf()
-   #define CL_HUGE_VAL      __builtin_huge_val()
-   #define CL_NAN           __builtin_nanf( "" )
-#else
-   #define CL_HUGE_VALF     ((cl_float) 1e50)
-   #define CL_HUGE_VAL      ((cl_double) 1e500)
-   float nanf( const char * );
-   #define CL_NAN           nanf( "" )
-#endif
-#define CL_MAXFLOAT         CL_FLT_MAX
-#define CL_INFINITY         CL_HUGE_VALF
-
-#endif
-
-#include <stddef.h>
-
-/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */
-typedef unsigned int cl_GLuint;
-typedef int          cl_GLint;
-typedef unsigned int cl_GLenum;
-
-/*
- * Vector types
- *
- *  Note:   OpenCL requires that all types be naturally aligned.
- *          This means that vector types must be naturally aligned.
- *          For example, a vector of four floats must be aligned to
- *          a 16 byte boundary (calculated as 4 * the natural 4-byte
- *          alignment of the float).  The alignment qualifiers here
- *          will only function properly if your compiler supports them
- *          and if you don't actively work to defeat them.  For example,
- *          in order for a cl_float4 to be 16 byte aligned in a struct,
- *          the start of the struct must itself be 16-byte aligned.
- *
- *          Maintaining proper alignment is the user's responsibility.
- */
-
-/* Define basic vector types */
-#if defined( __VEC__ )
-   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
-   typedef vector unsigned char     __cl_uchar16;
-   typedef vector signed char       __cl_char16;
-   typedef vector unsigned short    __cl_ushort8;
-   typedef vector signed short      __cl_short8;
-   typedef vector unsigned int      __cl_uint4;
-   typedef vector signed int        __cl_int4;
-   typedef vector float             __cl_float4;
-   #define  __CL_UCHAR16__  1
-   #define  __CL_CHAR16__   1
-   #define  __CL_USHORT8__  1
-   #define  __CL_SHORT8__   1
-   #define  __CL_UINT4__    1
-   #define  __CL_INT4__     1
-   #define  __CL_FLOAT4__   1
-#endif
-
-#if defined( __SSE__ )
-    #if defined( __MINGW64__ )
-        #include <intrin.h>
-    #else
-        #include <xmmintrin.h>
-    #endif
-    #if defined( __GNUC__ )
-        typedef float __cl_float4   __attribute__((vector_size(16)));
-    #else
-        typedef __m128 __cl_float4;
-    #endif
-    #define __CL_FLOAT4__   1
-#endif
-
-#if defined( __SSE2__ )
-    #if defined( __MINGW64__ )
-        #include <intrin.h>
-    #else
-        #include <emmintrin.h>
-    #endif
-    #if defined( __GNUC__ )
-        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
-        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
-        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
-        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
-        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
-        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
-        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
-        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
-        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
-    #else
-        typedef __m128i __cl_uchar16;
-        typedef __m128i __cl_char16;
-        typedef __m128i __cl_ushort8;
-        typedef __m128i __cl_short8;
-        typedef __m128i __cl_uint4;
-        typedef __m128i __cl_int4;
-        typedef __m128i __cl_ulong2;
-        typedef __m128i __cl_long2;
-        typedef __m128d __cl_double2;
-    #endif
-    #define __CL_UCHAR16__  1
-    #define __CL_CHAR16__   1
-    #define __CL_USHORT8__  1
-    #define __CL_SHORT8__   1
-    #define __CL_INT4__     1
-    #define __CL_UINT4__    1
-    #define __CL_ULONG2__   1
-    #define __CL_LONG2__    1
-    #define __CL_DOUBLE2__  1
-#endif
-
-#if defined( __MMX__ )
-    #include <mmintrin.h>
-    #if defined( __GNUC__ )
-        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
-        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
-        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
-        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
-        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
-        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
-        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
-        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
-        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
-    #else
-        typedef __m64       __cl_uchar8;
-        typedef __m64       __cl_char8;
-        typedef __m64       __cl_ushort4;
-        typedef __m64       __cl_short4;
-        typedef __m64       __cl_uint2;
-        typedef __m64       __cl_int2;
-        typedef __m64       __cl_ulong1;
-        typedef __m64       __cl_long1;
-        typedef __m64       __cl_float2;
-    #endif
-    #define __CL_UCHAR8__   1
-    #define __CL_CHAR8__    1
-    #define __CL_USHORT4__  1
-    #define __CL_SHORT4__   1
-    #define __CL_INT2__     1
-    #define __CL_UINT2__    1
-    #define __CL_ULONG1__   1
-    #define __CL_LONG1__    1
-    #define __CL_FLOAT2__   1
-#endif
-
-#if defined( __AVX__ )
-    #if defined( __MINGW64__ )
-        #include <intrin.h>
-    #else
-        #include <immintrin.h>
-    #endif
-    #if defined( __GNUC__ )
-        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
-        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
-    #else
-        typedef __m256      __cl_float8;
-        typedef __m256d     __cl_double4;
-    #endif
-    #define __CL_FLOAT8__   1
-    #define __CL_DOUBLE4__  1
-#endif
-
-/* Define alignment keys */
-#if defined( __GNUC__ )
-    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
-#elif defined( _WIN32) && (_MSC_VER)
-    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
-    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
-    /* #include <crtdefs.h>                                                                                             */
-    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
-    #define CL_ALIGNED(_x)
-#else
-   #warning  Need to implement some method to align data here
-   #define  CL_ALIGNED(_x)
-#endif
-
-/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-    /* .xyzw and .s0123...{f|F} are supported */
-    #define CL_HAS_NAMED_VECTOR_FIELDS 1
-    /* .hi and .lo are supported */
-    #define CL_HAS_HI_LO_VECTOR_FIELDS 1
-#endif
-
-/* Define cl_vector types */
-
-/* ---- cl_charn ---- */
-typedef union
-{
-    cl_char  CL_ALIGNED(2) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y; };
-   __extension__ struct{ cl_char  s0, s1; };
-   __extension__ struct{ cl_char  lo, hi; };
-#endif
-#if defined( __CL_CHAR2__)
-    __cl_char2     v2;
-#endif
-}cl_char2;
-
-typedef union
-{
-    cl_char  CL_ALIGNED(4) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3; };
-   __extension__ struct{ cl_char2 lo, hi; };
-#endif
-#if defined( __CL_CHAR2__)
-    __cl_char2     v2[2];
-#endif
-#if defined( __CL_CHAR4__)
-    __cl_char4     v4;
-#endif
-}cl_char4;
-
-/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
-typedef  cl_char4  cl_char3;
-
-typedef union
-{
-    cl_char   CL_ALIGNED(8) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_char4 lo, hi; };
-#endif
-#if defined( __CL_CHAR2__)
-    __cl_char2     v2[4];
-#endif
-#if defined( __CL_CHAR4__)
-    __cl_char4     v4[2];
-#endif
-#if defined( __CL_CHAR8__ )
-    __cl_char8     v8;
-#endif
-}cl_char8;
-
-typedef union
-{
-    cl_char  CL_ALIGNED(16) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_char8 lo, hi; };
-#endif
-#if defined( __CL_CHAR2__)
-    __cl_char2     v2[8];
-#endif
-#if defined( __CL_CHAR4__)
-    __cl_char4     v4[4];
-#endif
-#if defined( __CL_CHAR8__ )
-    __cl_char8     v8[2];
-#endif
-#if defined( __CL_CHAR16__ )
-    __cl_char16    v16;
-#endif
-}cl_char16;
-
-
-/* ---- cl_ucharn ---- */
-typedef union
-{
-    cl_uchar  CL_ALIGNED(2) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y; };
-   __extension__ struct{ cl_uchar  s0, s1; };
-   __extension__ struct{ cl_uchar  lo, hi; };
-#endif
-#if defined( __cl_uchar2__)
-    __cl_uchar2     v2;
-#endif
-}cl_uchar2;
-
-typedef union
-{
-    cl_uchar  CL_ALIGNED(4) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3; };
-   __extension__ struct{ cl_uchar2 lo, hi; };
-#endif
-#if defined( __CL_UCHAR2__)
-    __cl_uchar2     v2[2];
-#endif
-#if defined( __CL_UCHAR4__)
-    __cl_uchar4     v4;
-#endif
-}cl_uchar4;
-
-/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
-typedef  cl_uchar4  cl_uchar3;
-
-typedef union
-{
-    cl_uchar   CL_ALIGNED(8) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_uchar4 lo, hi; };
-#endif
-#if defined( __CL_UCHAR2__)
-    __cl_uchar2     v2[4];
-#endif
-#if defined( __CL_UCHAR4__)
-    __cl_uchar4     v4[2];
-#endif
-#if defined( __CL_UCHAR8__ )
-    __cl_uchar8     v8;
-#endif
-}cl_uchar8;
-
-typedef union
-{
-    cl_uchar  CL_ALIGNED(16) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_uchar8 lo, hi; };
-#endif
-#if defined( __CL_UCHAR2__)
-    __cl_uchar2     v2[8];
-#endif
-#if defined( __CL_UCHAR4__)
-    __cl_uchar4     v4[4];
-#endif
-#if defined( __CL_UCHAR8__ )
-    __cl_uchar8     v8[2];
-#endif
-#if defined( __CL_UCHAR16__ )
-    __cl_uchar16    v16;
-#endif
-}cl_uchar16;
-
-
-/* ---- cl_shortn ---- */
-typedef union
-{
-    cl_short  CL_ALIGNED(4) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y; };
-   __extension__ struct{ cl_short  s0, s1; };
-   __extension__ struct{ cl_short  lo, hi; };
-#endif
-#if defined( __CL_SHORT2__)
-    __cl_short2     v2;
-#endif
-}cl_short2;
-
-typedef union
-{
-    cl_short  CL_ALIGNED(8) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3; };
-   __extension__ struct{ cl_short2 lo, hi; };
-#endif
-#if defined( __CL_SHORT2__)
-    __cl_short2     v2[2];
-#endif
-#if defined( __CL_SHORT4__)
-    __cl_short4     v4;
-#endif
-}cl_short4;
-
-/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
-typedef  cl_short4  cl_short3;
-
-typedef union
-{
-    cl_short   CL_ALIGNED(16) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_short4 lo, hi; };
-#endif
-#if defined( __CL_SHORT2__)
-    __cl_short2     v2[4];
-#endif
-#if defined( __CL_SHORT4__)
-    __cl_short4     v4[2];
-#endif
-#if defined( __CL_SHORT8__ )
-    __cl_short8     v8;
-#endif
-}cl_short8;
-
-typedef union
-{
-    cl_short  CL_ALIGNED(32) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_short8 lo, hi; };
-#endif
-#if defined( __CL_SHORT2__)
-    __cl_short2     v2[8];
-#endif
-#if defined( __CL_SHORT4__)
-    __cl_short4     v4[4];
-#endif
-#if defined( __CL_SHORT8__ )
-    __cl_short8     v8[2];
-#endif
-#if defined( __CL_SHORT16__ )
-    __cl_short16    v16;
-#endif
-}cl_short16;
-
-
-/* ---- cl_ushortn ---- */
-typedef union
-{
-    cl_ushort  CL_ALIGNED(4) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y; };
-   __extension__ struct{ cl_ushort  s0, s1; };
-   __extension__ struct{ cl_ushort  lo, hi; };
-#endif
-#if defined( __CL_USHORT2__)
-    __cl_ushort2     v2;
-#endif
-}cl_ushort2;
-
-typedef union
-{
-    cl_ushort  CL_ALIGNED(8) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3; };
-   __extension__ struct{ cl_ushort2 lo, hi; };
-#endif
-#if defined( __CL_USHORT2__)
-    __cl_ushort2     v2[2];
-#endif
-#if defined( __CL_USHORT4__)
-    __cl_ushort4     v4;
-#endif
-}cl_ushort4;
-
-/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
-typedef  cl_ushort4  cl_ushort3;
-
-typedef union
-{
-    cl_ushort   CL_ALIGNED(16) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_ushort4 lo, hi; };
-#endif
-#if defined( __CL_USHORT2__)
-    __cl_ushort2     v2[4];
-#endif
-#if defined( __CL_USHORT4__)
-    __cl_ushort4     v4[2];
-#endif
-#if defined( __CL_USHORT8__ )
-    __cl_ushort8     v8;
-#endif
-}cl_ushort8;
-
-typedef union
-{
-    cl_ushort  CL_ALIGNED(32) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_ushort8 lo, hi; };
-#endif
-#if defined( __CL_USHORT2__)
-    __cl_ushort2     v2[8];
-#endif
-#if defined( __CL_USHORT4__)
-    __cl_ushort4     v4[4];
-#endif
-#if defined( __CL_USHORT8__ )
-    __cl_ushort8     v8[2];
-#endif
-#if defined( __CL_USHORT16__ )
-    __cl_ushort16    v16;
-#endif
-}cl_ushort16;
-
-/* ---- cl_intn ---- */
-typedef union
-{
-    cl_int  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y; };
-   __extension__ struct{ cl_int  s0, s1; };
-   __extension__ struct{ cl_int  lo, hi; };
-#endif
-#if defined( __CL_INT2__)
-    __cl_int2     v2;
-#endif
-}cl_int2;
-
-typedef union
-{
-    cl_int  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3; };
-   __extension__ struct{ cl_int2 lo, hi; };
-#endif
-#if defined( __CL_INT2__)
-    __cl_int2     v2[2];
-#endif
-#if defined( __CL_INT4__)
-    __cl_int4     v4;
-#endif
-}cl_int4;
-
-/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
-typedef  cl_int4  cl_int3;
-
-typedef union
-{
-    cl_int   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_int4 lo, hi; };
-#endif
-#if defined( __CL_INT2__)
-    __cl_int2     v2[4];
-#endif
-#if defined( __CL_INT4__)
-    __cl_int4     v4[2];
-#endif
-#if defined( __CL_INT8__ )
-    __cl_int8     v8;
-#endif
-}cl_int8;
-
-typedef union
-{
-    cl_int  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_int8 lo, hi; };
-#endif
-#if defined( __CL_INT2__)
-    __cl_int2     v2[8];
-#endif
-#if defined( __CL_INT4__)
-    __cl_int4     v4[4];
-#endif
-#if defined( __CL_INT8__ )
-    __cl_int8     v8[2];
-#endif
-#if defined( __CL_INT16__ )
-    __cl_int16    v16;
-#endif
-}cl_int16;
-
-
-/* ---- cl_uintn ---- */
-typedef union
-{
-    cl_uint  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y; };
-   __extension__ struct{ cl_uint  s0, s1; };
-   __extension__ struct{ cl_uint  lo, hi; };
-#endif
-#if defined( __CL_UINT2__)
-    __cl_uint2     v2;
-#endif
-}cl_uint2;
-
-typedef union
-{
-    cl_uint  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3; };
-   __extension__ struct{ cl_uint2 lo, hi; };
-#endif
-#if defined( __CL_UINT2__)
-    __cl_uint2     v2[2];
-#endif
-#if defined( __CL_UINT4__)
-    __cl_uint4     v4;
-#endif
-}cl_uint4;
-
-/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
-typedef  cl_uint4  cl_uint3;
-
-typedef union
-{
-    cl_uint   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_uint4 lo, hi; };
-#endif
-#if defined( __CL_UINT2__)
-    __cl_uint2     v2[4];
-#endif
-#if defined( __CL_UINT4__)
-    __cl_uint4     v4[2];
-#endif
-#if defined( __CL_UINT8__ )
-    __cl_uint8     v8;
-#endif
-}cl_uint8;
-
-typedef union
-{
-    cl_uint  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_uint8 lo, hi; };
-#endif
-#if defined( __CL_UINT2__)
-    __cl_uint2     v2[8];
-#endif
-#if defined( __CL_UINT4__)
-    __cl_uint4     v4[4];
-#endif
-#if defined( __CL_UINT8__ )
-    __cl_uint8     v8[2];
-#endif
-#if defined( __CL_UINT16__ )
-    __cl_uint16    v16;
-#endif
-}cl_uint16;
-
-/* ---- cl_longn ---- */
-typedef union
-{
-    cl_long  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y; };
-   __extension__ struct{ cl_long  s0, s1; };
-   __extension__ struct{ cl_long  lo, hi; };
-#endif
-#if defined( __CL_LONG2__)
-    __cl_long2     v2;
-#endif
-}cl_long2;
-
-typedef union
-{
-    cl_long  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3; };
-   __extension__ struct{ cl_long2 lo, hi; };
-#endif
-#if defined( __CL_LONG2__)
-    __cl_long2     v2[2];
-#endif
-#if defined( __CL_LONG4__)
-    __cl_long4     v4;
-#endif
-}cl_long4;
-
-/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
-typedef  cl_long4  cl_long3;
-
-typedef union
-{
-    cl_long   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_long4 lo, hi; };
-#endif
-#if defined( __CL_LONG2__)
-    __cl_long2     v2[4];
-#endif
-#if defined( __CL_LONG4__)
-    __cl_long4     v4[2];
-#endif
-#if defined( __CL_LONG8__ )
-    __cl_long8     v8;
-#endif
-}cl_long8;
-
-typedef union
-{
-    cl_long  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_long8 lo, hi; };
-#endif
-#if defined( __CL_LONG2__)
-    __cl_long2     v2[8];
-#endif
-#if defined( __CL_LONG4__)
-    __cl_long4     v4[4];
-#endif
-#if defined( __CL_LONG8__ )
-    __cl_long8     v8[2];
-#endif
-#if defined( __CL_LONG16__ )
-    __cl_long16    v16;
-#endif
-}cl_long16;
-
-
-/* ---- cl_ulongn ---- */
-typedef union
-{
-    cl_ulong  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y; };
-   __extension__ struct{ cl_ulong  s0, s1; };
-   __extension__ struct{ cl_ulong  lo, hi; };
-#endif
-#if defined( __CL_ULONG2__)
-    __cl_ulong2     v2;
-#endif
-}cl_ulong2;
-
-typedef union
-{
-    cl_ulong  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3; };
-   __extension__ struct{ cl_ulong2 lo, hi; };
-#endif
-#if defined( __CL_ULONG2__)
-    __cl_ulong2     v2[2];
-#endif
-#if defined( __CL_ULONG4__)
-    __cl_ulong4     v4;
-#endif
-}cl_ulong4;
-
-/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
-typedef  cl_ulong4  cl_ulong3;
-
-typedef union
-{
-    cl_ulong   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_ulong4 lo, hi; };
-#endif
-#if defined( __CL_ULONG2__)
-    __cl_ulong2     v2[4];
-#endif
-#if defined( __CL_ULONG4__)
-    __cl_ulong4     v4[2];
-#endif
-#if defined( __CL_ULONG8__ )
-    __cl_ulong8     v8;
-#endif
-}cl_ulong8;
-
-typedef union
-{
-    cl_ulong  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_ulong8 lo, hi; };
-#endif
-#if defined( __CL_ULONG2__)
-    __cl_ulong2     v2[8];
-#endif
-#if defined( __CL_ULONG4__)
-    __cl_ulong4     v4[4];
-#endif
-#if defined( __CL_ULONG8__ )
-    __cl_ulong8     v8[2];
-#endif
-#if defined( __CL_ULONG16__ )
-    __cl_ulong16    v16;
-#endif
-}cl_ulong16;
-
-
-/* --- cl_floatn ---- */
-
-typedef union
-{
-    cl_float  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float  x, y; };
-   __extension__ struct{ cl_float  s0, s1; };
-   __extension__ struct{ cl_float  lo, hi; };
-#endif
-#if defined( __CL_FLOAT2__)
-    __cl_float2     v2;
-#endif
-}cl_float2;
-
-typedef union
-{
-    cl_float  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float   x, y, z, w; };
-   __extension__ struct{ cl_float   s0, s1, s2, s3; };
-   __extension__ struct{ cl_float2  lo, hi; };
-#endif
-#if defined( __CL_FLOAT2__)
-    __cl_float2     v2[2];
-#endif
-#if defined( __CL_FLOAT4__)
-    __cl_float4     v4;
-#endif
-}cl_float4;
-
-/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
-typedef  cl_float4  cl_float3;
-
-typedef union
-{
-    cl_float   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float   x, y, z, w; };
-   __extension__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_float4  lo, hi; };
-#endif
-#if defined( __CL_FLOAT2__)
-    __cl_float2     v2[4];
-#endif
-#if defined( __CL_FLOAT4__)
-    __cl_float4     v4[2];
-#endif
-#if defined( __CL_FLOAT8__ )
-    __cl_float8     v8;
-#endif
-}cl_float8;
-
-typedef union
-{
-    cl_float  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_float8 lo, hi; };
-#endif
-#if defined( __CL_FLOAT2__)
-    __cl_float2     v2[8];
-#endif
-#if defined( __CL_FLOAT4__)
-    __cl_float4     v4[4];
-#endif
-#if defined( __CL_FLOAT8__ )
-    __cl_float8     v8[2];
-#endif
-#if defined( __CL_FLOAT16__ )
-    __cl_float16    v16;
-#endif
-}cl_float16;
-
-/* --- cl_doublen ---- */
-
-typedef union
-{
-    cl_double  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y; };
-   __extension__ struct{ cl_double s0, s1; };
-   __extension__ struct{ cl_double lo, hi; };
-#endif
-#if defined( __CL_DOUBLE2__)
-    __cl_double2     v2;
-#endif
-}cl_double2;
-
-typedef union
-{
-    cl_double  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3; };
-   __extension__ struct{ cl_double2 lo, hi; };
-#endif
-#if defined( __CL_DOUBLE2__)
-    __cl_double2     v2[2];
-#endif
-#if defined( __CL_DOUBLE4__)
-    __cl_double4     v4;
-#endif
-}cl_double4;
-
-/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
-typedef  cl_double4  cl_double3;
-
-typedef union
-{
-    cl_double   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_double4 lo, hi; };
-#endif
-#if defined( __CL_DOUBLE2__)
-    __cl_double2     v2[4];
-#endif
-#if defined( __CL_DOUBLE4__)
-    __cl_double4     v4[2];
-#endif
-#if defined( __CL_DOUBLE8__ )
-    __cl_double8     v8;
-#endif
-}cl_double8;
-
-typedef union
-{
-    cl_double  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_double8 lo, hi; };
-#endif
-#if defined( __CL_DOUBLE2__)
-    __cl_double2     v2[8];
-#endif
-#if defined( __CL_DOUBLE4__)
-    __cl_double4     v4[4];
-#endif
-#if defined( __CL_DOUBLE8__ )
-    __cl_double8     v8[2];
-#endif
-#if defined( __CL_DOUBLE16__ )
-    __cl_double16    v16;
-#endif
-}cl_double16;
-
-/* Macro to facilitate debugging
- * Usage:
- *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
- *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
- *   Each line thereafter of OpenCL C source must end with: \n\
- *   The last line ends in ";
- *
- *   Example:
- *
- *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
- *   kernel void foo( int a, float * b )             \n\
- *   {                                               \n\
- *      // my comment                                \n\
- *      *b[ get_global_id(0)] = a;                   \n\
- *   }                                               \n\
- *   ";
- *
- * This should correctly set up the line, (column) and file information for your source
- * string so you can do source level debugging.
- */
-#define  __CL_STRINGIFY( _x )               # _x
-#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
-#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  /* __CL_PLATFORM_H  */
diff --git a/android/src/main/libenc/jni/libx264/extras/getopt.c b/android/src/main/libenc/jni/libx264/extras/getopt.c
deleted file mode 100755
index 434efe7..0000000
--- a/android/src/main/libenc/jni/libx264/extras/getopt.c
+++ /dev/null
@@ -1,1065 +0,0 @@
-/* Getopt for GNU.
-   NOTE: getopt is now part of the C library, so if you don't know what
-   "Keep this file name-space clean" means, talk to drepper@gnu.org
-   before changing it!
-   Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001
-   	Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02111-1307 USA.  */
-
-/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
-   Ditto for AIX 3.2 and <stdlib.h>.  */
-#ifndef _NO_PROTO
-# define _NO_PROTO
-#endif
-
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#if !defined __STDC__ || !__STDC__
-/* This is a separate conditional since some stdc systems
-   reject `defined (const)'.  */
-# ifndef const
-#  define const
-# endif
-#endif
-
-#include <stdio.h>
-
-/* Comment out all this code if we are using the GNU C Library, and are not
-   actually compiling the library itself.  This code is part of the GNU C
-   Library, but also included in many other GNU distributions.  Compiling
-   and linking in this code is a waste when using the GNU C library
-   (especially if it is a shared library).  Rather than having every GNU
-   program understand `configure --with-gnu-libc' and omit the object files,
-   it is simpler to just do this in the source for each such file.  */
-
-#define GETOPT_INTERFACE_VERSION 2
-#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2
-# include <gnu-versions.h>
-# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION
-#  define ELIDE_CODE
-# endif
-#endif
-
-#ifndef ELIDE_CODE
-
-
-/* This needs to come after some library #include
-   to get __GNU_LIBRARY__ defined.  */
-#ifdef	__GNU_LIBRARY__
-/* Don't include stdlib.h for non-GNU C libraries because some of them
-   contain conflicting prototypes for getopt.  */
-# include <stdlib.h>
-# include <unistd.h>
-#endif	/* GNU C library.  */
-
-#ifdef VMS
-# include <unixlib.h>
-# if HAVE_STRING_H - 0
-#  include <string.h>
-# endif
-#endif
-
-#ifndef _
-/* This is for other GNU distributions with internationalized messages.  */
-# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
-#  include <libintl.h>
-#  ifndef _
-#   define _(msgid)	gettext (msgid)
-#  endif
-# else
-#  define _(msgid)	(msgid)
-# endif
-#endif
-
-/* This version of `getopt' appears to the caller like standard Unix `getopt'
-   but it behaves differently for the user, since it allows the user
-   to intersperse the options with the other arguments.
-
-   As `getopt' works, it permutes the elements of ARGV so that,
-   when it is done, all the options precede everything else.  Thus
-   all application programs are extended to handle flexible argument order.
-
-   Setting the environment variable POSIXLY_CORRECT disables permutation.
-   Then the behavior is completely standard.
-
-   GNU application programs can use a third alternative mode in which
-   they can distinguish the relative order of options and other arguments.  */
-
-#include "getopt.h"
-
-/* For communication from `getopt' to the caller.
-   When `getopt' finds an option that takes an argument,
-   the argument value is returned here.
-   Also, when `ordering' is RETURN_IN_ORDER,
-   each non-option ARGV-element is returned here.  */
-
-char *optarg;
-
-/* Index in ARGV of the next element to be scanned.
-   This is used for communication to and from the caller
-   and for communication between successive calls to `getopt'.
-
-   On entry to `getopt', zero means this is the first call; initialize.
-
-   When `getopt' returns -1, this is the index of the first of the
-   non-option elements that the caller should itself scan.
-
-   Otherwise, `optind' communicates from one call to the next
-   how much of ARGV has been scanned so far.  */
-
-/* 1003.2 says this must be 1 before any call.  */
-int optind = 1;
-
-/* Formerly, initialization of getopt depended on optind==0, which
-   causes problems with re-calling getopt as programs generally don't
-   know that. */
-
-int __getopt_initialized;
-
-/* The next char to be scanned in the option-element
-   in which the last option character we returned was found.
-   This allows us to pick up the scan where we left off.
-
-   If this is zero, or a null string, it means resume the scan
-   by advancing to the next ARGV-element.  */
-
-static char *nextchar;
-
-/* Callers store zero here to inhibit the error message
-   for unrecognized options.  */
-
-int opterr = 1;
-
-/* Set to an option character which was unrecognized.
-   This must be initialized on some systems to avoid linking in the
-   system's own getopt implementation.  */
-
-int optopt = '?';
-
-/* Describe how to deal with options that follow non-option ARGV-elements.
-
-   If the caller did not specify anything,
-   the default is REQUIRE_ORDER if the environment variable
-   POSIXLY_CORRECT is defined, PERMUTE otherwise.
-
-   REQUIRE_ORDER means don't recognize them as options;
-   stop option processing when the first non-option is seen.
-   This is what Unix does.
-   This mode of operation is selected by either setting the environment
-   variable POSIXLY_CORRECT, or using `+' as the first character
-   of the list of option characters.
-
-   PERMUTE is the default.  We permute the contents of ARGV as we scan,
-   so that eventually all the non-options are at the end.  This allows options
-   to be given in any order, even with programs that were not written to
-   expect this.
-
-   RETURN_IN_ORDER is an option available to programs that were written
-   to expect options and other ARGV-elements in any order and that care about
-   the ordering of the two.  We describe each non-option ARGV-element
-   as if it were the argument of an option with character code 1.
-   Using `-' as the first character of the list of option characters
-   selects this mode of operation.
-
-   The special argument `--' forces an end of option-scanning regardless
-   of the value of `ordering'.  In the case of RETURN_IN_ORDER, only
-   `--' can cause `getopt' to return -1 with `optind' != ARGC.  */
-
-static enum
-{
-  REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
-} ordering;
-
-/* Value of POSIXLY_CORRECT environment variable.  */
-static char *posixly_correct;
-
-#ifdef	__GNU_LIBRARY__
-/* We want to avoid inclusion of string.h with non-GNU libraries
-   because there are many ways it can cause trouble.
-   On some systems, it contains special magic macros that don't work
-   in GCC.  */
-# include <string.h>
-# define my_index	strchr
-#else
-
-# if HAVE_STRING_H
-#  include <string.h>
-# else
-#  include <strings.h>
-# endif
-
-/* Avoid depending on library functions or files
-   whose names are inconsistent.  */
-
-#ifndef getenv
-extern char *getenv ();
-#endif
-
-static char *
-my_index (str, chr)
-     const char *str;
-     int chr;
-{
-  while (*str)
-    {
-      if (*str == chr)
-	return (char *) str;
-      str++;
-    }
-  return 0;
-}
-
-/* If using GCC, we can safely declare strlen this way.
-   If not using GCC, it is ok not to declare it.  */
-#ifdef __GNUC__
-/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
-   That was relevant to code that was here before.  */
-# if (!defined __STDC__ || !__STDC__) && !defined strlen
-/* gcc with -traditional declares the built-in strlen to return int,
-   and has done so at least since version 2.4.5. -- rms.  */
-extern int strlen (const char *);
-# endif /* not __STDC__ */
-#endif /* __GNUC__ */
-
-#endif /* not __GNU_LIBRARY__ */
-
-/* Handle permutation of arguments.  */
-
-/* Describe the part of ARGV that contains non-options that have
-   been skipped.  `first_nonopt' is the index in ARGV of the first of them;
-   `last_nonopt' is the index after the last of them.  */
-
-static int first_nonopt;
-static int last_nonopt;
-
-#ifdef _LIBC
-/* Stored original parameters.
-   XXX This is no good solution.  We should rather copy the args so
-   that we can compare them later.  But we must not use malloc(3).  */
-extern int __libc_argc;
-extern char **__libc_argv;
-
-/* Bash 2.0 gives us an environment variable containing flags
-   indicating ARGV elements that should not be considered arguments.  */
-
-# ifdef USE_NONOPTION_FLAGS
-/* Defined in getopt_init.c  */
-extern char *__getopt_nonoption_flags;
-
-static int nonoption_flags_max_len;
-static int nonoption_flags_len;
-# endif
-
-# ifdef USE_NONOPTION_FLAGS
-#  define SWAP_FLAGS(ch1, ch2) \
-  if (nonoption_flags_len > 0)						      \
-    {									      \
-      char __tmp = __getopt_nonoption_flags[ch1];			      \
-      __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2];	      \
-      __getopt_nonoption_flags[ch2] = __tmp;				      \
-    }
-# else
-#  define SWAP_FLAGS(ch1, ch2)
-# endif
-#else	/* !_LIBC */
-# define SWAP_FLAGS(ch1, ch2)
-#endif	/* _LIBC */
-
-/* Exchange two adjacent subsequences of ARGV.
-   One subsequence is elements [first_nonopt,last_nonopt)
-   which contains all the non-options that have been skipped so far.
-   The other is elements [last_nonopt,optind), which contains all
-   the options processed since those non-options were skipped.
-
-   `first_nonopt' and `last_nonopt' are relocated so that they describe
-   the new indices of the non-options in ARGV after they are moved.  */
-
-#if defined __STDC__ && __STDC__
-static void exchange (char **);
-#endif
-
-static void
-exchange (argv)
-     char **argv;
-{
-  int bottom = first_nonopt;
-  int middle = last_nonopt;
-  int top = optind;
-  char *tem;
-
-  /* Exchange the shorter segment with the far end of the longer segment.
-     That puts the shorter segment into the right place.
-     It leaves the longer segment in the right place overall,
-     but it consists of two parts that need to be swapped next.  */
-
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-  /* First make sure the handling of the `__getopt_nonoption_flags'
-     string can work normally.  Our top argument must be in the range
-     of the string.  */
-  if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len)
-    {
-      /* We must extend the array.  The user plays games with us and
-	 presents new arguments.  */
-      char *new_str = malloc (top + 1);
-      if (new_str == NULL)
-	nonoption_flags_len = nonoption_flags_max_len = 0;
-      else
-	{
-	  memset (__mempcpy (new_str, __getopt_nonoption_flags,
-			     nonoption_flags_max_len),
-		  '\0', top + 1 - nonoption_flags_max_len);
-	  nonoption_flags_max_len = top + 1;
-	  __getopt_nonoption_flags = new_str;
-	}
-    }
-#endif
-
-  while (top > middle && middle > bottom)
-    {
-      if (top - middle > middle - bottom)
-	{
-	  /* Bottom segment is the short one.  */
-	  int len = middle - bottom;
-	  register int i;
-
-	  /* Swap it with the top part of the top segment.  */
-	  for (i = 0; i < len; i++)
-	    {
-	      tem = argv[bottom + i];
-	      argv[bottom + i] = argv[top - (middle - bottom) + i];
-	      argv[top - (middle - bottom) + i] = tem;
-	      SWAP_FLAGS (bottom + i, top - (middle - bottom) + i);
-	    }
-	  /* Exclude the moved bottom segment from further swapping.  */
-	  top -= len;
-	}
-      else
-	{
-	  /* Top segment is the short one.  */
-	  int len = top - middle;
-	  register int i;
-
-	  /* Swap it with the bottom part of the bottom segment.  */
-	  for (i = 0; i < len; i++)
-	    {
-	      tem = argv[bottom + i];
-	      argv[bottom + i] = argv[middle + i];
-	      argv[middle + i] = tem;
-	      SWAP_FLAGS (bottom + i, middle + i);
-	    }
-	  /* Exclude the moved top segment from further swapping.  */
-	  bottom += len;
-	}
-    }
-
-  /* Update records for the slots the non-options now occupy.  */
-
-  first_nonopt += (optind - last_nonopt);
-  last_nonopt = optind;
-}
-
-/* Initialize the internal data when the first call is made.  */
-
-#if defined __STDC__ && __STDC__
-static const char *_getopt_initialize (int, char *const *, const char *);
-#endif
-static const char *
-_getopt_initialize (argc, argv, optstring)
-     int argc;
-     char *const *argv;
-     const char *optstring;
-{
-  /* Start processing options with ARGV-element 1 (since ARGV-element 0
-     is the program name); the sequence of previously skipped
-     non-option ARGV-elements is empty.  */
-
-  first_nonopt = last_nonopt = optind;
-
-  nextchar = NULL;
-
-  posixly_correct = getenv ("POSIXLY_CORRECT");
-
-  /* Determine how to handle the ordering of options and nonoptions.  */
-
-  if (optstring[0] == '-')
-    {
-      ordering = RETURN_IN_ORDER;
-      ++optstring;
-    }
-  else if (optstring[0] == '+')
-    {
-      ordering = REQUIRE_ORDER;
-      ++optstring;
-    }
-  else if (posixly_correct != NULL)
-    ordering = REQUIRE_ORDER;
-  else
-    ordering = PERMUTE;
-
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-  if (posixly_correct == NULL
-      && argc == __libc_argc && argv == __libc_argv)
-    {
-      if (nonoption_flags_max_len == 0)
-	{
-	  if (__getopt_nonoption_flags == NULL
-	      || __getopt_nonoption_flags[0] == '\0')
-	    nonoption_flags_max_len = -1;
-	  else
-	    {
-	      const char *orig_str = __getopt_nonoption_flags;
-	      int len = nonoption_flags_max_len = strlen (orig_str);
-	      if (nonoption_flags_max_len < argc)
-		nonoption_flags_max_len = argc;
-	      __getopt_nonoption_flags =
-		(char *) malloc (nonoption_flags_max_len);
-	      if (__getopt_nonoption_flags == NULL)
-		nonoption_flags_max_len = -1;
-	      else
-		memset (__mempcpy (__getopt_nonoption_flags, orig_str, len),
-			'\0', nonoption_flags_max_len - len);
-	    }
-	}
-      nonoption_flags_len = nonoption_flags_max_len;
-    }
-  else
-    nonoption_flags_len = 0;
-#endif
-
-  return optstring;
-}
-
-/* Scan elements of ARGV (whose length is ARGC) for option characters
-   given in OPTSTRING.
-
-   If an element of ARGV starts with '-', and is not exactly "-" or "--",
-   then it is an option element.  The characters of this element
-   (aside from the initial '-') are option characters.  If `getopt'
-   is called repeatedly, it returns successively each of the option characters
-   from each of the option elements.
-
-   If `getopt' finds another option character, it returns that character,
-   updating `optind' and `nextchar' so that the next call to `getopt' can
-   resume the scan with the following option character or ARGV-element.
-
-   If there are no more option characters, `getopt' returns -1.
-   Then `optind' is the index in ARGV of the first ARGV-element
-   that is not an option.  (The ARGV-elements have been permuted
-   so that those that are not options now come last.)
-
-   OPTSTRING is a string containing the legitimate option characters.
-   If an option character is seen that is not listed in OPTSTRING,
-   return '?' after printing an error message.  If you set `opterr' to
-   zero, the error message is suppressed but we still return '?'.
-
-   If a char in OPTSTRING is followed by a colon, that means it wants an arg,
-   so the following text in the same ARGV-element, or the text of the following
-   ARGV-element, is returned in `optarg'.  Two colons mean an option that
-   wants an optional arg; if there is text in the current ARGV-element,
-   it is returned in `optarg', otherwise `optarg' is set to zero.
-
-   If OPTSTRING starts with `-' or `+', it requests different methods of
-   handling the non-option ARGV-elements.
-   See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
-
-   Long-named options begin with `--' instead of `-'.
-   Their names may be abbreviated as long as the abbreviation is unique
-   or is an exact match for some defined option.  If they have an
-   argument, it follows the option name in the same ARGV-element, separated
-   from the option name by a `=', or else the in next ARGV-element.
-   When `getopt' finds a long-named option, it returns 0 if that option's
-   `flag' field is nonzero, the value of the option's `val' field
-   if the `flag' field is zero.
-
-   The elements of ARGV aren't really const, because we permute them.
-   But we pretend they're const in the prototype to be compatible
-   with other systems.
-
-   LONGOPTS is a vector of `struct option' terminated by an
-   element containing a name which is zero.
-
-   LONGIND returns the index in LONGOPT of the long-named option found.
-   It is only valid when a long-named option has been found by the most
-   recent call.
-
-   If LONG_ONLY is nonzero, '-' as well as '--' can introduce
-   long-named options.  */
-
-int
-_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
-     int argc;
-     char *const *argv;
-     const char *optstring;
-     const struct option *longopts;
-     int *longind;
-     int long_only;
-{
-  int print_errors = opterr;
-  if (optstring[0] == ':')
-    print_errors = 0;
-
-  if (argc < 1)
-    return -1;
-
-  optarg = NULL;
-
-  if (optind == 0 || !__getopt_initialized)
-    {
-      if (optind == 0)
-	optind = 1;	/* Don't scan ARGV[0], the program name.  */
-      optstring = _getopt_initialize (argc, argv, optstring);
-      __getopt_initialized = 1;
-    }
-
-  /* Test whether ARGV[optind] points to a non-option argument.
-     Either it does not have option syntax, or there is an environment flag
-     from the shell indicating it is not an option.  The later information
-     is only used when the used in the GNU libc.  */
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0'	      \
-		      || (optind < nonoption_flags_len			      \
-			  && __getopt_nonoption_flags[optind] == '1'))
-#else
-# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0')
-#endif
-
-  if (nextchar == NULL || *nextchar == '\0')
-    {
-      /* Advance to the next ARGV-element.  */
-
-      /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been
-	 moved back by the user (who may also have changed the arguments).  */
-      if (last_nonopt > optind)
-	last_nonopt = optind;
-      if (first_nonopt > optind)
-	first_nonopt = optind;
-
-      if (ordering == PERMUTE)
-	{
-	  /* If we have just processed some options following some non-options,
-	     exchange them so that the options come first.  */
-
-	  if (first_nonopt != last_nonopt && last_nonopt != optind)
-	    exchange ((char **) argv);
-	  else if (last_nonopt != optind)
-	    first_nonopt = optind;
-
-	  /* Skip any additional non-options
-	     and extend the range of non-options previously skipped.  */
-
-	  while (optind < argc && NONOPTION_P)
-	    optind++;
-	  last_nonopt = optind;
-	}
-
-      /* The special ARGV-element `--' means premature end of options.
-	 Skip it like a null option,
-	 then exchange with previous non-options as if it were an option,
-	 then skip everything else like a non-option.  */
-
-      if (optind != argc && !strcmp (argv[optind], "--"))
-	{
-	  optind++;
-
-	  if (first_nonopt != last_nonopt && last_nonopt != optind)
-	    exchange ((char **) argv);
-	  else if (first_nonopt == last_nonopt)
-	    first_nonopt = optind;
-	  last_nonopt = argc;
-
-	  optind = argc;
-	}
-
-      /* If we have done all the ARGV-elements, stop the scan
-	 and back over any non-options that we skipped and permuted.  */
-
-      if (optind == argc)
-	{
-	  /* Set the next-arg-index to point at the non-options
-	     that we previously skipped, so the caller will digest them.  */
-	  if (first_nonopt != last_nonopt)
-	    optind = first_nonopt;
-	  return -1;
-	}
-
-      /* If we have come to a non-option and did not permute it,
-	 either stop the scan or describe it to the caller and pass it by.  */
-
-      if (NONOPTION_P)
-	{
-	  if (ordering == REQUIRE_ORDER)
-	    return -1;
-	  optarg = argv[optind++];
-	  return 1;
-	}
-
-      /* We have found another option-ARGV-element.
-	 Skip the initial punctuation.  */
-
-      nextchar = (argv[optind] + 1
-		  + (longopts != NULL && argv[optind][1] == '-'));
-    }
-
-  /* Decode the current option-ARGV-element.  */
-
-  /* Check whether the ARGV-element is a long option.
-
-     If long_only and the ARGV-element has the form "-f", where f is
-     a valid short option, don't consider it an abbreviated form of
-     a long option that starts with f.  Otherwise there would be no
-     way to give the -f short option.
-
-     On the other hand, if there's a long option "fubar" and
-     the ARGV-element is "-fu", do consider that an abbreviation of
-     the long option, just like "--fu", and not "-f" with arg "u".
-
-     This distinction seems to be the most useful approach.  */
-
-  if (longopts != NULL
-      && (argv[optind][1] == '-'
-	  || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
-    {
-      char *nameend;
-      const struct option *p;
-      const struct option *pfound = NULL;
-      int exact = 0;
-      int ambig = 0;
-      int indfound = -1;
-      int option_index;
-
-      for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
-	/* Do nothing.  */ ;
-
-      /* Test all long options for either exact match
-	 or abbreviated matches.  */
-      for (p = longopts, option_index = 0; p->name; p++, option_index++)
-	if (!strncmp (p->name, nextchar, nameend - nextchar))
-	  {
-	    if ((unsigned int) (nameend - nextchar)
-		== (unsigned int) strlen (p->name))
-	      {
-		/* Exact match found.  */
-		pfound = p;
-		indfound = option_index;
-		exact = 1;
-		break;
-	      }
-	    else if (pfound == NULL)
-	      {
-		/* First nonexact match found.  */
-		pfound = p;
-		indfound = option_index;
-	      }
-	    else if (long_only
-		     || pfound->has_arg != p->has_arg
-		     || pfound->flag != p->flag
-		     || pfound->val != p->val)
-	      /* Second or later nonexact match found.  */
-	      ambig = 1;
-	  }
-
-      if (ambig && !exact)
-	{
-	  if (print_errors)
-	    fprintf (stderr, _("%s: option `%s' is ambiguous\n"),
-		     argv[0], argv[optind]);
-	  nextchar += strlen (nextchar);
-	  optind++;
-	  optopt = 0;
-	  return '?';
-	}
-
-      if (pfound != NULL)
-	{
-	  option_index = indfound;
-	  optind++;
-	  if (*nameend)
-	    {
-	      /* Don't test has_arg with >, because some C compilers don't
-		 allow it to be used on enums.  */
-	      if (pfound->has_arg)
-		optarg = nameend + 1;
-	      else
-		{
-		  if (print_errors)
-		    {
-		      if (argv[optind - 1][1] == '-')
-			/* --option */
-			fprintf (stderr,
-				 _("%s: option `--%s' doesn't allow an argument\n"),
-				 argv[0], pfound->name);
-		      else
-			/* +option or -option */
-			fprintf (stderr,
-				 _("%s: option `%c%s' doesn't allow an argument\n"),
-				 argv[0], argv[optind - 1][0], pfound->name);
-		    }
-
-		  nextchar += strlen (nextchar);
-
-		  optopt = pfound->val;
-		  return '?';
-		}
-	    }
-	  else if (pfound->has_arg == 1)
-	    {
-	      if (optind < argc)
-		optarg = argv[optind++];
-	      else
-		{
-		  if (print_errors)
-		    fprintf (stderr,
-			   _("%s: option `%s' requires an argument\n"),
-			   argv[0], argv[optind - 1]);
-		  nextchar += strlen (nextchar);
-		  optopt = pfound->val;
-		  return optstring[0] == ':' ? ':' : '?';
-		}
-	    }
-	  nextchar += strlen (nextchar);
-	  if (longind != NULL)
-	    *longind = option_index;
-	  if (pfound->flag)
-	    {
-	      *(pfound->flag) = pfound->val;
-	      return 0;
-	    }
-	  return pfound->val;
-	}
-
-      /* Can't find it as a long option.  If this is not getopt_long_only,
-	 or the option starts with '--' or is not a valid short
-	 option, then it's an error.
-	 Otherwise interpret it as a short option.  */
-      if (!long_only || argv[optind][1] == '-'
-	  || my_index (optstring, *nextchar) == NULL)
-	{
-	  if (print_errors)
-	    {
-	      if (argv[optind][1] == '-')
-		/* --option */
-		fprintf (stderr, _("%s: unrecognized option `--%s'\n"),
-			 argv[0], nextchar);
-	      else
-		/* +option or -option */
-		fprintf (stderr, _("%s: unrecognized option `%c%s'\n"),
-			 argv[0], argv[optind][0], nextchar);
-	    }
-	  nextchar = (char *) "";
-	  optind++;
-	  optopt = 0;
-	  return '?';
-	}
-    }
-
-  /* Look at and handle the next short option-character.  */
-
-  {
-    char c = *nextchar++;
-    char *temp = my_index (optstring, c);
-
-    /* Increment `optind' when we start to process its last character.  */
-    if (*nextchar == '\0')
-      ++optind;
-
-    if (temp == NULL || c == ':')
-      {
-	if (print_errors)
-	  {
-	    if (posixly_correct)
-	      /* 1003.2 specifies the format of this message.  */
-	      fprintf (stderr, _("%s: illegal option -- %c\n"),
-		       argv[0], c);
-	    else
-	      fprintf (stderr, _("%s: invalid option -- %c\n"),
-		       argv[0], c);
-	  }
-	optopt = c;
-	return '?';
-      }
-    /* Convenience. Treat POSIX -W foo same as long option --foo */
-    if (temp[0] == 'W' && temp[1] == ';')
-      {
-	char *nameend;
-	const struct option *p;
-	const struct option *pfound = NULL;
-	int exact = 0;
-	int ambig = 0;
-	int indfound = 0;
-	int option_index;
-
-	/* This is an option that requires an argument.  */
-	if (*nextchar != '\0')
-	  {
-	    optarg = nextchar;
-	    /* If we end this ARGV-element by taking the rest as an arg,
-	       we must advance to the next element now.  */
-	    optind++;
-	  }
-	else if (optind == argc)
-	  {
-	    if (print_errors)
-	      {
-		/* 1003.2 specifies the format of this message.  */
-		fprintf (stderr, _("%s: option requires an argument -- %c\n"),
-			 argv[0], c);
-	      }
-	    optopt = c;
-	    if (optstring[0] == ':')
-	      c = ':';
-	    else
-	      c = '?';
-	    return c;
-	  }
-	else
-	  /* We already incremented `optind' once;
-	     increment it again when taking next ARGV-elt as argument.  */
-	  optarg = argv[optind++];
-
-	/* optarg is now the argument, see if it's in the
-	   table of longopts.  */
-
-	for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++)
-	  /* Do nothing.  */ ;
-
-	/* Test all long options for either exact match
-	   or abbreviated matches.  */
-	for (p = longopts, option_index = 0; p->name; p++, option_index++)
-	  if (!strncmp (p->name, nextchar, nameend - nextchar))
-	    {
-	      if ((unsigned int) (nameend - nextchar) == strlen (p->name))
-		{
-		  /* Exact match found.  */
-		  pfound = p;
-		  indfound = option_index;
-		  exact = 1;
-		  break;
-		}
-	      else if (pfound == NULL)
-		{
-		  /* First nonexact match found.  */
-		  pfound = p;
-		  indfound = option_index;
-		}
-	      else
-		/* Second or later nonexact match found.  */
-		ambig = 1;
-	    }
-	if (ambig && !exact)
-	  {
-	    if (print_errors)
-	      fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"),
-		       argv[0], argv[optind]);
-	    nextchar += strlen (nextchar);
-	    optind++;
-	    return '?';
-	  }
-	if (pfound != NULL)
-	  {
-	    option_index = indfound;
-	    if (*nameend)
-	      {
-		/* Don't test has_arg with >, because some C compilers don't
-		   allow it to be used on enums.  */
-		if (pfound->has_arg)
-		  optarg = nameend + 1;
-		else
-		  {
-		    if (print_errors)
-		      fprintf (stderr, _("\
-%s: option `-W %s' doesn't allow an argument\n"),
-			       argv[0], pfound->name);
-
-		    nextchar += strlen (nextchar);
-		    return '?';
-		  }
-	      }
-	    else if (pfound->has_arg == 1)
-	      {
-		if (optind < argc)
-		  optarg = argv[optind++];
-		else
-		  {
-		    if (print_errors)
-		      fprintf (stderr,
-			       _("%s: option `%s' requires an argument\n"),
-			       argv[0], argv[optind - 1]);
-		    nextchar += strlen (nextchar);
-		    return optstring[0] == ':' ? ':' : '?';
-		  }
-	      }
-	    nextchar += strlen (nextchar);
-	    if (longind != NULL)
-	      *longind = option_index;
-	    if (pfound->flag)
-	      {
-		*(pfound->flag) = pfound->val;
-		return 0;
-	      }
-	    return pfound->val;
-	  }
-	  nextchar = NULL;
-	  return 'W';	/* Let the application handle it.   */
-      }
-    if (temp[1] == ':')
-      {
-	if (temp[2] == ':')
-	  {
-	    /* This is an option that accepts an argument optionally.  */
-	    if (*nextchar != '\0')
-	      {
-		optarg = nextchar;
-		optind++;
-	      }
-	    else
-	      optarg = NULL;
-	    nextchar = NULL;
-	  }
-	else
-	  {
-	    /* This is an option that requires an argument.  */
-	    if (*nextchar != '\0')
-	      {
-		optarg = nextchar;
-		/* If we end this ARGV-element by taking the rest as an arg,
-		   we must advance to the next element now.  */
-		optind++;
-	      }
-	    else if (optind == argc)
-	      {
-		if (print_errors)
-		  {
-		    /* 1003.2 specifies the format of this message.  */
-		    fprintf (stderr,
-			     _("%s: option requires an argument -- %c\n"),
-			     argv[0], c);
-		  }
-		optopt = c;
-		if (optstring[0] == ':')
-		  c = ':';
-		else
-		  c = '?';
-	      }
-	    else
-	      /* We already incremented `optind' once;
-		 increment it again when taking next ARGV-elt as argument.  */
-	      optarg = argv[optind++];
-	    nextchar = NULL;
-	  }
-      }
-    return c;
-  }
-}
-
-int
-getopt (argc, argv, optstring)
-     int argc;
-     char *const *argv;
-     const char *optstring;
-{
-  return _getopt_internal (argc, argv, optstring,
-			   (const struct option *) 0,
-			   (int *) 0,
-			   0);
-}
-
-int getopt_long (argc, argv, optstring, long_options, opt_index)
-     int argc;
-     char *const *argv;
-     const char *optstring;
-     const struct option *long_options;
-     int *opt_index;
-{
-  return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0);
-}
-
-#endif	/* Not ELIDE_CODE.  */
-
-#ifdef TEST
-
-/* Compile with -DTEST to make an executable for use in testing
-   the above definition of `getopt'.  */
-
-int
-main (argc, argv)
-     int argc;
-     char **argv;
-{
-  int c;
-  int digit_optind = 0;
-
-  while (1)
-    {
-      int this_option_optind = optind ? optind : 1;
-
-      c = getopt (argc, argv, "abc:d:0123456789");
-      if (c == -1)
-	break;
-
-      switch (c)
-	{
-	case '0':
-	case '1':
-	case '2':
-	case '3':
-	case '4':
-	case '5':
-	case '6':
-	case '7':
-	case '8':
-	case '9':
-	  if (digit_optind != 0 && digit_optind != this_option_optind)
-	    printf ("digits occur in two different argv-elements.\n");
-	  digit_optind = this_option_optind;
-	  printf ("option %c\n", c);
-	  break;
-
-	case 'a':
-	  printf ("option a\n");
-	  break;
-
-	case 'b':
-	  printf ("option b\n");
-	  break;
-
-	case 'c':
-	  printf ("option c with value `%s'\n", optarg);
-	  break;
-
-	case '?':
-	  break;
-
-	default:
-	  printf ("?? getopt returned character code 0%o ??\n", c);
-	}
-    }
-
-  if (optind < argc)
-    {
-      printf ("non-option ARGV-elements: ");
-      while (optind < argc)
-	printf ("%s ", argv[optind++]);
-      printf ("\n");
-    }
-
-  exit (0);
-}
-
-#endif /* TEST */
diff --git a/android/src/main/libenc/jni/libx264/extras/getopt.h b/android/src/main/libenc/jni/libx264/extras/getopt.h
deleted file mode 100755
index e2e40da..0000000
--- a/android/src/main/libenc/jni/libx264/extras/getopt.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Declarations for getopt.
-   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2, or (at your option)
-   any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software Foundation,
-   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.  */
-
-#ifndef _GETOPT_H
-
-#ifndef __need_getopt
-# define _GETOPT_H 1
-#endif
-
-/* If __GNU_LIBRARY__ is not already defined, either we are being used
-   standalone, or this is the first header included in the source file.
-   If we are being used with glibc, we need to include <features.h>, but
-   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
-   not defined, include <ctype.h>, which will pull in <features.h> for us
-   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
-   doesn't flood the namespace with stuff the way some other headers do.)  */
-#if !defined __GNU_LIBRARY__
-# include <ctype.h>
-#endif
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-/* For communication from `getopt' to the caller.
-   When `getopt' finds an option that takes an argument,
-   the argument value is returned here.
-   Also, when `ordering' is RETURN_IN_ORDER,
-   each non-option ARGV-element is returned here.  */
-
-extern char *optarg;
-
-/* Index in ARGV of the next element to be scanned.
-   This is used for communication to and from the caller
-   and for communication between successive calls to `getopt'.
-
-   On entry to `getopt', zero means this is the first call; initialize.
-
-   When `getopt' returns -1, this is the index of the first of the
-   non-option elements that the caller should itself scan.
-
-   Otherwise, `optind' communicates from one call to the next
-   how much of ARGV has been scanned so far.  */
-
-extern int optind;
-
-/* Callers store zero here to inhibit the error message `getopt' prints
-   for unrecognized options.  */
-
-extern int opterr;
-
-/* Set to an option character which was unrecognized.  */
-
-extern int optopt;
-
-#ifndef __need_getopt
-/* Describe the long-named options requested by the application.
-   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
-   of `struct option' terminated by an element containing a name which is
-   zero.
-
-   The field `has_arg' is:
-   no_argument		(or 0) if the option does not take an argument,
-   required_argument	(or 1) if the option requires an argument,
-   optional_argument 	(or 2) if the option takes an optional argument.
-
-   If the field `flag' is not NULL, it points to a variable that is set
-   to the value given in the field `val' when the option is found, but
-   left unchanged if the option is not found.
-
-   To have a long-named option do something other than set an `int' to
-   a compiled-in constant, such as set a value from `optarg', set the
-   option's `flag' field to zero and its `val' field to a nonzero
-   value (the equivalent single-letter option character, if there is
-   one).  For long options that have a zero `flag' field, `getopt'
-   returns the contents of the `val' field.  */
-
-struct option
-{
-# if (defined __STDC__ && __STDC__) || defined __cplusplus
-  const char *name;
-# else
-  char *name;
-# endif
-  /* has_arg can't be an enum because some compilers complain about
-     type mismatches in all the code that assumes it is an int.  */
-  int has_arg;
-  int *flag;
-  int val;
-};
-
-/* Names for the values of the `has_arg' field of `struct option'.  */
-
-# define no_argument		0
-# define required_argument	1
-# define optional_argument	2
-#endif	/* need getopt */
-
-
-/* Get definitions and prototypes for functions to process the
-   arguments in ARGV (ARGC of them, minus the program name) for
-   options given in OPTS.
-
-   Return the option character from OPTS just read.  Return -1 when
-   there are no more options.  For unrecognized options, or options
-   missing arguments, `optopt' is set to the option letter, and '?' is
-   returned.
-
-   The OPTS string is a list of characters which are recognized option
-   letters, optionally followed by colons, specifying that that letter
-   takes an argument, to be placed in `optarg'.
-
-   If a letter in OPTS is followed by two colons, its argument is
-   optional.  This behavior is specific to the GNU `getopt'.
-
-   The argument `--' causes premature termination of argument
-   scanning, explicitly telling `getopt' that there are no more
-   options.
-
-   If OPTS begins with `--', then non-option arguments are treated as
-   arguments to the option '\0'.  This behavior is specific to the GNU
-   `getopt'.  */
-
-#if (defined __STDC__ && __STDC__) || defined __cplusplus
-# ifdef __GNU_LIBRARY__
-/* Many other libraries have conflicting prototypes for getopt, with
-   differences in the consts, in stdlib.h.  To avoid compilation
-   errors, only prototype getopt for the GNU C library.  */
-extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
-# else /* not __GNU_LIBRARY__ */
-extern int getopt ();
-# endif /* __GNU_LIBRARY__ */
-
-# ifndef __need_getopt
-extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
-		        const struct option *__longopts, int *__longind);
-extern int getopt_long_only (int __argc, char *const *__argv,
-			     const char *__shortopts,
-		             const struct option *__longopts, int *__longind);
-
-/* Internal only.  Users should not call this directly.  */
-extern int _getopt_internal (int __argc, char *const *__argv,
-			     const char *__shortopts,
-		             const struct option *__longopts, int *__longind,
-			     int __long_only);
-# endif
-#else /* not __STDC__ */
-extern int getopt ();
-# ifndef __need_getopt
-extern int getopt_long ();
-extern int getopt_long_only ();
-
-extern int _getopt_internal ();
-# endif
-#endif /* __STDC__ */
-
-#ifdef	__cplusplus
-}
-#endif
-
-/* Make sure we later can get all the definitions and declarations.  */
-#undef __need_getopt
-
-#endif /* getopt.h */
diff --git a/android/src/main/libenc/jni/libx264/extras/intel_dispatcher.h b/android/src/main/libenc/jni/libx264/extras/intel_dispatcher.h
deleted file mode 100755
index 9f4ed58..0000000
--- a/android/src/main/libenc/jni/libx264/extras/intel_dispatcher.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*****************************************************************************
- * intel_dispatcher.h: intel compiler cpu dispatcher override
- *****************************************************************************
- * Copyright (C) 2014-2016 x264 project
- *
- * Authors: Anton Mitrofanov <BugMaster@narod.ru>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_INTEL_DISPATCHER_H
-#define X264_INTEL_DISPATCHER_H
-
-/* Feature flags using _FEATURE_* defines from immintrin.h */
-extern unsigned long long __intel_cpu_feature_indicator;
-extern unsigned long long __intel_cpu_feature_indicator_x;
-
-/* CPU vendor independent version of dispatcher */
-void __intel_cpu_features_init_x( void );
-
-static void x264_intel_dispatcher_override( void )
-{
-    if( __intel_cpu_feature_indicator & ~1ULL )
-        return;
-    __intel_cpu_feature_indicator = 0;
-    __intel_cpu_feature_indicator_x = 0;
-    __intel_cpu_features_init_x();
-    __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x;
-}
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/extras/inttypes.h b/android/src/main/libenc/jni/libx264/extras/inttypes.h
deleted file mode 100755
index b2e3fc6..0000000
--- a/android/src/main/libenc/jni/libx264/extras/inttypes.h
+++ /dev/null
@@ -1,285 +0,0 @@
-// ISO C9x  compliant inttypes.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
-//
-//  Copyright (c) 2006 Alexander Chemeris
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//   1. Redistributions of source code must retain the above copyright notice,
-//      this list of conditions and the following disclaimer.
-//
-//   2. Redistributions in binary form must reproduce the above copyright
-//      notice, this list of conditions and the following disclaimer in the
-//      documentation and/or other materials provided with the distribution.
-//
-//   3. The name of the author may be used to endorse or promote products
-//      derived from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef _MSC_VER // [
-#error "Use this header only with Microsoft Visual C++ compilers!"
-#endif // _MSC_VER ]
-
-#ifndef _MSC_INTTYPES_H_ // [
-#define _MSC_INTTYPES_H_
-
-#if _MSC_VER > 1000
-#pragma once
-#endif
-
-#include "stdint.h"
-
-// 7.8 Format conversion of integer types
-
-typedef struct {
-   intmax_t quot;
-   intmax_t rem;
-} imaxdiv_t;
-
-// 7.8.1 Macros for format specifiers
-
-#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [   See footnote 185 at page 198
-
-// The fprintf macros for signed integers are:
-#define PRId8       "d"
-#define PRIi8       "i"
-#define PRIdLEAST8  "d"
-#define PRIiLEAST8  "i"
-#define PRIdFAST8   "d"
-#define PRIiFAST8   "i"
-
-#define PRId16       "hd"
-#define PRIi16       "hi"
-#define PRIdLEAST16  "hd"
-#define PRIiLEAST16  "hi"
-#define PRIdFAST16   "hd"
-#define PRIiFAST16   "hi"
-
-#define PRId32       "I32d"
-#define PRIi32       "I32i"
-#define PRIdLEAST32  "I32d"
-#define PRIiLEAST32  "I32i"
-#define PRIdFAST32   "I32d"
-#define PRIiFAST32   "I32i"
-
-#define PRId64       "I64d"
-#define PRIi64       "I64i"
-#define PRIdLEAST64  "I64d"
-#define PRIiLEAST64  "I64i"
-#define PRIdFAST64   "I64d"
-#define PRIiFAST64   "I64i"
-
-#define PRIdMAX     "I64d"
-#define PRIiMAX     "I64i"
-
-#define PRIdPTR     "Id"
-#define PRIiPTR     "Ii"
-
-// The fprintf macros for unsigned integers are:
-#define PRIo8       "o"
-#define PRIu8       "u"
-#define PRIx8       "x"
-#define PRIX8       "X"
-#define PRIoLEAST8  "o"
-#define PRIuLEAST8  "u"
-#define PRIxLEAST8  "x"
-#define PRIXLEAST8  "X"
-#define PRIoFAST8   "o"
-#define PRIuFAST8   "u"
-#define PRIxFAST8   "x"
-#define PRIXFAST8   "X"
-
-#define PRIo16       "ho"
-#define PRIu16       "hu"
-#define PRIx16       "hx"
-#define PRIX16       "hX"
-#define PRIoLEAST16  "ho"
-#define PRIuLEAST16  "hu"
-#define PRIxLEAST16  "hx"
-#define PRIXLEAST16  "hX"
-#define PRIoFAST16   "ho"
-#define PRIuFAST16   "hu"
-#define PRIxFAST16   "hx"
-#define PRIXFAST16   "hX"
-
-#define PRIo32       "I32o"
-#define PRIu32       "I32u"
-#define PRIx32       "I32x"
-#define PRIX32       "I32X"
-#define PRIoLEAST32  "I32o"
-#define PRIuLEAST32  "I32u"
-#define PRIxLEAST32  "I32x"
-#define PRIXLEAST32  "I32X"
-#define PRIoFAST32   "I32o"
-#define PRIuFAST32   "I32u"
-#define PRIxFAST32   "I32x"
-#define PRIXFAST32   "I32X"
-
-#define PRIo64       "I64o"
-#define PRIu64       "I64u"
-#define PRIx64       "I64x"
-#define PRIX64       "I64X"
-#define PRIoLEAST64  "I64o"
-#define PRIuLEAST64  "I64u"
-#define PRIxLEAST64  "I64x"
-#define PRIXLEAST64  "I64X"
-#define PRIoFAST64   "I64o"
-#define PRIuFAST64   "I64u"
-#define PRIxFAST64   "I64x"
-#define PRIXFAST64   "I64X"
-
-#define PRIoMAX     "I64o"
-#define PRIuMAX     "I64u"
-#define PRIxMAX     "I64x"
-#define PRIXMAX     "I64X"
-
-#define PRIoPTR     "Io"
-#define PRIuPTR     "Iu"
-#define PRIxPTR     "Ix"
-#define PRIXPTR     "IX"
-
-// The fscanf macros for signed integers are:
-#define SCNd16       "hd"
-#define SCNi16       "hi"
-#define SCNdLEAST16  "hd"
-#define SCNiLEAST16  "hi"
-#define SCNdFAST16   "hd"
-#define SCNiFAST16   "hi"
-
-#define SCNd32       "ld"
-#define SCNi32       "li"
-#define SCNdLEAST32  "ld"
-#define SCNiLEAST32  "li"
-#define SCNdFAST32   "ld"
-#define SCNiFAST32   "li"
-
-#define SCNd64       "I64d"
-#define SCNi64       "I64i"
-#define SCNdLEAST64  "I64d"
-#define SCNiLEAST64  "I64i"
-#define SCNdFAST64   "I64d"
-#define SCNiFAST64   "I64i"
-
-#define SCNdMAX     "I64d"
-#define SCNiMAX     "I64i"
-
-#ifdef _WIN64 // [
-#  define SCNdPTR     "I64d"
-#  define SCNiPTR     "I64i"
-#else  // _WIN64 ][
-#  define SCNdPTR     "ld"
-#  define SCNiPTR     "li"
-#endif  // _WIN64 ]
-
-// The fscanf macros for unsigned integers are:
-#define SCNo16       "ho"
-#define SCNu16       "hu"
-#define SCNx16       "hx"
-#define SCNX16       "hX"
-#define SCNoLEAST16  "ho"
-#define SCNuLEAST16  "hu"
-#define SCNxLEAST16  "hx"
-#define SCNXLEAST16  "hX"
-#define SCNoFAST16   "ho"
-#define SCNuFAST16   "hu"
-#define SCNxFAST16   "hx"
-#define SCNXFAST16   "hX"
-
-#define SCNo32       "lo"
-#define SCNu32       "lu"
-#define SCNx32       "lx"
-#define SCNX32       "lX"
-#define SCNoLEAST32  "lo"
-#define SCNuLEAST32  "lu"
-#define SCNxLEAST32  "lx"
-#define SCNXLEAST32  "lX"
-#define SCNoFAST32   "lo"
-#define SCNuFAST32   "lu"
-#define SCNxFAST32   "lx"
-#define SCNXFAST32   "lX"
-
-#define SCNo64       "I64o"
-#define SCNu64       "I64u"
-#define SCNx64       "I64x"
-#define SCNX64       "I64X"
-#define SCNoLEAST64  "I64o"
-#define SCNuLEAST64  "I64u"
-#define SCNxLEAST64  "I64x"
-#define SCNXLEAST64  "I64X"
-#define SCNoFAST64   "I64o"
-#define SCNuFAST64   "I64u"
-#define SCNxFAST64   "I64x"
-#define SCNXFAST64   "I64X"
-
-#define SCNoMAX     "I64o"
-#define SCNuMAX     "I64u"
-#define SCNxMAX     "I64x"
-#define SCNXMAX     "I64X"
-
-#ifdef _WIN64 // [
-#  define SCNoPTR     "I64o"
-#  define SCNuPTR     "I64u"
-#  define SCNxPTR     "I64x"
-#  define SCNXPTR     "I64X"
-#else  // _WIN64 ][
-#  define SCNoPTR     "lo"
-#  define SCNuPTR     "lu"
-#  define SCNxPTR     "lx"
-#  define SCNXPTR     "lX"
-#endif  // _WIN64 ]
-
-#endif // __STDC_FORMAT_MACROS ]
-
-// 7.8.2 Functions for greatest-width integer types
-
-// 7.8.2.1 The imaxabs function
-#define imaxabs _abs64
-
-// 7.8.2.2 The imaxdiv function
-
-// This is modified version of div() function from Microsoft's div.c found
-// in %MSVC.NET%\crt\src\div.c
-#ifdef STATIC_IMAXDIV // [
-static
-#else // STATIC_IMAXDIV ][
-_inline
-#endif // STATIC_IMAXDIV ]
-imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
-{
-   imaxdiv_t result;
-
-   result.quot = numer / denom;
-   result.rem = numer % denom;
-
-   if (numer < 0 && result.rem > 0) {
-      // did division wrong; must fix up
-      ++result.quot;
-      result.rem -= denom;
-   }
-
-   return result;
-}
-
-// 7.8.2.3 The strtoimax and strtoumax functions
-#define strtoimax _strtoi64
-#define strtoumax _strtoui64
-
-// 7.8.2.4 The wcstoimax and wcstoumax functions
-#define wcstoimax _wcstoi64
-#define wcstoumax _wcstoui64
-
-
-#endif // _MSC_INTTYPES_H_ ]
diff --git a/android/src/main/libenc/jni/libx264/extras/stdint.h b/android/src/main/libenc/jni/libx264/extras/stdint.h
deleted file mode 100755
index 9459662..0000000
--- a/android/src/main/libenc/jni/libx264/extras/stdint.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* ISO C9x  7.18  Integer types <stdint.h>
- * Based on ISO/IEC SC22/WG14 9899 Committee draft (SC22 N2794)
- *
- *  THIS SOFTWARE IS NOT COPYRIGHTED
- *
- *  Contributor: Danny Smith <danny_r_smith_2001@yahoo.co.nz>
- *
- *  This source code is offered for use in the public domain. You may
- *  use, modify or distribute it freely.
- *
- *  This code is distributed in the hope that it will be useful but
- *  WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
- *  DISCLAIMED. This includes but is not limited to warranties of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- *  Date: 2000-12-02
- */
-
-
-#ifndef _STDINT_H
-#define _STDINT_H
-#define __need_wint_t
-#define __need_wchar_t
-#include <stddef.h>
-
-/* 7.18.1.1  Exact-width integer types */
-typedef signed char int8_t;
-typedef unsigned char   uint8_t;
-typedef short  int16_t;
-typedef unsigned short  uint16_t;
-typedef int  int32_t;
-typedef unsigned   uint32_t;
-typedef __int64  int64_t;
-typedef unsigned __int64 uint64_t;
-
-/* 7.18.1.2  Minimum-width integer types */
-typedef signed char int_least8_t;
-typedef unsigned char   uint_least8_t;
-typedef short  int_least16_t;
-typedef unsigned short  uint_least16_t;
-typedef int  int_least32_t;
-typedef unsigned   uint_least32_t;
-typedef __int64  int_least64_t;
-typedef unsigned __int64   uint_least64_t;
-
-/*  7.18.1.3  Fastest minimum-width integer types
- *  Not actually guaranteed to be fastest for all purposes
- *  Here we use the exact-width types for 8 and 16-bit ints.
- */
-typedef char int_fast8_t;
-typedef unsigned char uint_fast8_t;
-typedef short  int_fast16_t;
-typedef unsigned short  uint_fast16_t;
-typedef int  int_fast32_t;
-typedef unsigned  int  uint_fast32_t;
-typedef __int64  int_fast64_t;
-typedef unsigned __int64   uint_fast64_t;
-
-/* 7.18.1.4  Integer types capable of holding object pointers */
-/*typedef int intptr_t;
-typedef unsigned uintptr_t;*/
-
-/* 7.18.1.5  Greatest-width integer types */
-typedef __int64  intmax_t;
-typedef unsigned __int64   uintmax_t;
-
-/* 7.18.2  Limits of specified-width integer types */
-#if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS)
-
-/* 7.18.2.1  Limits of exact-width integer types */
-#define INT8_MIN (-128)
-#define INT16_MIN (-32768)
-#define INT32_MIN (-2147483647 - 1)
-#define INT64_MIN  (-9223372036854775807LL - 1)
-
-#define INT8_MAX 127
-#define INT16_MAX 32767
-#define INT32_MAX 2147483647
-#define INT64_MAX 9223372036854775807LL
-
-#define UINT8_MAX 0xff /* 255U */
-#define UINT16_MAX 0xffff /* 65535U */
-#define UINT32_MAX 0xffffffff  /* 4294967295U */
-#define UINT64_MAX 0xffffffffffffffffULL /* 18446744073709551615ULL */
-
-/* 7.18.2.2  Limits of minimum-width integer types */
-#define INT_LEAST8_MIN INT8_MIN
-#define INT_LEAST16_MIN INT16_MIN
-#define INT_LEAST32_MIN INT32_MIN
-#define INT_LEAST64_MIN INT64_MIN
-
-#define INT_LEAST8_MAX INT8_MAX
-#define INT_LEAST16_MAX INT16_MAX
-#define INT_LEAST32_MAX INT32_MAX
-#define INT_LEAST64_MAX INT64_MAX
-
-#define UINT_LEAST8_MAX UINT8_MAX
-#define UINT_LEAST16_MAX UINT16_MAX
-#define UINT_LEAST32_MAX UINT32_MAX
-#define UINT_LEAST64_MAX UINT64_MAX
-
-/* 7.18.2.3  Limits of fastest minimum-width integer types */
-#define INT_FAST8_MIN INT8_MIN
-#define INT_FAST16_MIN INT16_MIN
-#define INT_FAST32_MIN INT32_MIN
-#define INT_FAST64_MIN INT64_MIN
-
-#define INT_FAST8_MAX INT8_MAX
-#define INT_FAST16_MAX INT16_MAX
-#define INT_FAST32_MAX INT32_MAX
-#define INT_FAST64_MAX INT64_MAX
-
-#define UINT_FAST8_MAX UINT8_MAX
-#define UINT_FAST16_MAX UINT16_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-#define UINT_FAST64_MAX UINT64_MAX
-
-/* 7.18.2.4  Limits of integer types capable of holding
-    object pointers */
-#if defined(_WIN64) || defined(__LP64__)
-#define INTPTR_MIN INT64_MIN
-#define INTPTR_MAX INT64_MAX
-#define UINTPTR_MAX UINT64_MAX
-#else
-#define INTPTR_MIN INT32_MIN
-#define INTPTR_MAX INT32_MAX
-#define UINTPTR_MAX UINT32_MAX
-#endif
-
-/* 7.18.2.5  Limits of greatest-width integer types */
-#define INTMAX_MIN INT64_MIN
-#define INTMAX_MAX INT64_MAX
-#define UINTMAX_MAX UINT64_MAX
-
-/* 7.18.3  Limits of other integer types */
-#if defined(_WIN64) || defined(__LP64__)
-#define PTRDIFF_MIN INT64_MIN
-#define PTRDIFF_MAX INT64_MAX
-#else
-#define PTRDIFF_MIN INT32_MIN
-#define PTRDIFF_MAX INT32_MAX
-#endif
-
-#define SIG_ATOMIC_MIN INT32_MIN
-#define SIG_ATOMIC_MAX INT32_MAX
-
-#ifndef SIZE_MAX
-#if defined(_WIN64) || defined(__LP64__)
-#define SIZE_MAX UINT64_MAX
-#else
-#define SIZE_MAX UINT32_MAX
-#endif
-#endif
-
-#ifndef WCHAR_MIN  /* also in wchar.h */
-#define WCHAR_MIN 0
-#define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */
-#endif
-
-/*
- * wint_t is unsigned short for compatibility with MS runtime
- */
-#define WINT_MIN 0
-#define WINT_MAX ((wint_t)-1) /* UINT16_MAX */
-
-#endif /* !defined ( __cplusplus) || defined __STDC_LIMIT_MACROS */
-
-
-/* 7.18.4  Macros for integer constants */
-#if !defined ( __cplusplus) || defined (__STDC_CONSTANT_MACROS)
-
-/* 7.18.4.1  Macros for minimum-width integer constants
-
-    Accoding to Douglas Gwyn <gwyn@arl.mil>:
-	"This spec was changed in ISO/IEC 9899:1999 TC1; in ISO/IEC
-	9899:1999 as initially published, the expansion was required
-	to be an integer constant of precisely matching type, which
-	is impossible to accomplish for the shorter types on most
-	platforms, because C99 provides no standard way to designate
-	an integer constant with width less than that of type int.
-	TC1 changed this to require just an integer constant
-	*expression* with *promoted* type."
-
-	The trick used here is from Clive D W Feather.
-*/
-
-#define INT8_C(val) (INT_LEAST8_MAX-INT_LEAST8_MAX+(val))
-#define INT16_C(val) (INT_LEAST16_MAX-INT_LEAST16_MAX+(val))
-#define INT32_C(val) (INT_LEAST32_MAX-INT_LEAST32_MAX+(val))
-#define INT64_C(val) (INT_LEAST64_MAX-INT_LEAST64_MAX+(val))
-
-#define UINT8_C(val) (UINT_LEAST8_MAX-UINT_LEAST8_MAX+(val))
-#define UINT16_C(val) (UINT_LEAST16_MAX-UINT_LEAST16_MAX+(val))
-#define UINT32_C(val) (UINT_LEAST32_MAX-UINT_LEAST32_MAX+(val))
-#define UINT64_C(val) (UINT_LEAST64_MAX-UINT_LEAST64_MAX+(val))
-
-/* 7.18.4.2  Macros for greatest-width integer constants */
-#define INTMAX_C(val) (INTMAX_MAX-INTMAX_MAX+(val))
-#define UINTMAX_C(val) (UINTMAX_MAX-UINTMAX_MAX+(val))
-
-#endif  /* !defined ( __cplusplus) || defined __STDC_CONSTANT_MACROS */
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/filters/filters.c b/android/src/main/libenc/jni/libx264/filters/filters.c
deleted file mode 100755
index 1d3108e..0000000
--- a/android/src/main/libenc/jni/libx264/filters/filters.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*****************************************************************************
- * filters.c: common filter functions
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Diogo Franco <diogomfranco@gmail.com>
- *          Steven Walters <kemuri9@gmail.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "filters.h"
-#define RETURN_IF_ERROR( cond, ... ) RETURN_IF_ERR( cond, "options", NULL, __VA_ARGS__ )
-
-char **x264_split_options( const char *opt_str, const char * const *options )
-{
-    int opt_count = 0, options_count = 0, found_named = 0, size = 0;
-    const char *opt = opt_str;
-
-    if( !opt_str )
-        return NULL;
-
-    while( options[options_count] )
-        options_count++;
-
-    do
-    {
-        int length = strcspn( opt, "=," );
-        if( opt[length] == '=' )
-        {
-            const char * const *option = options;
-            while( *option && (strlen( *option ) != length || strncmp( opt, *option, length )) )
-                option++;
-
-            RETURN_IF_ERROR( !*option, "Invalid option '%.*s'\n", length, opt );
-            found_named = 1;
-            length += strcspn( opt + length, "," );
-        }
-        else
-        {
-            RETURN_IF_ERROR( opt_count >= options_count, "Too many options given\n" );
-            RETURN_IF_ERROR( found_named, "Ordered option given after named\n" );
-            size += strlen( options[opt_count] ) + 1;
-        }
-        opt_count++;
-        opt += length;
-    } while( *opt++ );
-
-    int offset = 2 * (opt_count+1) * sizeof(char*);
-    size += offset + (opt - opt_str);
-    char **opts = calloc( 1, size );
-    RETURN_IF_ERROR( !opts, "malloc failed\n" );
-
-#define insert_opt( src, length )\
-do {\
-    opts[i++] = memcpy( (char*)opts + offset, src, length );\
-    offset += length + 1;\
-    src    += length + 1;\
-} while( 0 )
-
-    for( int i = 0; i < 2*opt_count; )
-    {
-        int length = strcspn( opt_str, "=," );
-        if( opt_str[length] == '=' )
-        {
-            insert_opt( opt_str, length );
-            length = strcspn( opt_str, "," );
-        }
-        else
-        {
-            const char *option = options[i/2];
-            int option_length = strlen( option );
-            insert_opt( option, option_length );
-        }
-        insert_opt( opt_str, length );
-    }
-
-    assert( offset == size );
-    return opts;
-}
-
-char *x264_get_option( const char *name, char **split_options )
-{
-    if( split_options )
-    {
-        int last_i = -1;
-        for( int i = 0; split_options[i]; i += 2 )
-            if( !strcmp( split_options[i], name ) )
-                last_i = i;
-        if( last_i >= 0 && split_options[last_i+1][0] )
-            return split_options[last_i+1];
-    }
-    return NULL;
-}
-
-int x264_otob( const char *str, int def )
-{
-   if( str )
-       return !strcasecmp( str, "true" ) || !strcmp( str, "1" ) || !strcasecmp( str, "yes" );
-   return def;
-}
-
-double x264_otof( const char *str, double def )
-{
-   double ret = def;
-   if( str )
-   {
-       char *end;
-       ret = strtod( str, &end );
-       if( end == str || *end != '\0' )
-           ret = def;
-   }
-   return ret;
-}
-
-int x264_otoi( const char *str, int def )
-{
-    int ret = def;
-    if( str )
-    {
-        char *end;
-        ret = strtol( str, &end, 0 );
-        if( end == str || *end != '\0' )
-            ret = def;
-    }
-    return ret;
-}
-
-char *x264_otos( char *str, char *def )
-{
-    return str ? str : def;
-}
diff --git a/android/src/main/libenc/jni/libx264/filters/filters.h b/android/src/main/libenc/jni/libx264/filters/filters.h
deleted file mode 100755
index feceff4..0000000
--- a/android/src/main/libenc/jni/libx264/filters/filters.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*****************************************************************************
- * filters.h: common filter functions
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Diogo Franco <diogomfranco@gmail.com>
- *          Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_FILTERS_H
-#define X264_FILTERS_H
-
-#include "x264cli.h"
-#include "filters/video/video.h"
-
-char **x264_split_options( const char *opt_str, const char * const *options );
-char  *x264_get_option( const char *name, char **split_options );
-int    x264_otob( const char *str, int def );    // option to bool
-double x264_otof( const char *str, double def ); // option to float/double
-int    x264_otoi( const char *str, int def );    // option to int
-char  *x264_otos( char *str, char *def );        // option to string
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/filters/video/cache.c b/android/src/main/libenc/jni/libx264/filters/video/cache.c
deleted file mode 100755
index 5dc746f..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/cache.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*****************************************************************************
- * cache.c: cache video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-#include "internal.h"
-#define NAME "cache"
-#define LAST_FRAME (h->first_frame + h->cur_size - 1)
-
-typedef struct
-{
-    hnd_t prev_hnd;
-    cli_vid_filter_t prev_filter;
-
-    int max_size;
-    int first_frame; /* first cached frame */
-    cli_pic_t **cache;
-    int cur_size;
-    int eof;         /* frame beyond end of the file */
-} cache_hnd_t;
-
-cli_vid_filter_t cache_filter;
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    intptr_t size = (intptr_t)opt_string;
-    /* upon a <= 0 cache request, do nothing */
-    if( size <= 0 )
-        return 0;
-    cache_hnd_t *h = calloc( 1, sizeof(cache_hnd_t) );
-    if( !h )
-        return -1;
-
-    h->max_size = size;
-    h->cache = malloc( (h->max_size+1) * sizeof(cli_pic_t*) );
-    if( !h->cache )
-        return -1;
-
-    for( int i = 0; i < h->max_size; i++ )
-    {
-        h->cache[i] = malloc( sizeof(cli_pic_t) );
-        if( !h->cache[i] || x264_cli_pic_alloc( h->cache[i], info->csp, info->width, info->height ) )
-            return -1;
-    }
-    h->cache[h->max_size] = NULL; /* require null terminator for list methods */
-
-    h->prev_filter = *filter;
-    h->prev_hnd = *handle;
-    *handle = h;
-    *filter = cache_filter;
-
-    return 0;
-}
-
-static void fill_cache( cache_hnd_t *h, int frame )
-{
-    /* shift frames out of the cache as the frame request is beyond the filled cache */
-    int shift = frame - LAST_FRAME;
-    /* no frames to shift or no frames left to read */
-    if( shift <= 0 || h->eof )
-        return;
-    /* the next frames to read are either
-     * A) starting at the end of the current cache, or
-     * B) starting at a new frame that has the end of the cache at the desired frame
-     * and proceeding to fill the entire cache */
-    int cur_frame = X264_MAX( h->first_frame + h->cur_size, frame - h->max_size + 1 );
-    /* the new starting point is either
-     * A) the current one shifted the number of frames entering/leaving the cache, or
-     * B) at a new frame that has the end of the cache at the desired frame. */
-    h->first_frame = X264_MIN( h->first_frame + shift, cur_frame );
-    h->cur_size = X264_MAX( h->cur_size - shift, 0 );
-    while( h->cur_size < h->max_size )
-    {
-        cli_pic_t temp;
-        /* the old front frame is going to shift off, overwrite it with the new frame */
-        cli_pic_t *cache = h->cache[0];
-        if( h->prev_filter.get_frame( h->prev_hnd, &temp, cur_frame ) ||
-            x264_cli_pic_copy( cache, &temp ) ||
-            h->prev_filter.release_frame( h->prev_hnd, &temp, cur_frame ) )
-        {
-            h->eof = cur_frame;
-            return;
-        }
-        /* the read was successful, shift the frame off the front to the end */
-        x264_frame_push( (void*)h->cache, x264_frame_shift( (void*)h->cache ) );
-        cur_frame++;
-        h->cur_size++;
-    }
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    cache_hnd_t *h = handle;
-    FAIL_IF_ERR( frame < h->first_frame, NAME, "frame %d is before first cached frame %d \n", frame, h->first_frame );
-    fill_cache( h, frame );
-    if( frame > LAST_FRAME ) /* eof */
-        return -1;
-    int idx = frame - (h->eof ? h->eof - h->max_size : h->first_frame);
-    *output = *h->cache[idx];
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    /* the parent filter's frame has already been released so do nothing here */
-    return 0;
-}
-
-static void free_filter( hnd_t handle )
-{
-    cache_hnd_t *h = handle;
-    h->prev_filter.free( h->prev_hnd );
-    for( int i = 0; i < h->max_size; i++ )
-    {
-        x264_cli_pic_clean( h->cache[i] );
-        free( h->cache[i] );
-    }
-    free( h->cache );
-    free( h );
-}
-
-cli_vid_filter_t cache_filter = { NAME, NULL, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/crop.c b/android/src/main/libenc/jni/libx264/filters/video/crop.c
deleted file mode 100755
index c27a3e4..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/crop.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*****************************************************************************
- * crop.c: crop video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *          James Darnley <james.darnley@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-#define NAME "crop"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
-
-cli_vid_filter_t crop_filter;
-
-typedef struct
-{
-    hnd_t prev_hnd;
-    cli_vid_filter_t prev_filter;
-
-    int dims[4]; /* left, top, width, height */
-    const x264_cli_csp_t *csp;
-} crop_hnd_t;
-
-static void help( int longhelp )
-{
-    printf( "      "NAME":left,top,right,bottom\n" );
-    if( !longhelp )
-        return;
-    printf( "            removes pixels from the edges of the frame\n" );
-}
-
-static int handle_opts( crop_hnd_t *h, video_info_t *info, char **opts, const char * const *optlist )
-{
-    for( int i = 0; i < 4; i++ )
-    {
-        char *opt = x264_get_option( optlist[i], opts );
-        FAIL_IF_ERROR( !opt, "%s crop value not specified\n", optlist[i] );
-        h->dims[i] = x264_otoi( opt, -1 );
-        FAIL_IF_ERROR( h->dims[i] < 0, "%s crop value `%s' is less than 0\n", optlist[i], opt );
-        int dim_mod = i&1 ? (h->csp->mod_height << info->interlaced) : h->csp->mod_width;
-        FAIL_IF_ERROR( h->dims[i] % dim_mod, "%s crop value `%s' is not a multiple of %d\n", optlist[i], opt, dim_mod );
-    }
-    return 0;
-}
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    FAIL_IF_ERROR( x264_cli_csp_is_invalid( info->csp ), "invalid csp %d\n", info->csp );
-    crop_hnd_t *h = calloc( 1, sizeof(crop_hnd_t) );
-    if( !h )
-        return -1;
-
-    h->csp = x264_cli_get_csp( info->csp );
-    static const char * const optlist[] = { "left", "top", "right", "bottom", NULL };
-    char **opts = x264_split_options( opt_string, optlist );
-    if( !opts )
-        return -1;
-
-    int err = handle_opts( h, info, opts, optlist );
-    free( opts );
-    if( err )
-        return -1;
-
-    h->dims[2] = info->width  - h->dims[0] - h->dims[2];
-    h->dims[3] = info->height - h->dims[1] - h->dims[3];
-    FAIL_IF_ERROR( h->dims[2] <= 0 || h->dims[3] <= 0, "invalid output resolution %dx%d\n", h->dims[2], h->dims[3] );
-
-    if( info->width != h->dims[2] || info->height != h->dims[3] )
-        x264_cli_log( NAME, X264_LOG_INFO, "cropping to %dx%d\n", h->dims[2], h->dims[3] );
-    else
-    {
-        /* do nothing as the user supplied 0s for all the values */
-        free( h );
-        return 0;
-    }
-    /* done initializing, overwrite values */
-    info->width  = h->dims[2];
-    info->height = h->dims[3];
-
-    h->prev_filter = *filter;
-    h->prev_hnd = *handle;
-    *handle = h;
-    *filter = crop_filter;
-
-    return 0;
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    crop_hnd_t *h = handle;
-    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
-        return -1;
-    output->img.width  = h->dims[2];
-    output->img.height = h->dims[3];
-    /* shift the plane pointers down 'top' rows and right 'left' columns. */
-    for( int i = 0; i < output->img.planes; i++ )
-    {
-        intptr_t offset = output->img.stride[i] * h->dims[1] * h->csp->height[i];
-        offset += h->dims[0] * h->csp->width[i] * x264_cli_csp_depth_factor( output->img.csp );
-        output->img.plane[i] += offset;
-    }
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    crop_hnd_t *h = handle;
-    /* NO filter should ever have a dependent release based on the plane pointers,
-     * so avoid unnecessary unshifting */
-    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
-}
-
-static void free_filter( hnd_t handle )
-{
-    crop_hnd_t *h = handle;
-    h->prev_filter.free( h->prev_hnd );
-    free( h );
-}
-
-cli_vid_filter_t crop_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/depth.c b/android/src/main/libenc/jni/libx264/filters/video/depth.c
deleted file mode 100755
index 887e297..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/depth.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*****************************************************************************
- * depth.c: bit-depth conversion video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Oskar Arvidsson <oskar@irock.se>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-#define NAME "depth"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
-
-cli_vid_filter_t depth_filter;
-
-typedef struct
-{
-    hnd_t prev_hnd;
-    cli_vid_filter_t prev_filter;
-
-    int bit_depth;
-    int dst_csp;
-    cli_pic_t buffer;
-    int16_t *error_buf;
-} depth_hnd_t;
-
-static int depth_filter_csp_is_supported( int csp )
-{
-    int csp_mask = csp & X264_CSP_MASK;
-    return csp_mask == X264_CSP_I420 ||
-           csp_mask == X264_CSP_I422 ||
-           csp_mask == X264_CSP_I444 ||
-           csp_mask == X264_CSP_YV12 ||
-           csp_mask == X264_CSP_YV16 ||
-           csp_mask == X264_CSP_YV24 ||
-           csp_mask == X264_CSP_NV12 ||
-           csp_mask == X264_CSP_NV21 ||
-           csp_mask == X264_CSP_NV16 ||
-           csp_mask == X264_CSP_BGR ||
-           csp_mask == X264_CSP_RGB ||
-           csp_mask == X264_CSP_BGRA;
-}
-
-static int csp_num_interleaved( int csp, int plane )
-{
-    int csp_mask = csp & X264_CSP_MASK;
-    return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 :
-           csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 :
-           csp_mask == X264_CSP_BGRA ? 4 :
-           1;
-}
-
-/* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been
- * written in such a way so that if the source has been upconverted using the
- * same algorithm as used in scale_image, dithering down to the source bit
- * depth again is lossless. */
-#define DITHER_PLANE( pitch ) \
-static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int src_stride, \
-                                  int width, int height, int16_t *errors ) \
-{ \
-    const int lshift = 16-X264_BIT_DEPTH; \
-    const int rshift = 16-X264_BIT_DEPTH+2; \
-    const int half = 1 << (16-X264_BIT_DEPTH+1); \
-    const int pixel_max = (1 << X264_BIT_DEPTH)-1; \
-    memset( errors, 0, (width+1) * sizeof(int16_t) ); \
-    for( int y = 0; y < height; y++, src += src_stride, dst += dst_stride ) \
-    { \
-        int err = 0; \
-        for( int x = 0; x < width; x++ ) \
-        { \
-            err = err*2 + errors[x] + errors[x+1]; \
-            dst[x*pitch] = x264_clip3( ((src[x*pitch]<<2)+err+half) >> rshift, 0, pixel_max ); \
-            errors[x] = err = src[x*pitch] - (dst[x*pitch] << lshift); \
-        } \
-    } \
-}
-
-DITHER_PLANE( 1 )
-DITHER_PLANE( 2 )
-DITHER_PLANE( 3 )
-DITHER_PLANE( 4 )
-
-static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf )
-{
-    int csp_mask = img->csp & X264_CSP_MASK;
-    for( int i = 0; i < img->planes; i++ )
-    {
-        int num_interleaved = csp_num_interleaved( img->csp, i );
-        int height = x264_cli_csps[csp_mask].height[i] * img->height;
-        int width = x264_cli_csps[csp_mask].width[i] * img->width / num_interleaved;
-
-#define CALL_DITHER_PLANE( pitch, off ) \
-        dither_plane_##pitch( ((pixel*)out->plane[i])+off, out->stride[i]/sizeof(pixel), \
-                ((uint16_t*)img->plane[i])+off, img->stride[i]/2, width, height, error_buf )
-
-        if( num_interleaved == 4 )
-        {
-            CALL_DITHER_PLANE( 4, 0 );
-            CALL_DITHER_PLANE( 4, 1 );
-            CALL_DITHER_PLANE( 4, 2 );
-            CALL_DITHER_PLANE( 4, 3 ); //we probably can skip this one
-        }
-        else if( num_interleaved == 3 )
-        {
-            CALL_DITHER_PLANE( 3, 0 );
-            CALL_DITHER_PLANE( 3, 1 );
-            CALL_DITHER_PLANE( 3, 2 );
-        }
-        else if( num_interleaved == 2 )
-        {
-            CALL_DITHER_PLANE( 2, 0 );
-            CALL_DITHER_PLANE( 2, 1 );
-        }
-        else //if( num_interleaved == 1 )
-        {
-            CALL_DITHER_PLANE( 1, 0 );
-        }
-    }
-}
-
-static void scale_image( cli_image_t *output, cli_image_t *img )
-{
-    int csp_mask = img->csp & X264_CSP_MASK;
-    const int shift = X264_BIT_DEPTH - 8;
-    for( int i = 0; i < img->planes; i++ )
-    {
-        uint8_t *src = img->plane[i];
-        uint16_t *dst = (uint16_t*)output->plane[i];
-        int height = x264_cli_csps[csp_mask].height[i] * img->height;
-        int width = x264_cli_csps[csp_mask].width[i] * img->width;
-
-        for( int j = 0; j < height; j++ )
-        {
-            for( int k = 0; k < width; k++ )
-                dst[k] = src[k] << shift;
-
-            src += img->stride[i];
-            dst += output->stride[i]/2;
-        }
-    }
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    depth_hnd_t *h = handle;
-
-    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
-        return -1;
-
-    if( h->bit_depth < 16 && output->img.csp & X264_CSP_HIGH_DEPTH )
-    {
-        dither_image( &h->buffer.img, &output->img, h->error_buf );
-        output->img = h->buffer.img;
-    }
-    else if( h->bit_depth > 8 && !(output->img.csp & X264_CSP_HIGH_DEPTH) )
-    {
-        scale_image( &h->buffer.img, &output->img );
-        output->img = h->buffer.img;
-    }
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    depth_hnd_t *h = handle;
-    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
-}
-
-static void free_filter( hnd_t handle )
-{
-    depth_hnd_t *h = handle;
-    h->prev_filter.free( h->prev_hnd );
-    x264_cli_pic_clean( &h->buffer );
-    x264_free( h );
-}
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info,
-                 x264_param_t *param, char *opt_string )
-{
-    int ret = 0;
-    int change_fmt = (info->csp ^ param->i_csp) & X264_CSP_HIGH_DEPTH;
-    int csp = ~(~info->csp ^ change_fmt);
-    int bit_depth = 8*x264_cli_csp_depth_factor( csp );
-
-    if( opt_string )
-    {
-        static const char * const optlist[] = { "bit_depth", NULL };
-        char **opts = x264_split_options( opt_string, optlist );
-
-        if( opts )
-        {
-            char *str_bit_depth = x264_get_option( "bit_depth", opts );
-            bit_depth = x264_otoi( str_bit_depth, -1 );
-
-            ret = bit_depth < 8 || bit_depth > 16;
-            csp = bit_depth > 8 ? csp | X264_CSP_HIGH_DEPTH : csp & ~X264_CSP_HIGH_DEPTH;
-            change_fmt = (info->csp ^ csp) & X264_CSP_HIGH_DEPTH;
-            free( opts );
-        }
-        else
-            ret = 1;
-    }
-
-    FAIL_IF_ERROR( bit_depth != X264_BIT_DEPTH, "this build supports only bit depth %d\n", X264_BIT_DEPTH );
-    FAIL_IF_ERROR( ret, "unsupported bit depth conversion.\n" );
-
-    /* only add the filter to the chain if it's needed */
-    if( change_fmt || bit_depth != 8 * x264_cli_csp_depth_factor( csp ) )
-    {
-        FAIL_IF_ERROR( !depth_filter_csp_is_supported(csp), "unsupported colorspace.\n" );
-        depth_hnd_t *h = x264_malloc( sizeof(depth_hnd_t) + (info->width+1)*sizeof(int16_t) );
-
-        if( !h )
-            return -1;
-
-        h->error_buf = (int16_t*)(h + 1);
-        h->dst_csp = csp;
-        h->bit_depth = bit_depth;
-        h->prev_hnd = *handle;
-        h->prev_filter = *filter;
-
-        if( x264_cli_pic_alloc( &h->buffer, h->dst_csp, info->width, info->height ) )
-        {
-            x264_free( h );
-            return -1;
-        }
-
-        *handle = h;
-        *filter = depth_filter;
-        info->csp = h->dst_csp;
-    }
-
-    return 0;
-}
-
-cli_vid_filter_t depth_filter = { NAME, NULL, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/fix_vfr_pts.c b/android/src/main/libenc/jni/libx264/filters/video/fix_vfr_pts.c
deleted file mode 100755
index 94af828..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/fix_vfr_pts.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*****************************************************************************
- * fix_vfr_pts.c: vfr pts fixing video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-#include "internal.h"
-
-/* This filter calculates and store the frame's duration to the frame data
- * (if it is not already calculated when the frame arrives to this point)
- * so it can be used by filters that will need to reconstruct pts due to
- * out-of-order frame requests */
-
-typedef struct
-{
-    hnd_t prev_hnd;
-    cli_vid_filter_t prev_filter;
-
-    /* we need 1 buffer picture and 1 place holder */
-    cli_pic_t buffer;
-    cli_pic_t holder;
-    int buffer_allocated;
-    int holder_frame;
-    int holder_ret;
-    int64_t pts;
-    int64_t last_duration;
-} fix_vfr_pts_hnd_t;
-
-cli_vid_filter_t fix_vfr_pts_filter;
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    /* if the input is not vfr, we don't do anything */
-    if( !info->vfr )
-        return 0;
-    fix_vfr_pts_hnd_t *h = calloc( 1, sizeof(fix_vfr_pts_hnd_t) );
-    if( !h )
-        return -1;
-
-    h->holder_frame = -1;
-    h->prev_hnd = *handle;
-    h->prev_filter = *filter;
-    *handle = h;
-    *filter = fix_vfr_pts_filter;
-
-    return 0;
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    fix_vfr_pts_hnd_t *h = handle;
-    /* if we want the holder picture and it errored, return the error. */
-    if( frame == h->holder_frame )
-    {
-        if( h->holder_ret )
-            return h->holder_ret;
-    }
-    else
-    {
-        /* if we have a holder frame and we don't want it, release the frame */
-        if( h->holder_frame > 0 && h->holder_frame < frame && h->prev_filter.release_frame( h->prev_hnd, &h->holder, h->holder_frame ) )
-            return -1;
-        h->holder_frame = -1;
-        if( h->prev_filter.get_frame( h->prev_hnd, &h->holder, frame ) )
-            return -1;
-    }
-
-    /* if the frame's duration is not set already, read the next frame to set it. */
-    if( !h->holder.duration )
-    {
-        /* allocate a buffer picture if we didn't already */
-        if( !h->buffer_allocated )
-        {
-            if( x264_cli_pic_alloc( &h->buffer, h->holder.img.csp, h->holder.img.width, h->holder.img.height ) )
-                return -1;
-            h->buffer_allocated = 1;
-        }
-        h->holder_frame = frame+1;
-        /* copy the current frame to the buffer, release it, and then read in the next frame to the placeholder */
-        if( x264_cli_pic_copy( &h->buffer, &h->holder ) || h->prev_filter.release_frame( h->prev_hnd, &h->holder, frame ) )
-            return -1;
-        h->holder_ret = h->prev_filter.get_frame( h->prev_hnd, &h->holder, h->holder_frame );
-        /* suppress non-monotonic pts warnings by setting the duration to be at least 1 */
-        if( !h->holder_ret )
-            h->last_duration = X264_MAX( h->holder.pts - h->buffer.pts, 1 );
-        h->buffer.duration = h->last_duration;
-        *output = h->buffer;
-    }
-    else
-        *output = h->holder;
-
-    output->pts = h->pts;
-    h->pts += output->duration;
-
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    fix_vfr_pts_hnd_t *h = handle;
-    /* if the frame is the buffered one, it's already been released */
-    if( frame == (h->holder_frame - 1) )
-        return 0;
-    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
-}
-
-static void free_filter( hnd_t handle )
-{
-    fix_vfr_pts_hnd_t *h = handle;
-    h->prev_filter.free( h->prev_hnd );
-    if( h->buffer_allocated )
-        x264_cli_pic_clean( &h->buffer );
-    free( h );
-}
-
-cli_vid_filter_t fix_vfr_pts_filter = { "fix_vfr_pts", NULL, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/internal.c b/android/src/main/libenc/jni/libx264/filters/video/internal.c
deleted file mode 100755
index ed64d26..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/internal.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*****************************************************************************
- * internal.c: video filter utilities
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "internal.h"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ )
-
-void x264_cli_plane_copy( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
-{
-    while( h-- )
-    {
-        memcpy( dst, src, w );
-        dst += i_dst;
-        src += i_src;
-    }
-}
-
-int x264_cli_pic_copy( cli_pic_t *out, cli_pic_t *in )
-{
-    int csp = in->img.csp & X264_CSP_MASK;
-    FAIL_IF_ERROR( x264_cli_csp_is_invalid( in->img.csp ), "invalid colorspace arg %d\n", in->img.csp );
-    FAIL_IF_ERROR( in->img.csp != out->img.csp || in->img.height != out->img.height
-                || in->img.width != out->img.width, "incompatible frame properties\n" );
-    /* copy data */
-    out->duration = in->duration;
-    out->pts = in->pts;
-    out->opaque = in->opaque;
-
-    for( int i = 0; i < out->img.planes; i++ )
-    {
-        int height = in->img.height * x264_cli_csps[csp].height[i];
-        int width =  in->img.width  * x264_cli_csps[csp].width[i];
-        width *= x264_cli_csp_depth_factor( in->img.csp );
-        x264_cli_plane_copy( out->img.plane[i], out->img.stride[i], in->img.plane[i],
-                             in->img.stride[i], width, height );
-    }
-    return 0;
-}
diff --git a/android/src/main/libenc/jni/libx264/filters/video/internal.h b/android/src/main/libenc/jni/libx264/filters/video/internal.h
deleted file mode 100755
index a4fb766..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/internal.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*****************************************************************************
- * internal.h: video filter utilities
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_FILTER_VIDEO_INTERNAL_H
-#define X264_FILTER_VIDEO_INTERNAL_H
-#include "video.h"
-
-void x264_cli_plane_copy( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h );
-int  x264_cli_pic_copy( cli_pic_t *out, cli_pic_t *in );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/filters/video/resize.c b/android/src/main/libenc/jni/libx264/filters/video/resize.c
deleted file mode 100755
index 38af5ef..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/resize.c
+++ /dev/null
@@ -1,595 +0,0 @@
-/*****************************************************************************
- * resize.c: resize video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-#define NAME "resize"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
-
-cli_vid_filter_t resize_filter;
-
-static int full_check( video_info_t *info, x264_param_t *param )
-{
-    int required = 0;
-    required |= info->csp       != param->i_csp;
-    required |= info->width     != param->i_width;
-    required |= info->height    != param->i_height;
-    required |= info->fullrange != param->vui.b_fullrange;
-    return required;
-}
-
-#if HAVE_SWSCALE
-#undef DECLARE_ALIGNED
-#include <libswscale/swscale.h>
-#include <libavutil/opt.h>
-#include <libavutil/pixdesc.h>
-
-#ifndef AV_PIX_FMT_BGRA64
-#define AV_PIX_FMT_BGRA64 AV_PIX_FMT_NONE
-#endif
-
-typedef struct
-{
-    int width;
-    int height;
-    int pix_fmt;
-    int range;
-} frame_prop_t;
-
-typedef struct
-{
-    hnd_t prev_hnd;
-    cli_vid_filter_t prev_filter;
-
-    cli_pic_t buffer;
-    int buffer_allocated;
-    int dst_csp;
-    int input_range;
-    struct SwsContext *ctx;
-    uint32_t ctx_flags;
-    /* state of swapping chroma planes pre and post resize */
-    int pre_swap_chroma;
-    int post_swap_chroma;
-    int variable_input; /* input is capable of changing properties */
-    int working;        /* we have already started working with frames */
-    frame_prop_t dst;   /* desired output properties */
-    frame_prop_t scale; /* properties of the SwsContext input */
-} resizer_hnd_t;
-
-static void help( int longhelp )
-{
-    printf( "      "NAME":[width,height][,sar][,fittobox][,csp][,method]\n" );
-    if( !longhelp )
-        return;
-    printf( "            resizes frames based on the given criteria:\n"
-            "            - resolution only: resizes and adapts sar to avoid stretching\n"
-            "            - sar only: sets the sar and resizes to avoid stretching\n"
-            "            - resolution and sar: resizes to given resolution and sets the sar\n"
-            "            - fittobox: resizes the video based on the desired constraints\n"
-            "               - width, height, both\n"
-            "            - fittobox and sar: same as above except with specified sar\n"
-            "            - csp: convert to the given csp. syntax: [name][:depth]\n"
-            "               - valid csp names [keep current]: " );
-
-    for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
-    {
-        if( x264_cli_csps[i].name )
-        {
-            printf( "%s", x264_cli_csps[i].name );
-            if( i+1 < X264_CSP_CLI_MAX )
-                printf( ", " );
-        }
-    }
-    printf( "\n"
-            "               - depth: 8 or 16 bits per pixel [keep current]\n"
-            "            note: not all depths are supported by all csps.\n"
-            "            - method: use resizer method [\"bicubic\"]\n"
-            "               - fastbilinear, bilinear, bicubic, experimental, point,\n"
-            "               - area, bicublin, gauss, sinc, lanczos, spline\n" );
-}
-
-static uint32_t convert_method_to_flag( const char *name )
-{
-    uint32_t flag = 0;
-    if( !strcasecmp( name, "fastbilinear" ) )
-        flag = SWS_FAST_BILINEAR;
-    else if( !strcasecmp( name, "bilinear" ) )
-        flag = SWS_BILINEAR;
-    else if( !strcasecmp( name, "bicubic" ) )
-        flag = SWS_BICUBIC;
-    else if( !strcasecmp( name, "experimental" ) )
-        flag = SWS_X;
-    else if( !strcasecmp( name, "point" ) )
-        flag = SWS_POINT;
-    else if( !strcasecmp( name, "area" ) )
-        flag = SWS_AREA;
-    else if( !strcasecmp( name, "bicublin" ) )
-        flag = SWS_BICUBLIN;
-    else if( !strcasecmp( name, "guass" ) )
-        flag = SWS_GAUSS;
-    else if( !strcasecmp( name, "sinc" ) )
-        flag = SWS_SINC;
-    else if( !strcasecmp( name, "lanczos" ) )
-        flag = SWS_LANCZOS;
-    else if( !strcasecmp( name, "spline" ) )
-        flag = SWS_SPLINE;
-    else // default
-        flag = SWS_BICUBIC;
-    return flag;
-}
-
-static int convert_csp_to_pix_fmt( int csp )
-{
-    if( csp&X264_CSP_OTHER )
-        return csp&X264_CSP_MASK;
-    switch( csp&X264_CSP_MASK )
-    {
-        case X264_CSP_YV12: /* specially handled via swapping chroma */
-        case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_YUV420P16 : AV_PIX_FMT_YUV420P;
-        case X264_CSP_YV16: /* specially handled via swapping chroma */
-        case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_YUV422P16 : AV_PIX_FMT_YUV422P;
-        case X264_CSP_YV24: /* specially handled via swapping chroma */
-        case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_YUV444P16 : AV_PIX_FMT_YUV444P;
-        case X264_CSP_RGB:  return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48     : AV_PIX_FMT_RGB24;
-        case X264_CSP_BGR:  return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48     : AV_PIX_FMT_BGR24;
-        case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64    : AV_PIX_FMT_BGRA;
-        /* the next csp has no equivalent 16bit depth in swscale */
-        case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE      : AV_PIX_FMT_NV12;
-        case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE      : AV_PIX_FMT_NV21;
-        /* the next csp is no supported by swscale at all */
-        case X264_CSP_NV16:
-        default:            return AV_PIX_FMT_NONE;
-    }
-}
-
-static int pix_number_of_planes( const AVPixFmtDescriptor *pix_desc )
-{
-    int num_planes = 0;
-    for( int i = 0; i < pix_desc->nb_components; i++ )
-    {
-        int plane_plus1 = pix_desc->comp[i].plane + 1;
-        num_planes = X264_MAX( plane_plus1, num_planes );
-    }
-    return num_planes;
-}
-
-static int pick_closest_supported_csp( int csp )
-{
-    int pix_fmt = convert_csp_to_pix_fmt( csp );
-    // first determine the base csp
-    int ret = X264_CSP_NONE;
-    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get( pix_fmt );
-    if( !pix_desc || !pix_desc->name )
-        return ret;
-
-    const char *pix_fmt_name = pix_desc->name;
-    int is_rgb = pix_desc->flags & (AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_PAL);
-    int is_bgr = !!strstr( pix_fmt_name, "bgr" );
-    if( is_bgr || is_rgb )
-    {
-        if( pix_desc->nb_components == 4 ) // has alpha
-            ret = X264_CSP_BGRA;
-        else if( is_bgr )
-            ret = X264_CSP_BGR;
-        else
-            ret = X264_CSP_RGB;
-    }
-    else
-    {
-        // yuv-based
-        if( pix_desc->nb_components == 1 || pix_desc->nb_components == 2 ) // no chroma
-            ret = X264_CSP_I420;
-        else if( pix_desc->log2_chroma_w && pix_desc->log2_chroma_h ) // reduced chroma width & height
-            ret = (pix_number_of_planes( pix_desc ) == 2) ? X264_CSP_NV12 : X264_CSP_I420;
-        else if( pix_desc->log2_chroma_w ) // reduced chroma width only
-            ret = X264_CSP_I422; // X264_CSP_NV16 is not supported by swscale so don't use it
-        else
-            ret = X264_CSP_I444;
-    }
-    // now determine high depth
-    for( int i = 0; i < pix_desc->nb_components; i++ )
-        if( pix_desc->comp[i].depth_minus1 >= 8 )
-            ret |= X264_CSP_HIGH_DEPTH;
-    return ret;
-}
-
-static int handle_opts( const char * const *optlist, char **opts, video_info_t *info, resizer_hnd_t *h )
-{
-    uint32_t out_sar_w, out_sar_h;
-
-    char *str_width  = x264_get_option( optlist[0], opts );
-    char *str_height = x264_get_option( optlist[1], opts );
-    char *str_sar    = x264_get_option( optlist[2], opts );
-    char *fittobox   = x264_get_option( optlist[3], opts );
-    char *str_csp    = x264_get_option( optlist[4], opts );
-    int width        = x264_otoi( str_width, -1 );
-    int height       = x264_otoi( str_height, -1 );
-
-    int csp_only = 0;
-    uint32_t in_sar_w = info->sar_width;
-    uint32_t in_sar_h = info->sar_height;
-
-    if( str_csp )
-    {
-        /* output csp was specified, first check if optional depth was provided */
-        char *str_depth = strchr( str_csp, ':' );
-        int depth = x264_cli_csp_depth_factor( info->csp ) * 8;
-        if( str_depth )
-        {
-            /* csp bit depth was specified */
-            *str_depth++ = '\0';
-            depth = x264_otoi( str_depth, -1 );
-            FAIL_IF_ERROR( depth != 8 && depth != 16, "unsupported bit depth %d\n", depth );
-        }
-        /* now lookup against the list of valid csps */
-        int csp;
-        if( strlen( str_csp ) == 0 )
-            csp = info->csp & X264_CSP_MASK;
-        else
-            for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- )
-            {
-                if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) )
-                    break;
-            }
-        FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp );
-        h->dst_csp = csp;
-        if( depth == 16 )
-            h->dst_csp |= X264_CSP_HIGH_DEPTH;
-    }
-
-    /* if the input sar is currently invalid, set it to 1:1 so it can be used in math */
-    if( !in_sar_w || !in_sar_h )
-        in_sar_w = in_sar_h = 1;
-    if( str_sar )
-    {
-        FAIL_IF_ERROR( 2 != sscanf( str_sar, "%u:%u", &out_sar_w, &out_sar_h ) &&
-                       2 != sscanf( str_sar, "%u/%u", &out_sar_w, &out_sar_h ),
-                       "invalid sar `%s'\n", str_sar );
-    }
-    else
-        out_sar_w = out_sar_h = 1;
-    if( fittobox )
-    {
-        /* resize the video to fit the box as much as possible */
-        if( !strcasecmp( fittobox, "both" ) )
-        {
-            FAIL_IF_ERROR( width <= 0 || height <= 0, "invalid box resolution %sx%s\n",
-                           x264_otos( str_width, "<unset>" ), x264_otos( str_height, "<unset>" ) );
-        }
-        else if( !strcasecmp( fittobox, "width" ) )
-        {
-            FAIL_IF_ERROR( width <= 0, "invalid box width `%s'\n", x264_otos( str_width, "<unset>" ) );
-            height = INT_MAX;
-        }
-        else if( !strcasecmp( fittobox, "height" ) )
-        {
-            FAIL_IF_ERROR( height <= 0, "invalid box height `%s'\n", x264_otos( str_height, "<unset>" ) );
-            width = INT_MAX;
-        }
-        else FAIL_IF_ERROR( 1, "invalid fittobox mode `%s'\n", fittobox );
-
-        /* maximally fit the new coded resolution to the box */
-        const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp );
-        double width_units = (double)info->height * in_sar_h * out_sar_w;
-        double height_units = (double)info->width * in_sar_w * out_sar_h;
-        width = width / csp->mod_width * csp->mod_width;
-        height = height / csp->mod_height * csp->mod_height;
-        if( width * width_units > height * height_units )
-        {
-            int new_width = round( height * height_units / (width_units * csp->mod_width) );
-            new_width *= csp->mod_width;
-            width = X264_MIN( new_width, width );
-        }
-        else
-        {
-            int new_height = round( width * width_units / (height_units * csp->mod_height) );
-            new_height *= csp->mod_height;
-            height = X264_MIN( new_height, height );
-        }
-    }
-    else
-    {
-        if( str_width || str_height )
-        {
-            FAIL_IF_ERROR( width <= 0 || height <= 0, "invalid resolution %sx%s\n",
-                           x264_otos( str_width, "<unset>" ), x264_otos( str_height, "<unset>" ) );
-            if( !str_sar ) /* res only -> adjust sar */
-            {
-                /* new_sar = (new_h * old_w * old_sar_w) / (old_h * new_w * old_sar_h) */
-                uint64_t num = (uint64_t)info->width  * height;
-                uint64_t den = (uint64_t)info->height * width;
-                x264_reduce_fraction64( &num, &den );
-                out_sar_w = num * in_sar_w;
-                out_sar_h = den * in_sar_h;
-                x264_reduce_fraction( &out_sar_w, &out_sar_h );
-            }
-        }
-        else if( str_sar ) /* sar only -> adjust res */
-        {
-             const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp );
-             double width_units = (double)in_sar_h * out_sar_w;
-             double height_units = (double)in_sar_w * out_sar_h;
-             width  = info->width;
-             height = info->height;
-             if( width_units > height_units ) // SAR got wider, decrease width
-             {
-                 width = round( info->width * height_units / (width_units * csp->mod_width) );
-                 width *= csp->mod_width;
-             }
-             else // SAR got thinner, decrease height
-             {
-                 height = round( info->height * width_units / (height_units * csp->mod_height) );
-                 height *= csp->mod_height;
-             }
-        }
-        else /* csp only */
-        {
-            h->dst.width  = info->width;
-            h->dst.height = info->height;
-            csp_only = 1;
-        }
-    }
-    if( !csp_only )
-    {
-        info->sar_width  = out_sar_w;
-        info->sar_height = out_sar_h;
-        h->dst.width  = width;
-        h->dst.height = height;
-    }
-    return 0;
-}
-
-static int x264_init_sws_context( resizer_hnd_t *h )
-{
-    if( h->ctx )
-        sws_freeContext( h->ctx );
-    h->ctx = sws_alloc_context();
-    if( !h->ctx )
-        return -1;
-
-    av_opt_set_int( h->ctx, "sws_flags",  h->ctx_flags,   0 );
-    av_opt_set_int( h->ctx, "dstw",       h->dst.width,   0 );
-    av_opt_set_int( h->ctx, "dsth",       h->dst.height,  0 );
-    av_opt_set_int( h->ctx, "dst_format", h->dst.pix_fmt, 0 );
-    av_opt_set_int( h->ctx, "dst_range",  h->dst.range,   0 );
-
-    av_opt_set_int( h->ctx, "srcw",       h->scale.width,   0 );
-    av_opt_set_int( h->ctx, "srch",       h->scale.height,  0 );
-    av_opt_set_int( h->ctx, "src_format", h->scale.pix_fmt, 0 );
-    av_opt_set_int( h->ctx, "src_range",  h->scale.range,   0 );
-
-    /* FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */
-    sws_setColorspaceDetails( h->ctx,
-                              sws_getCoefficients( SWS_CS_DEFAULT ), h->scale.range,
-                              sws_getCoefficients( SWS_CS_DEFAULT ), h->dst.range,
-                              0, 1<<16, 1<<16 );
-
-    return sws_init_context( h->ctx, NULL, NULL ) < 0;
-}
-
-static int check_resizer( resizer_hnd_t *h, cli_pic_t *in )
-{
-    frame_prop_t input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ), h->input_range };
-    if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) )
-        return 0;
-    /* also warn if the resizer was initialized after the first frame */
-    if( h->ctx || h->working )
-        x264_cli_log( NAME, X264_LOG_WARNING, "stream properties changed at pts %"PRId64"\n", in->pts );
-    h->scale = input_prop;
-    if( !h->buffer_allocated )
-    {
-        if( x264_cli_pic_alloc_aligned( &h->buffer, h->dst_csp, h->dst.width, h->dst.height ) )
-            return -1;
-        h->buffer_allocated = 1;
-    }
-    FAIL_IF_ERROR( x264_init_sws_context( h ), "swscale init failed\n" );
-    return 0;
-}
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    /* if called for normalizing the csp to known formats and the format is not unknown, exit */
-    if( opt_string && !strcmp( opt_string, "normcsp" ) && !(info->csp&X264_CSP_OTHER) )
-        return 0;
-    /* if called by x264cli and nothing needs to be done, exit */
-    if( !opt_string && !full_check( info, param ) )
-        return 0;
-
-    static const char * const optlist[] = { "width", "height", "sar", "fittobox", "csp", "method", NULL };
-    char **opts = x264_split_options( opt_string, optlist );
-    if( !opts && opt_string )
-        return -1;
-
-    resizer_hnd_t *h = calloc( 1, sizeof(resizer_hnd_t) );
-    if( !h )
-        return -1;
-
-    h->ctx_flags = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) );
-
-    if( opts )
-    {
-        h->dst_csp    = info->csp;
-        h->dst.width  = info->width;
-        h->dst.height = info->height;
-        h->dst.range  = info->fullrange; // maintain input range
-        if( !strcmp( opt_string, "normcsp" ) )
-        {
-            free( opts );
-            /* only in normalization scenarios is the input capable of changing properties */
-            h->variable_input = 1;
-            h->dst_csp = pick_closest_supported_csp( info->csp );
-            FAIL_IF_ERROR( h->dst_csp == X264_CSP_NONE,
-                           "filter get invalid input pixel format %d (colorspace %d)\n", convert_csp_to_pix_fmt( info->csp ), info->csp );
-        }
-        else
-        {
-            int err = handle_opts( optlist, opts, info, h );
-            free( opts );
-            if( err )
-                return -1;
-        }
-    }
-    else
-    {
-        h->dst_csp    = param->i_csp;
-        h->dst.width  = param->i_width;
-        h->dst.height = param->i_height;
-        h->dst.range  = param->vui.b_fullrange; // change to libx264's range
-    }
-
-    if( h->ctx_flags != SWS_FAST_BILINEAR )
-        h->ctx_flags |= SWS_FULL_CHR_H_INT | SWS_FULL_CHR_H_INP | SWS_ACCURATE_RND;
-    h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
-    h->scale = h->dst;
-    h->input_range = info->fullrange;
-
-    /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */
-    int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER);
-    int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER);
-    h->pre_swap_chroma  = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24;
-    h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24;
-
-    int src_pix_fmt = convert_csp_to_pix_fmt( info->csp );
-
-    int src_pix_fmt_inv = convert_csp_to_pix_fmt( info->csp ^ X264_CSP_HIGH_DEPTH );
-    int dst_pix_fmt_inv = convert_csp_to_pix_fmt( h->dst_csp ^ X264_CSP_HIGH_DEPTH );
-
-    /* confirm swscale can support this conversion */
-    FAIL_IF_ERROR( src_pix_fmt == AV_PIX_FMT_NONE && src_pix_fmt_inv != AV_PIX_FMT_NONE,
-                   "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( src_pix_fmt_inv ),
-                   info->csp & X264_CSP_HIGH_DEPTH ? 16 : 8 );
-    FAIL_IF_ERROR( !sws_isSupportedInput( src_pix_fmt ), "input colorspace %s is not supported\n", av_get_pix_fmt_name( src_pix_fmt ) );
-    FAIL_IF_ERROR( h->dst.pix_fmt == AV_PIX_FMT_NONE && dst_pix_fmt_inv != AV_PIX_FMT_NONE,
-                   "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( dst_pix_fmt_inv ),
-                   h->dst_csp & X264_CSP_HIGH_DEPTH ? 16 : 8 );
-    FAIL_IF_ERROR( !sws_isSupportedOutput( h->dst.pix_fmt ), "output colorspace %s is not supported\n", av_get_pix_fmt_name( h->dst.pix_fmt ) );
-    FAIL_IF_ERROR( h->dst.height != info->height && info->interlaced,
-                   "swscale is not compatible with interlaced vertical resizing\n" );
-    /* confirm that the desired resolution meets the colorspace requirements */
-    const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp );
-    FAIL_IF_ERROR( h->dst.width % csp->mod_width || h->dst.height % csp->mod_height,
-                   "resolution %dx%d is not compliant with colorspace %s\n", h->dst.width, h->dst.height, csp->name );
-
-    if( h->dst.width != info->width || h->dst.height != info->height )
-        x264_cli_log( NAME, X264_LOG_INFO, "resizing to %dx%d\n", h->dst.width, h->dst.height );
-    if( h->dst.pix_fmt != src_pix_fmt )
-        x264_cli_log( NAME, X264_LOG_WARNING, "converting from %s to %s\n",
-                      av_get_pix_fmt_name( src_pix_fmt ), av_get_pix_fmt_name( h->dst.pix_fmt ) );
-    else if( h->dst.range != h->input_range )
-        x264_cli_log( NAME, X264_LOG_WARNING, "converting range from %s to %s\n",
-                      h->input_range ? "PC" : "TV", h->dst.range ? "PC" : "TV" );
-    h->dst_csp |= info->csp & X264_CSP_VFLIP; // preserve vflip
-
-    /* if the input is not variable, initialize the context */
-    if( !h->variable_input )
-    {
-        cli_pic_t input_pic = {{info->csp, info->width, info->height, 0}, 0};
-        if( check_resizer( h, &input_pic ) )
-            return -1;
-    }
-
-    /* finished initing, overwrite values */
-    info->csp       = h->dst_csp;
-    info->width     = h->dst.width;
-    info->height    = h->dst.height;
-    info->fullrange = h->dst.range;
-
-    h->prev_filter = *filter;
-    h->prev_hnd = *handle;
-    *handle = h;
-    *filter = resize_filter;
-
-    return 0;
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    resizer_hnd_t *h = handle;
-    if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) )
-        return -1;
-    if( h->variable_input && check_resizer( h, output ) )
-        return -1;
-    h->working = 1;
-    if( h->pre_swap_chroma )
-        XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] );
-    if( h->ctx )
-    {
-        sws_scale( h->ctx, (const uint8_t* const*)output->img.plane, output->img.stride,
-                   0, output->img.height, h->buffer.img.plane, h->buffer.img.stride );
-        output->img = h->buffer.img; /* copy img data */
-    }
-    else
-        output->img.csp = h->dst_csp;
-    if( h->post_swap_chroma )
-        XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] );
-
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    resizer_hnd_t *h = handle;
-    return h->prev_filter.release_frame( h->prev_hnd, pic, frame );
-}
-
-static void free_filter( hnd_t handle )
-{
-    resizer_hnd_t *h = handle;
-    h->prev_filter.free( h->prev_hnd );
-    if( h->ctx )
-        sws_freeContext( h->ctx );
-    if( h->buffer_allocated )
-        x264_cli_pic_clean( &h->buffer );
-    free( h );
-}
-
-#else /* no swscale */
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    int ret = 0;
-
-    if( !opt_string )
-        ret = full_check( info, param );
-    else
-    {
-        if( !strcmp( opt_string, "normcsp" ) )
-            ret = info->csp & X264_CSP_OTHER;
-        else
-            ret = -1;
-    }
-
-    /* pass if nothing needs to be done, otherwise fail */
-    FAIL_IF_ERROR( ret, "not compiled with swscale support\n" );
-    return 0;
-}
-
-#define help NULL
-#define get_frame NULL
-#define release_frame NULL
-#define free_filter NULL
-#define convert_csp_to_pix_fmt(x) (x & X264_CSP_MASK)
-
-#endif
-
-cli_vid_filter_t resize_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/select_every.c b/android/src/main/libenc/jni/libx264/filters/video/select_every.c
deleted file mode 100755
index 028a701..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/select_every.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*****************************************************************************
- * select_every.c: select-every video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-#define NAME "select_every"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ )
-
-#define MAX_PATTERN_SIZE 100 /* arbitrary */
-
-typedef struct
-{
-    hnd_t prev_hnd;
-    cli_vid_filter_t prev_filter;
-
-    int *pattern;
-    int pattern_len;
-    int step_size;
-    int vfr;
-    int64_t pts;
-} selvry_hnd_t;
-
-cli_vid_filter_t select_every_filter;
-
-static void help( int longhelp )
-{
-    printf( "      "NAME":step,offset1[,...]\n" );
-    if( !longhelp )
-        return;
-    printf( "            apply a selection pattern to input frames\n"
-            "            step: the number of frames in the pattern\n"
-            "            offsets: the offset into the step to select a frame\n"
-            "            see: http://avisynth.nl/index.php/Select#SelectEvery\n" );
-}
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    selvry_hnd_t *h = malloc( sizeof(selvry_hnd_t) );
-    if( !h )
-        return -1;
-    h->pattern_len = 0;
-    h->step_size = 0;
-    int offsets[MAX_PATTERN_SIZE];
-    for( char *tok, *p = opt_string; (tok = strtok( p, "," )); p = NULL )
-    {
-        int val = x264_otoi( tok, -1 );
-        if( p )
-        {
-            FAIL_IF_ERROR( val <= 0, "invalid step `%s'\n", tok );
-            h->step_size = val;
-            continue;
-        }
-        FAIL_IF_ERROR( val < 0 || val >= h->step_size, "invalid offset `%s'\n", tok );
-        FAIL_IF_ERROR( h->pattern_len >= MAX_PATTERN_SIZE, "max pattern size %d reached\n", MAX_PATTERN_SIZE );
-        offsets[h->pattern_len++] = val;
-    }
-    FAIL_IF_ERROR( !h->step_size, "no step size provided\n" );
-    FAIL_IF_ERROR( !h->pattern_len, "no offsets supplied\n" );
-
-    h->pattern = malloc( h->pattern_len * sizeof(int) );
-    if( !h->pattern )
-        return -1;
-    memcpy( h->pattern, offsets, h->pattern_len * sizeof(int) );
-
-    /* determine required cache size to maintain pattern. */
-    intptr_t max_rewind = 0;
-    int min = h->step_size;
-    for( int i = h->pattern_len-1; i >= 0; i-- )
-    {
-         min = X264_MIN( min, offsets[i] );
-         if( i )
-             max_rewind = X264_MAX( max_rewind, offsets[i-1] - min + 1 );
-         /* reached maximum rewind size */
-         if( max_rewind == h->step_size )
-             break;
-    }
-    if( x264_init_vid_filter( "cache", handle, filter, info, param, (void*)max_rewind ) )
-        return -1;
-
-    /* done initing, overwrite properties */
-    if( h->step_size != h->pattern_len )
-    {
-        info->num_frames = (uint64_t)info->num_frames * h->pattern_len / h->step_size;
-        info->fps_den *= h->step_size;
-        info->fps_num *= h->pattern_len;
-        x264_reduce_fraction( &info->fps_num, &info->fps_den );
-        if( info->vfr )
-        {
-            info->timebase_den *= h->pattern_len;
-            info->timebase_num *= h->step_size;
-            x264_reduce_fraction( &info->timebase_num, &info->timebase_den );
-        }
-    }
-
-    h->pts = 0;
-    h->vfr = info->vfr;
-    h->prev_filter = *filter;
-    h->prev_hnd = *handle;
-    *filter = select_every_filter;
-    *handle = h;
-
-    return 0;
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    selvry_hnd_t *h = handle;
-    int pat_frame = h->pattern[frame % h->pattern_len] + frame / h->pattern_len * h->step_size;
-    if( h->prev_filter.get_frame( h->prev_hnd, output, pat_frame ) )
-        return -1;
-    if( h->vfr )
-    {
-        output->pts = h->pts;
-        h->pts += output->duration;
-    }
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    selvry_hnd_t *h = handle;
-    int pat_frame = h->pattern[frame % h->pattern_len] + frame / h->pattern_len * h->step_size;
-    return h->prev_filter.release_frame( h->prev_hnd, pic, pat_frame );
-}
-
-static void free_filter( hnd_t handle )
-{
-    selvry_hnd_t *h = handle;
-    h->prev_filter.free( h->prev_hnd );
-    free( h->pattern );
-    free( h );
-}
-
-cli_vid_filter_t select_every_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/source.c b/android/src/main/libenc/jni/libx264/filters/video/source.c
deleted file mode 100755
index d4a47df..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/source.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*****************************************************************************
- * source.c: source video filter
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-
-/* This filter converts the demuxer API into the filtering API for video frames.
- * Backseeking is prohibited here as not all demuxers are capable of doing so. */
-
-typedef struct
-{
-    cli_pic_t pic;
-    hnd_t hin;
-    int cur_frame;
-} source_hnd_t;
-
-cli_vid_filter_t source_filter;
-
-static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    source_hnd_t *h = calloc( 1, sizeof(source_hnd_t) );
-    if( !h )
-        return -1;
-    h->cur_frame = -1;
-
-    if( cli_input.picture_alloc( &h->pic, *handle, info->csp, info->width, info->height ) )
-        return -1;
-
-    h->hin = *handle;
-    *handle = h;
-    *filter = source_filter;
-
-    return 0;
-}
-
-static int get_frame( hnd_t handle, cli_pic_t *output, int frame )
-{
-    source_hnd_t *h = handle;
-    /* do not allow requesting of frames from before the current position */
-    if( frame <= h->cur_frame || cli_input.read_frame( &h->pic, h->hin, frame ) )
-        return -1;
-    h->cur_frame = frame;
-    *output = h->pic;
-    return 0;
-}
-
-static int release_frame( hnd_t handle, cli_pic_t *pic, int frame )
-{
-    source_hnd_t *h = handle;
-    if( cli_input.release_frame && cli_input.release_frame( &h->pic, h->hin ) )
-        return -1;
-    return 0;
-}
-
-static void free_filter( hnd_t handle )
-{
-    source_hnd_t *h = handle;
-    cli_input.picture_clean( &h->pic, h->hin );
-    cli_input.close_file( h->hin );
-    free( h );
-}
-
-cli_vid_filter_t source_filter = { "source", NULL, init, get_frame, release_frame, free_filter, NULL };
diff --git a/android/src/main/libenc/jni/libx264/filters/video/video.c b/android/src/main/libenc/jni/libx264/filters/video/video.c
deleted file mode 100755
index 0879f31..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/video.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*****************************************************************************
- * video.c: video filters
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "video.h"
-
-static cli_vid_filter_t *first_filter = NULL;
-
-static void register_vid_filter( cli_vid_filter_t *new_filter )
-{
-    cli_vid_filter_t *filter_i = first_filter;
-    while( filter_i->next )
-        filter_i = filter_i->next;
-    filter_i->next = new_filter;
-    new_filter->next = NULL;
-}
-
-#define REGISTER_VFILTER(name)\
-{\
-    extern cli_vid_filter_t name##_filter;\
-    register_vid_filter( &name##_filter );\
-}
-
-void x264_register_vid_filters( void )
-{
-    extern cli_vid_filter_t source_filter;
-    first_filter = &source_filter;
-    REGISTER_VFILTER( cache );
-    REGISTER_VFILTER( crop );
-    REGISTER_VFILTER( fix_vfr_pts );
-    REGISTER_VFILTER( resize );
-    REGISTER_VFILTER( select_every );
-    REGISTER_VFILTER( depth );
-#if HAVE_GPL
-#endif
-}
-
-int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,
-                          video_info_t *info, x264_param_t *param, char *opt_string )
-{
-    cli_vid_filter_t *filter_i = first_filter;
-    while( filter_i && strcasecmp( name, filter_i->name ) )
-        filter_i = filter_i->next;
-    FAIL_IF_ERR( !filter_i, "x264", "invalid filter `%s'\n", name );
-    if( filter_i->init( handle, filter, info, param, opt_string ) )
-        return -1;
-
-    return 0;
-}
-
-void x264_vid_filter_help( int longhelp )
-{
-    for( cli_vid_filter_t *filter_i = first_filter; filter_i; filter_i = filter_i->next )
-        if( filter_i->help )
-            filter_i->help( longhelp );
-}
diff --git a/android/src/main/libenc/jni/libx264/filters/video/video.h b/android/src/main/libenc/jni/libx264/filters/video/video.h
deleted file mode 100755
index 34f1e20..0000000
--- a/android/src/main/libenc/jni/libx264/filters/video/video.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*****************************************************************************
- * video.h: video filters
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_FILTER_VIDEO_H
-#define X264_FILTER_VIDEO_H
-
-#include "input/input.h"
-#include "filters/filters.h"
-
-typedef struct cli_vid_filter_t cli_vid_filter_t;
-
-struct cli_vid_filter_t
-{
-    /* name of the filter */
-    const char *name;
-    /* help: a short message on what the filter does and how to use it.
-     * this should only be implemented by filters directly accessible by the user */
-    void (*help)( int longhelp );
-    /* init: initializes the filter given the input clip properties and parameter to adjust them as necessary
-     * with the given options provided by the user.
-     * returns 0 on success, nonzero on error. */
-    int (*init)( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string );
-    /* get_frame: given the storage for the output frame and desired frame number, generate the frame accordingly.
-     * the image data returned by get_frame should be treated as const and not be altered.
-     * returns 0 on success, nonzero on error. */
-    int (*get_frame)( hnd_t handle, cli_pic_t *output, int frame );
-    /* release_frame: frame is done being used and is signaled for cleanup.
-     * returns 0 on succeess, nonzero on error. */
-    int (*release_frame)( hnd_t handle, cli_pic_t *pic, int frame );
-    /* free: run filter cleanup procedures. */
-    void (*free)( hnd_t handle );
-    /* next registered filter, unused by filters themselves */
-    cli_vid_filter_t *next;
-};
-
-void x264_register_vid_filters( void );
-void x264_vid_filter_help( int longhelp );
-int  x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter,
-                           video_info_t *info, x264_param_t *param, char *opt_string );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/input/avs.c b/android/src/main/libenc/jni/libx264/input/avs.c
deleted file mode 100755
index a872681..0000000
--- a/android/src/main/libenc/jni/libx264/input/avs.c
+++ /dev/null
@@ -1,524 +0,0 @@
-/*****************************************************************************
- * avs.c: avisynth input
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *          Anton Mitrofanov <BugMaster@narod.ru>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-#if USE_AVXSYNTH
-#include <dlfcn.h>
-#if SYS_MACOSX
-#define avs_open() dlopen( "libavxsynth.dylib", RTLD_NOW )
-#else
-#define avs_open() dlopen( "libavxsynth.so", RTLD_NOW )
-#endif
-#define avs_close dlclose
-#define avs_address dlsym
-#else
-#define avs_open() LoadLibraryW( L"avisynth" )
-#define avs_close FreeLibrary
-#define avs_address GetProcAddress
-#endif
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ )
-
-#define AVSC_NO_DECLSPEC
-#undef EXTERN_C
-#if USE_AVXSYNTH
-#include "extras/avxsynth_c.h"
-#else
-#include "extras/avisynth_c.h"
-#endif
-#define AVSC_DECLARE_FUNC(name) name##_func name
-
-/* AVS uses a versioned interface to control backwards compatibility */
-/* YV12 support is required, which was added in 2.5 */
-#define AVS_INTERFACE_25 2
-
-#if HAVE_SWSCALE
-#include <libavutil/pixfmt.h>
-#endif
-
-/* maximum size of the sequence of filters to try on non script files */
-#define AVS_MAX_SEQUENCE 5
-
-#define LOAD_AVS_FUNC(name, continue_on_fail)\
-{\
-    h->func.name = (void*)avs_address( h->library, #name );\
-    if( !continue_on_fail && !h->func.name )\
-        goto fail;\
-}
-
-#define LOAD_AVS_FUNC_ALIAS(name, alias, continue_on_fail)\
-{\
-    if( !h->func.name )\
-        h->func.name = (void*)avs_address( h->library, alias );\
-    if( !continue_on_fail && !h->func.name )\
-        goto fail;\
-}
-
-typedef struct
-{
-    AVS_Clip *clip;
-    AVS_ScriptEnvironment *env;
-    void *library;
-    int num_frames;
-    struct
-    {
-        AVSC_DECLARE_FUNC( avs_clip_get_error );
-        AVSC_DECLARE_FUNC( avs_create_script_environment );
-        AVSC_DECLARE_FUNC( avs_delete_script_environment );
-        AVSC_DECLARE_FUNC( avs_get_error );
-        AVSC_DECLARE_FUNC( avs_get_frame );
-        AVSC_DECLARE_FUNC( avs_get_video_info );
-        AVSC_DECLARE_FUNC( avs_function_exists );
-        AVSC_DECLARE_FUNC( avs_invoke );
-        AVSC_DECLARE_FUNC( avs_release_clip );
-        AVSC_DECLARE_FUNC( avs_release_value );
-        AVSC_DECLARE_FUNC( avs_release_video_frame );
-        AVSC_DECLARE_FUNC( avs_take_clip );
-#if !USE_AVXSYNTH
-        // AviSynth+ extension
-        AVSC_DECLARE_FUNC( avs_is_rgb48 );
-        AVSC_DECLARE_FUNC( avs_is_rgb64 );
-        AVSC_DECLARE_FUNC( avs_is_yuv444p16 );
-        AVSC_DECLARE_FUNC( avs_is_yuv422p16 );
-        AVSC_DECLARE_FUNC( avs_is_yuv420p16 );
-        AVSC_DECLARE_FUNC( avs_is_y16 );
-        AVSC_DECLARE_FUNC( avs_is_yuv444ps );
-        AVSC_DECLARE_FUNC( avs_is_yuv422ps );
-        AVSC_DECLARE_FUNC( avs_is_yuv420ps );
-        AVSC_DECLARE_FUNC( avs_is_y32 );
-        AVSC_DECLARE_FUNC( avs_is_444 );
-        AVSC_DECLARE_FUNC( avs_is_422 );
-        AVSC_DECLARE_FUNC( avs_is_420 );
-        AVSC_DECLARE_FUNC( avs_is_y );
-        AVSC_DECLARE_FUNC( avs_is_yuva );
-        AVSC_DECLARE_FUNC( avs_is_planar_rgb );
-        AVSC_DECLARE_FUNC( avs_is_planar_rgba );
-        AVSC_DECLARE_FUNC( avs_num_components );
-        AVSC_DECLARE_FUNC( avs_component_size );
-        AVSC_DECLARE_FUNC( avs_bits_per_component );
-#endif
-    } func;
-} avs_hnd_t;
-
-/* load the library and functions we require from it */
-static int x264_avs_load_library( avs_hnd_t *h )
-{
-    h->library = avs_open();
-    if( !h->library )
-        return -1;
-    LOAD_AVS_FUNC( avs_clip_get_error, 0 );
-    LOAD_AVS_FUNC( avs_create_script_environment, 0 );
-    LOAD_AVS_FUNC( avs_delete_script_environment, 1 );
-    LOAD_AVS_FUNC( avs_get_error, 1 );
-    LOAD_AVS_FUNC( avs_get_frame, 0 );
-    LOAD_AVS_FUNC( avs_get_video_info, 0 );
-    LOAD_AVS_FUNC( avs_function_exists, 0 );
-    LOAD_AVS_FUNC( avs_invoke, 0 );
-    LOAD_AVS_FUNC( avs_release_clip, 0 );
-    LOAD_AVS_FUNC( avs_release_value, 0 );
-    LOAD_AVS_FUNC( avs_release_video_frame, 0 );
-    LOAD_AVS_FUNC( avs_take_clip, 0 );
-#if !USE_AVXSYNTH
-    // AviSynth+ extension
-    LOAD_AVS_FUNC( avs_is_rgb48, 1 );
-    LOAD_AVS_FUNC_ALIAS( avs_is_rgb48, "_avs_is_rgb48@4", 1 );
-    LOAD_AVS_FUNC( avs_is_rgb64, 1 );
-    LOAD_AVS_FUNC_ALIAS( avs_is_rgb64, "_avs_is_rgb64@4", 1 );
-    LOAD_AVS_FUNC( avs_is_yuv444p16, 1 );
-    LOAD_AVS_FUNC( avs_is_yuv422p16, 1 );
-    LOAD_AVS_FUNC( avs_is_yuv420p16, 1 );
-    LOAD_AVS_FUNC( avs_is_y16, 1 );
-    LOAD_AVS_FUNC( avs_is_yuv444ps, 1 );
-    LOAD_AVS_FUNC( avs_is_yuv422ps, 1 );
-    LOAD_AVS_FUNC( avs_is_yuv420ps, 1 );
-    LOAD_AVS_FUNC( avs_is_y32, 1 );
-    LOAD_AVS_FUNC( avs_is_444, 1 );
-    LOAD_AVS_FUNC( avs_is_422, 1 );
-    LOAD_AVS_FUNC( avs_is_420, 1 );
-    LOAD_AVS_FUNC( avs_is_y, 1 );
-    LOAD_AVS_FUNC( avs_is_yuva, 1 );
-    LOAD_AVS_FUNC( avs_is_planar_rgb, 1 );
-    LOAD_AVS_FUNC( avs_is_planar_rgba, 1 );
-    LOAD_AVS_FUNC( avs_num_components, 1 );
-    LOAD_AVS_FUNC( avs_component_size, 1 );
-    LOAD_AVS_FUNC( avs_bits_per_component, 1 );
-#endif
-    return 0;
-fail:
-    avs_close( h->library );
-    h->library = NULL;
-    return -1;
-}
-
-/* AvxSynth doesn't have yv24, yv16, yv411, or y8, so disable them. */
-#if USE_AVXSYNTH
-#define avs_is_yv24( vi ) (0)
-#define avs_is_yv16( vi ) (0)
-#define avs_is_yv411( vi ) (0)
-#define avs_is_y8( vi ) (0)
-/* AvxSynth doesn't support AviSynth+ pixel types. */
-#define AVS_IS_AVISYNTHPLUS (0)
-#define AVS_IS_420( vi ) (0)
-#define AVS_IS_422( vi ) (0)
-#define AVS_IS_444( vi ) (0)
-#define AVS_IS_RGB48( vi ) (0)
-#define AVS_IS_RGB64( vi ) (0)
-#define AVS_IS_YUV420P16( vi ) (0)
-#define AVS_IS_YUV422P16( vi ) (0)
-#define AVS_IS_YUV444P16( vi ) (0)
-#else
-#define AVS_IS_AVISYNTHPLUS (h->func.avs_is_420 && h->func.avs_is_422 && h->func.avs_is_444)
-#define AVS_IS_420( vi ) (h->func.avs_is_420 ? h->func.avs_is_420( vi ) : avs_is_yv12( vi ))
-#define AVS_IS_422( vi ) (h->func.avs_is_422 ? h->func.avs_is_422( vi ) : avs_is_yv16( vi ))
-#define AVS_IS_444( vi ) (h->func.avs_is_444 ? h->func.avs_is_444( vi ) : avs_is_yv24( vi ))
-#define AVS_IS_RGB48( vi ) (h->func.avs_is_rgb48 && h->func.avs_is_rgb48( vi ))
-#define AVS_IS_RGB64( vi ) (h->func.avs_is_rgb64 && h->func.avs_is_rgb64( vi ))
-#define AVS_IS_YUV420P16( vi ) (h->func.avs_is_yuv420p16 && h->func.avs_is_yuv420p16( vi ))
-#define AVS_IS_YUV422P16( vi ) (h->func.avs_is_yuv422p16 && h->func.avs_is_yuv422p16( vi ))
-#define AVS_IS_YUV444P16( vi ) (h->func.avs_is_yuv444p16 && h->func.avs_is_yuv444p16( vi ))
-#endif
-
-/* generate a filter sequence to try based on the filename extension */
-static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] )
-{
-    int i = 0;
-#if USE_AVXSYNTH
-    const char *all_purpose[] = { "FFVideoSource", 0 };
-#else
-    const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 };
-    if( !strcasecmp( filename_ext, "avi" ) )
-        filter[i++] = "AVISource";
-    if( !strcasecmp( filename_ext, "d2v" ) )
-        filter[i++] = "MPEG2Source";
-    if( !strcasecmp( filename_ext, "dga" ) )
-        filter[i++] = "AVCSource";
-#endif
-    for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ )
-        filter[i++] = all_purpose[j];
-}
-
-static AVS_Value update_clip( avs_hnd_t *h, const AVS_VideoInfo **vi, AVS_Value res, AVS_Value release )
-{
-    h->func.avs_release_clip( h->clip );
-    h->clip = h->func.avs_take_clip( res, h->env );
-    h->func.avs_release_value( release );
-    *vi = h->func.avs_get_video_info( h->clip );
-    return res;
-}
-
-static float get_avs_version( avs_hnd_t *h )
-{
-/* AvxSynth has its version defined starting at 4.0, even though it's based on
-   AviSynth 2.5.8. This is troublesome for get_avs_version and working around
-   the new colorspaces in 2.6.  So if AvxSynth is detected, explicitly define
-   the version as 2.58. */
-#if USE_AVXSYNTH
-    return 2.58f;
-#else
-    FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" );
-    AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL );
-    FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) );
-    FAIL_IF_ERROR( !avs_is_float( ver ), "VersionNumber did not return a float value\n" );
-    float ret = avs_as_float( ver );
-    h->func.avs_release_value( ver );
-    return ret;
-#endif
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    FILE *fh = x264_fopen( psz_filename, "r" );
-    if( !fh )
-        return -1;
-    int b_regular = x264_is_regular_file( fh );
-    fclose( fh );
-    FAIL_IF_ERROR( !b_regular, "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
-
-    avs_hnd_t *h = calloc( 1, sizeof(avs_hnd_t) );
-    if( !h )
-        return -1;
-    FAIL_IF_ERROR( x264_avs_load_library( h ), "failed to load avisynth\n" );
-    h->env = h->func.avs_create_script_environment( AVS_INTERFACE_25 );
-    if( h->func.avs_get_error )
-    {
-        const char *error = h->func.avs_get_error( h->env );
-        FAIL_IF_ERROR( error, "%s\n", error );
-    }
-    float avs_version = get_avs_version( h );
-    if( avs_version <= 0 )
-        return -1;
-    x264_cli_log( "avs", X264_LOG_DEBUG, "using avisynth version %.2f\n", avs_version );
-
-#ifdef _WIN32
-    /* Avisynth doesn't support Unicode filenames. */
-    char ansi_filename[MAX_PATH];
-    FAIL_IF_ERROR( !x264_ansi_filename( psz_filename, ansi_filename, MAX_PATH, 0 ), "invalid ansi filename\n" );
-    AVS_Value arg = avs_new_value_string( ansi_filename );
-#else
-    AVS_Value arg = avs_new_value_string( psz_filename );
-#endif
-
-    AVS_Value res;
-    char *filename_ext = get_filename_extension( psz_filename );
-
-    if( !strcasecmp( filename_ext, "avs" ) )
-    {
-        res = h->func.avs_invoke( h->env, "Import", arg, NULL );
-        FAIL_IF_ERROR( avs_is_error( res ), "%s\n", avs_as_string( res ) );
-        /* check if the user is using a multi-threaded script and apply distributor if necessary.
-           adapted from avisynth's vfw interface */
-        AVS_Value mt_test = h->func.avs_invoke( h->env, "GetMTMode", avs_new_value_bool( 0 ), NULL );
-        int mt_mode = avs_is_int( mt_test ) ? avs_as_int( mt_test ) : 0;
-        h->func.avs_release_value( mt_test );
-        if( mt_mode > 0 && mt_mode < 5 )
-        {
-            AVS_Value temp = h->func.avs_invoke( h->env, "Distributor", res, NULL );
-            h->func.avs_release_value( res );
-            res = temp;
-        }
-    }
-    else /* non script file */
-    {
-        /* cycle through known source filters to find one that works */
-        const char *filter[AVS_MAX_SEQUENCE+1] = { 0 };
-        avs_build_filter_sequence( filename_ext, filter );
-        int i;
-        for( i = 0; filter[i]; i++ )
-        {
-            x264_cli_log( "avs", X264_LOG_INFO, "trying %s... ", filter[i] );
-            if( !h->func.avs_function_exists( h->env, filter[i] ) )
-            {
-                x264_cli_printf( X264_LOG_INFO, "not found\n" );
-                continue;
-            }
-            if( !strncasecmp( filter[i], "FFmpegSource", 12 ) )
-            {
-                x264_cli_printf( X264_LOG_INFO, "indexing... " );
-                fflush( stderr );
-            }
-            res = h->func.avs_invoke( h->env, filter[i], arg, NULL );
-            if( !avs_is_error( res ) )
-            {
-                x264_cli_printf( X264_LOG_INFO, "succeeded\n" );
-                break;
-            }
-            x264_cli_printf( X264_LOG_INFO, "failed\n" );
-        }
-        FAIL_IF_ERROR( !filter[i], "unable to find source filter to open `%s'\n", psz_filename );
-    }
-    FAIL_IF_ERROR( !avs_is_clip( res ), "`%s' didn't return a video clip\n", psz_filename );
-    h->clip = h->func.avs_take_clip( res, h->env );
-    const AVS_VideoInfo *vi = h->func.avs_get_video_info( h->clip );
-    FAIL_IF_ERROR( !avs_has_video( vi ), "`%s' has no video data\n", psz_filename );
-    /* if the clip is made of fields instead of frames, call weave to make them frames */
-    if( avs_is_field_based( vi ) )
-    {
-        x264_cli_log( "avs", X264_LOG_WARNING, "detected fieldbased (separated) input, weaving to frames\n" );
-        AVS_Value tmp = h->func.avs_invoke( h->env, "Weave", res, NULL );
-        FAIL_IF_ERROR( avs_is_error( tmp ), "couldn't weave fields into frames\n" );
-        res = update_clip( h, &vi, tmp, res );
-        info->interlaced = 1;
-        info->tff = avs_is_tff( vi );
-    }
-#if !HAVE_SWSCALE
-    /* if swscale is not available, convert the CSP if necessary */
-    FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444),
-                   "avisynth >= 2.6 is required for i422/i444 output\n" );
-    if( (opt->output_csp == X264_CSP_I420 && !AVS_IS_420( vi )) ||
-        (opt->output_csp == X264_CSP_I422 && !AVS_IS_422( vi )) ||
-        (opt->output_csp == X264_CSP_I444 && !AVS_IS_444( vi )) ||
-        (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) )
-    {
-        const char *csp;
-        if( AVS_IS_AVISYNTHPLUS )
-        {
-            csp = opt->output_csp == X264_CSP_I420 ? "YUV420" :
-                  opt->output_csp == X264_CSP_I422 ? "YUV422" :
-                  opt->output_csp == X264_CSP_I444 ? "YUV444" :
-                  "RGB";
-        }
-        else
-        {
-            csp = opt->output_csp == X264_CSP_I420 ? "YV12" :
-                  opt->output_csp == X264_CSP_I422 ? "YV16" :
-                  opt->output_csp == X264_CSP_I444 ? "YV24" :
-                  "RGB";
-        }
-        x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp );
-        FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1),
-                       "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height );
-        FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3),
-                       "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height );
-        FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1),
-                       "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height );
-        char conv_func[16];
-        snprintf( conv_func, sizeof(conv_func), "ConvertTo%s", csp );
-        char matrix[7] = "";
-        int arg_count = 2;
-        /* if doing a rgb <-> yuv conversion then range is handled via 'matrix'. though it's only supported in 2.56+ */
-        if( avs_version >= 2.56f && ((opt->output_csp == X264_CSP_RGB && avs_is_yuv( vi )) || (opt->output_csp != X264_CSP_RGB && avs_is_rgb( vi ))) )
-        {
-            // if converting from yuv, then we specify the matrix for the input, otherwise use the output's.
-            int use_pc_matrix = avs_is_yuv( vi ) ? opt->input_range == RANGE_PC : opt->output_range == RANGE_PC;
-            snprintf( matrix, sizeof(matrix), "%s601", use_pc_matrix ? "PC." : "Rec" ); /* FIXME: use correct coefficients */
-            arg_count++;
-            // notification that the input range has changed to the desired one
-            opt->input_range = opt->output_range;
-        }
-        const char *arg_name[] = { NULL, "interlaced", "matrix" };
-        AVS_Value arg_arr[3];
-        arg_arr[0] = res;
-        arg_arr[1] = avs_new_value_bool( info->interlaced );
-        arg_arr[2] = avs_new_value_string( matrix );
-        AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name );
-        FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp );
-        res = update_clip( h, &vi, res2, res );
-    }
-    /* if swscale is not available, change the range if necessary. This only applies to YUV-based CSPs however */
-    if( avs_is_yuv( vi ) && opt->output_range != RANGE_AUTO && ((opt->input_range == RANGE_PC) != opt->output_range) )
-    {
-        const char *levels = opt->output_range ? "TV->PC" : "PC->TV";
-        x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels );
-        AVS_Value arg_arr[2];
-        arg_arr[0] = res;
-        arg_arr[1] = avs_new_value_string( levels );
-        const char *arg_name[] = { NULL, "levels" };
-        AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name );
-        FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) );
-        res = update_clip( h, &vi, res2, res );
-        // notification that the input range has changed to the desired one
-        opt->input_range = opt->output_range;
-    }
-#endif
-
-    h->func.avs_release_value( res );
-
-    info->width   = vi->width;
-    info->height  = vi->height;
-    info->fps_num = vi->fps_numerator;
-    info->fps_den = vi->fps_denominator;
-    h->num_frames = info->num_frames = vi->num_frames;
-    info->thread_safe = 1;
-    if( AVS_IS_RGB64( vi ) )
-        info->csp = X264_CSP_BGRA | X264_CSP_VFLIP | X264_CSP_HIGH_DEPTH;
-    else if( avs_is_rgb32( vi ) )
-        info->csp = X264_CSP_BGRA | X264_CSP_VFLIP;
-    else if( AVS_IS_RGB48( vi ) )
-        info->csp = X264_CSP_BGR | X264_CSP_VFLIP | X264_CSP_HIGH_DEPTH;
-    else if( avs_is_rgb24( vi ) )
-        info->csp = X264_CSP_BGR | X264_CSP_VFLIP;
-    else if( AVS_IS_YUV444P16( vi ) )
-        info->csp = X264_CSP_I444 | X264_CSP_HIGH_DEPTH;
-    else if( avs_is_yv24( vi ) )
-        info->csp = X264_CSP_I444;
-    else if( AVS_IS_YUV422P16( vi ) )
-        info->csp = X264_CSP_I422 | X264_CSP_HIGH_DEPTH;
-    else if( avs_is_yv16( vi ) )
-        info->csp = X264_CSP_I422;
-    else if( AVS_IS_YUV420P16( vi ) )
-        info->csp = X264_CSP_I420 | X264_CSP_HIGH_DEPTH;
-    else if( avs_is_yv12( vi ) )
-        info->csp = X264_CSP_I420;
-#if HAVE_SWSCALE
-    else if( avs_is_yuy2( vi ) )
-        info->csp = AV_PIX_FMT_YUYV422 | X264_CSP_OTHER;
-    else if( avs_is_yv411( vi ) )
-        info->csp = AV_PIX_FMT_YUV411P | X264_CSP_OTHER;
-    else if( avs_is_y8( vi ) )
-        info->csp = AV_PIX_FMT_GRAY8 | X264_CSP_OTHER;
-#endif
-    else
-    {
-        AVS_Value pixel_type = h->func.avs_invoke( h->env, "PixelType", res, NULL );
-        const char *pixel_type_name = avs_is_string( pixel_type ) ? avs_as_string( pixel_type ) : "unknown";
-        FAIL_IF_ERROR( 1, "not supported pixel type: %s\n", pixel_type_name );
-    }
-    info->vfr = 0;
-
-    *p_handle = h;
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
-        return -1;
-    pic->img.csp = csp;
-    const x264_cli_csp_t *cli_csp = x264_cli_get_csp( csp );
-    if( cli_csp )
-        pic->img.planes = cli_csp->planes;
-#if HAVE_SWSCALE
-    else if( csp == (AV_PIX_FMT_YUV411P | X264_CSP_OTHER) )
-        pic->img.planes = 3;
-    else
-        pic->img.planes = 1; //y8 and yuy2 are one plane
-#endif
-    return 0;
-}
-
-static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
-{
-    static const int plane[3] = { AVS_PLANAR_Y, AVS_PLANAR_U, AVS_PLANAR_V };
-    avs_hnd_t *h = handle;
-    if( i_frame >= h->num_frames )
-        return -1;
-    AVS_VideoFrame *frm = pic->opaque = h->func.avs_get_frame( h->clip, i_frame );
-    const char *err = h->func.avs_clip_get_error( h->clip );
-    FAIL_IF_ERROR( err, "%s occurred while reading frame %d\n", err, i_frame );
-    for( int i = 0; i < pic->img.planes; i++ )
-    {
-        /* explicitly cast away the const attribute to avoid a warning */
-        pic->img.plane[i] = (uint8_t*)avs_get_read_ptr_p( frm, plane[i] );
-        pic->img.stride[i] = avs_get_pitch_p( frm, plane[i] );
-    }
-    return 0;
-}
-
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    avs_hnd_t *h = handle;
-    h->func.avs_release_video_frame( pic->opaque );
-    return 0;
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    memset( pic, 0, sizeof(cli_pic_t) );
-}
-
-static int close_file( hnd_t handle )
-{
-    avs_hnd_t *h = handle;
-    if( h->func.avs_release_clip && h->clip )
-        h->func.avs_release_clip( h->clip );
-    if( h->func.avs_delete_script_environment && h->env )
-        h->func.avs_delete_script_environment( h->env );
-    if( h->library )
-        avs_close( h->library );
-    free( h );
-    return 0;
-}
-
-const cli_input_t avs_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/input/ffms.c b/android/src/main/libenc/jni/libx264/input/ffms.c
deleted file mode 100755
index 6723370..0000000
--- a/android/src/main/libenc/jni/libx264/input/ffms.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*****************************************************************************
- * ffms.c: ffmpegsource input
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
- *          Steven Walters <kemuri9@gmail.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-#include <ffms.h>
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "ffms", __VA_ARGS__ )
-
-#undef DECLARE_ALIGNED
-#include <libavcodec/avcodec.h>
-#include <libswscale/swscale.h>
-
-#define PROGRESS_LENGTH 36
-
-typedef struct
-{
-    FFMS_VideoSource *video_source;
-    FFMS_Track *track;
-    int reduce_pts;
-    int vfr_input;
-    int num_frames;
-    int64_t time;
-} ffms_hnd_t;
-
-static int FFMS_CC update_progress( int64_t current, int64_t total, void *private )
-{
-    int64_t *update_time = private;
-    int64_t oldtime = *update_time;
-    int64_t newtime = x264_mdate();
-    if( oldtime && newtime - oldtime < UPDATE_INTERVAL )
-        return 0;
-    *update_time = newtime;
-
-    char buf[PROGRESS_LENGTH+5+1];
-    snprintf( buf, sizeof(buf), "ffms [info]: indexing input file [%.1f%%]", 100.0 * current / total );
-    fprintf( stderr, "%-*s\r", PROGRESS_LENGTH, buf+5 );
-    x264_cli_set_console_title( buf );
-    fflush( stderr );
-    return 0;
-}
-
-/* handle the deprecated jpeg pixel formats */
-static int handle_jpeg( int csp, int *fullrange )
-{
-    switch( csp )
-    {
-        case AV_PIX_FMT_YUVJ420P: *fullrange = 1; return AV_PIX_FMT_YUV420P;
-        case AV_PIX_FMT_YUVJ422P: *fullrange = 1; return AV_PIX_FMT_YUV422P;
-        case AV_PIX_FMT_YUVJ444P: *fullrange = 1; return AV_PIX_FMT_YUV444P;
-        default:                               return csp;
-    }
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) );
-    if( !h )
-        return -1;
-
-    FFMS_Init( 0, 1 );
-    FFMS_ErrorInfo e;
-    e.BufferSize = 0;
-    int seekmode = opt->seek ? FFMS_SEEK_NORMAL : FFMS_SEEK_LINEAR_NO_RW;
-
-    FFMS_Index *idx = NULL;
-    if( opt->index_file )
-    {
-        x264_struct_stat index_s, input_s;
-        if( !x264_stat( opt->index_file, &index_s ) && !x264_stat( psz_filename, &input_s ) && input_s.st_mtime < index_s.st_mtime )
-        {
-            idx = FFMS_ReadIndex( opt->index_file, &e );
-            if( idx && FFMS_IndexBelongsToFile( idx, psz_filename, &e ) )
-            {
-                FFMS_DestroyIndex( idx );
-                idx = NULL;
-            }
-        }
-    }
-    if( !idx )
-    {
-        FFMS_Indexer *indexer = FFMS_CreateIndexer( psz_filename, &e );
-        FAIL_IF_ERROR( !indexer, "could not create indexer\n" );
-
-        if( opt->progress )
-            FFMS_SetProgressCallback( indexer, update_progress, &h->time );
-
-        idx = FFMS_DoIndexing2( indexer, FFMS_IEH_ABORT, &e );
-        fprintf( stderr, "%*c", PROGRESS_LENGTH+1, '\r' );
-        FAIL_IF_ERROR( !idx, "could not create index\n" );
-
-        if( opt->index_file && FFMS_WriteIndex( opt->index_file, idx, &e ) )
-            x264_cli_log( "ffms", X264_LOG_WARNING, "could not write index file\n" );
-    }
-
-    int trackno = FFMS_GetFirstTrackOfType( idx, FFMS_TYPE_VIDEO, &e );
-    if( trackno >= 0 )
-        h->video_source = FFMS_CreateVideoSource( psz_filename, trackno, idx, 1, seekmode, &e );
-    FFMS_DestroyIndex( idx );
-
-    FAIL_IF_ERROR( trackno < 0, "could not find video track\n" );
-    FAIL_IF_ERROR( !h->video_source, "could not create video source\n" );
-
-    const FFMS_VideoProperties *videop = FFMS_GetVideoProperties( h->video_source );
-    info->num_frames   = h->num_frames = videop->NumFrames;
-    info->sar_height   = videop->SARDen;
-    info->sar_width    = videop->SARNum;
-    info->fps_den      = videop->FPSDenominator;
-    info->fps_num      = videop->FPSNumerator;
-    h->vfr_input       = info->vfr;
-    /* ffms is thread unsafe as it uses a single frame buffer for all frame requests */
-    info->thread_safe  = 0;
-
-    const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
-    FAIL_IF_ERROR( !frame, "could not read frame 0\n" );
-
-    info->fullrange  = 0;
-    info->width      = frame->EncodedWidth;
-    info->height     = frame->EncodedHeight;
-    info->csp        = handle_jpeg( frame->EncodedPixelFormat, &info->fullrange ) | X264_CSP_OTHER;
-    info->interlaced = frame->InterlacedFrame;
-    info->tff        = frame->TopFieldFirst;
-    info->fullrange |= frame->ColorRange == FFMS_CR_JPEG;
-
-    /* ffms timestamps are in milliseconds. ffms also uses int64_ts for timebase,
-     * so we need to reduce large timebases to prevent overflow */
-    if( h->vfr_input )
-    {
-        h->track = FFMS_GetTrackFromVideo( h->video_source );
-        const FFMS_TrackTimeBase *timebase = FFMS_GetTimeBase( h->track );
-        int64_t timebase_num = timebase->Num;
-        int64_t timebase_den = timebase->Den * 1000;
-        h->reduce_pts = 0;
-
-        while( timebase_num > UINT32_MAX || timebase_den > INT32_MAX )
-        {
-            timebase_num >>= 1;
-            timebase_den >>= 1;
-            h->reduce_pts++;
-        }
-        info->timebase_num = timebase_num;
-        info->timebase_den = timebase_den;
-    }
-
-    *p_handle = h;
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
-        return -1;
-    pic->img.csp = csp;
-    pic->img.planes = 4;
-    return 0;
-}
-
-static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
-{
-    ffms_hnd_t *h = handle;
-    if( i_frame >= h->num_frames )
-        return -1;
-    FFMS_ErrorInfo e;
-    e.BufferSize = 0;
-    const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, i_frame, &e );
-    FAIL_IF_ERROR( !frame, "could not read frame %d \n", i_frame );
-
-    memcpy( pic->img.stride, frame->Linesize, sizeof(pic->img.stride) );
-    memcpy( pic->img.plane, frame->Data, sizeof(pic->img.plane) );
-
-    if( h->vfr_input )
-    {
-        const FFMS_FrameInfo *info = FFMS_GetFrameInfo( h->track, i_frame );
-        FAIL_IF_ERROR( info->PTS == AV_NOPTS_VALUE, "invalid timestamp. "
-                       "Use --force-cfr and specify a framerate with --fps\n" );
-
-        pic->pts = info->PTS >> h->reduce_pts;
-        pic->duration = 0;
-    }
-    return 0;
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    memset( pic, 0, sizeof(cli_pic_t) );
-}
-
-static int close_file( hnd_t handle )
-{
-    ffms_hnd_t *h = handle;
-    FFMS_DestroyVideoSource( h->video_source );
-    free( h );
-    return 0;
-}
-
-const cli_input_t ffms_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/input/input.c b/android/src/main/libenc/jni/libx264/input/input.c
deleted file mode 100755
index 9008398..0000000
--- a/android/src/main/libenc/jni/libx264/input/input.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/*****************************************************************************
- * input.c: common input functions
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Steven Walters <kemuri9@gmail.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-
-#ifdef _WIN32
-#include <io.h>
-#elif HAVE_MMAP
-#include <sys/mman.h>
-#include <unistd.h>
-#endif
-
-const x264_cli_csp_t x264_cli_csps[] = {
-    [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
-    [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
-    [X264_CSP_I444] = { "i444", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
-    [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
-    [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
-    [X264_CSP_YV24] = { "yv24", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
-    [X264_CSP_NV12] = { "nv12", 2, { 1,  1 },     { 1, .5 },     2, 2 },
-    [X264_CSP_NV21] = { "nv21", 2, { 1,  1 },     { 1, .5 },     2, 2 },
-    [X264_CSP_NV16] = { "nv16", 2, { 1,  1 },     { 1,  1 },     2, 1 },
-    [X264_CSP_BGR]  = { "bgr",  1, { 3 },         { 1 },         1, 1 },
-    [X264_CSP_BGRA] = { "bgra", 1, { 4 },         { 1 },         1, 1 },
-    [X264_CSP_RGB]  = { "rgb",  1, { 3 },         { 1 },         1, 1 },
-};
-
-int x264_cli_csp_is_invalid( int csp )
-{
-    int csp_mask = csp & X264_CSP_MASK;
-    return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX ||
-           csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER;
-}
-
-int x264_cli_csp_depth_factor( int csp )
-{
-    if( x264_cli_csp_is_invalid( csp ) )
-        return 0;
-    return (csp & X264_CSP_HIGH_DEPTH) ? 2 : 1;
-}
-
-uint64_t x264_cli_pic_plane_size( int csp, int width, int height, int plane )
-{
-    int csp_mask = csp & X264_CSP_MASK;
-    if( x264_cli_csp_is_invalid( csp ) || plane < 0 || plane >= x264_cli_csps[csp_mask].planes )
-        return 0;
-    uint64_t size = (uint64_t)width * height;
-    size *= x264_cli_csps[csp_mask].width[plane] * x264_cli_csps[csp_mask].height[plane];
-    size *= x264_cli_csp_depth_factor( csp );
-    return size;
-}
-
-uint64_t x264_cli_pic_size( int csp, int width, int height )
-{
-    if( x264_cli_csp_is_invalid( csp ) )
-        return 0;
-    uint64_t size = 0;
-    int csp_mask = csp & X264_CSP_MASK;
-    for( int i = 0; i < x264_cli_csps[csp_mask].planes; i++ )
-        size += x264_cli_pic_plane_size( csp, width, height, i );
-    return size;
-}
-
-static int x264_cli_pic_init_internal( cli_pic_t *pic, int csp, int width, int height, int align, int alloc )
-{
-    memset( pic, 0, sizeof(cli_pic_t) );
-    int csp_mask = csp & X264_CSP_MASK;
-    if( x264_cli_csp_is_invalid( csp ) )
-        pic->img.planes = 0;
-    else
-        pic->img.planes = x264_cli_csps[csp_mask].planes;
-    pic->img.csp    = csp;
-    pic->img.width  = width;
-    pic->img.height = height;
-    for( int i = 0; i < pic->img.planes; i++ )
-    {
-        int stride = width * x264_cli_csps[csp_mask].width[i];
-        stride *= x264_cli_csp_depth_factor( csp );
-        stride = ALIGN( stride, align );
-        pic->img.stride[i] = stride;
-
-        if( alloc )
-        {
-            size_t size = (size_t)(height * x264_cli_csps[csp_mask].height[i]) * stride;
-            pic->img.plane[i] = x264_malloc( size );
-            if( !pic->img.plane[i] )
-                return -1;
-        }
-    }
-
-    return 0;
-}
-
-int x264_cli_pic_alloc( cli_pic_t *pic, int csp, int width, int height )
-{
-    return x264_cli_pic_init_internal( pic, csp, width, height, 1, 1 );
-}
-
-int x264_cli_pic_alloc_aligned( cli_pic_t *pic, int csp, int width, int height )
-{
-    return x264_cli_pic_init_internal( pic, csp, width, height, NATIVE_ALIGN, 1 );
-}
-
-int x264_cli_pic_init_noalloc( cli_pic_t *pic, int csp, int width, int height )
-{
-    return x264_cli_pic_init_internal( pic, csp, width, height, 1, 0 );
-}
-
-void x264_cli_pic_clean( cli_pic_t *pic )
-{
-    for( int i = 0; i < pic->img.planes; i++ )
-        x264_free( pic->img.plane[i] );
-    memset( pic, 0, sizeof(cli_pic_t) );
-}
-
-const x264_cli_csp_t *x264_cli_get_csp( int csp )
-{
-    if( x264_cli_csp_is_invalid( csp ) )
-        return NULL;
-    return x264_cli_csps + (csp&X264_CSP_MASK);
-}
-
-/* Functions for handling memory-mapped input frames */
-int x264_cli_mmap_init( cli_mmap_t *h, FILE *fh )
-{
-#ifdef _WIN32
-    HANDLE osfhandle = (HANDLE)_get_osfhandle( _fileno( fh ) );
-    if( osfhandle != INVALID_HANDLE_VALUE )
-    {
-        SYSTEM_INFO si;
-        GetSystemInfo( &si );
-        h->align_mask = si.dwAllocationGranularity - 1;
-        h->prefetch_virtual_memory = (void*)GetProcAddress( GetModuleHandleW( L"kernel32.dll" ), "PrefetchVirtualMemory" );
-        h->process_handle = GetCurrentProcess();
-        h->map_handle = CreateFileMappingW( osfhandle, NULL, PAGE_READONLY, 0, 0, NULL );
-        return !h->map_handle;
-    }
-#elif HAVE_MMAP && defined(_SC_PAGESIZE)
-    h->align_mask = sysconf( _SC_PAGESIZE ) - 1;
-    h->fd = fileno( fh );
-    return h->align_mask < 0 || h->fd < 0;
-#endif
-    return -1;
-}
-
-void *x264_cli_mmap( cli_mmap_t *h, int64_t offset, size_t size )
-{
-#if defined(_WIN32) || HAVE_MMAP
-    int align = offset & h->align_mask;
-    offset -= align;
-    size   += align;
-#ifdef _WIN32
-    uint8_t *base = MapViewOfFile( h->map_handle, FILE_MAP_READ, offset >> 32, offset, size );
-    if( base )
-    {
-        /* PrefetchVirtualMemory() is only available on Windows 8 and newer. */
-        if( h->prefetch_virtual_memory )
-        {
-            struct { void *addr; size_t size; } mem_range = { base, size };
-            h->prefetch_virtual_memory( h->process_handle, 1, &mem_range, 0 );
-        }
-        return base + align;
-    }
-#else
-    uint8_t *base = mmap( NULL, size, PROT_READ, MAP_PRIVATE, h->fd, offset );
-    if( base != MAP_FAILED )
-    {
-        /* Ask the OS to readahead pages. This improves performance whereas
-         * forcing page faults by manually accessing every page does not.
-         * Some systems have implemented madvise() but not posix_madvise()
-         * and vice versa, so check both to see if either is available. */
-#ifdef MADV_WILLNEED
-        madvise( base, size, MADV_WILLNEED );
-#elif defined(POSIX_MADV_WILLNEED)
-        posix_madvise( base, size, POSIX_MADV_WILLNEED );
-#endif
-        return base + align;
-    }
-#endif
-#endif
-    return NULL;
-}
-
-int x264_cli_munmap( cli_mmap_t *h, void *addr, size_t size )
-{
-#if defined(_WIN32) || HAVE_MMAP
-    void *base = (void*)((intptr_t)addr & ~h->align_mask);
-#ifdef _WIN32
-    return !UnmapViewOfFile( base );
-#else
-    return munmap( base, size + (intptr_t)addr - (intptr_t)base );
-#endif
-#endif
-    return -1;
-}
-
-void x264_cli_mmap_close( cli_mmap_t *h )
-{
-#ifdef _WIN32
-    CloseHandle( h->map_handle );
-#endif
-}
diff --git a/android/src/main/libenc/jni/libx264/input/input.h b/android/src/main/libenc/jni/libx264/input/input.h
deleted file mode 100755
index d9a716f..0000000
--- a/android/src/main/libenc/jni/libx264/input/input.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*****************************************************************************
- * input.h: file input
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_INPUT_H
-#define X264_INPUT_H
-
-#include "x264cli.h"
-
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-/* options that are used by only some demuxers */
-typedef struct
-{
-    char *index_file;
-    char *format;
-    char *resolution;
-    char *colorspace;
-    int bit_depth;
-    char *timebase;
-    int seek;
-    int progress;
-    int output_csp; /* convert to this csp, if applicable */
-    int output_range; /* user desired output range */
-    int input_range; /* user override input range */
-} cli_input_opt_t;
-
-/* properties of the source given by the demuxer */
-typedef struct
-{
-    int csp;         /* colorspace of the input */
-    uint32_t fps_num;
-    uint32_t fps_den;
-    int fullrange;   /* has 2^bit_depth-1 instead of 219*2^(bit_depth-8) ranges (YUV only) */
-    int width;
-    int height;
-    int interlaced;
-    int num_frames;
-    uint32_t sar_width;
-    uint32_t sar_height;
-    int tff;
-    int thread_safe; /* demuxer is thread_input safe */
-    uint32_t timebase_num;
-    uint32_t timebase_den;
-    int vfr;
-} video_info_t;
-
-/* image data type used by x264cli */
-typedef struct
-{
-    int     csp;       /* colorspace */
-    int     width;     /* width of the picture */
-    int     height;    /* height of the picture */
-    int     planes;    /* number of planes */
-    uint8_t *plane[4]; /* pointers for each plane */
-    int     stride[4]; /* strides for each plane */
-} cli_image_t;
-
-typedef struct
-{
-    cli_image_t img;
-    int64_t pts;       /* input pts */
-    int64_t duration;  /* frame duration - used for vfr */
-    void    *opaque;   /* opaque handle */
-} cli_pic_t;
-
-typedef struct
-{
-    int (*open_file)( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt );
-    int (*picture_alloc)( cli_pic_t *pic, hnd_t handle, int csp, int width, int height );
-    int (*read_frame)( cli_pic_t *pic, hnd_t handle, int i_frame );
-    int (*release_frame)( cli_pic_t *pic, hnd_t handle );
-    void (*picture_clean)( cli_pic_t *pic, hnd_t handle );
-    int (*close_file)( hnd_t handle );
-} cli_input_t;
-
-extern const cli_input_t raw_input;
-extern const cli_input_t y4m_input;
-extern const cli_input_t avs_input;
-extern const cli_input_t thread_input;
-extern const cli_input_t lavf_input;
-extern const cli_input_t ffms_input;
-extern const cli_input_t timecode_input;
-
-extern cli_input_t cli_input;
-
-/* extended colorspace list that isn't supported by libx264 but by the cli */
-#define X264_CSP_CLI_MAX        X264_CSP_MAX     /* end of list         */
-#define X264_CSP_OTHER          0x4000           /* non x264 colorspace */
-
-typedef struct
-{
-    const char *name;
-    int planes;
-    float width[4];
-    float height[4];
-    int mod_width;
-    int mod_height;
-} x264_cli_csp_t;
-
-extern const x264_cli_csp_t x264_cli_csps[];
-
-int      x264_cli_csp_is_invalid( int csp );
-int      x264_cli_csp_depth_factor( int csp );
-int      x264_cli_pic_alloc( cli_pic_t *pic, int csp, int width, int height );
-int      x264_cli_pic_alloc_aligned( cli_pic_t *pic, int csp, int width, int height );
-int      x264_cli_pic_init_noalloc( cli_pic_t *pic, int csp, int width, int height );
-void     x264_cli_pic_clean( cli_pic_t *pic );
-uint64_t x264_cli_pic_plane_size( int csp, int width, int height, int plane );
-uint64_t x264_cli_pic_size( int csp, int width, int height );
-const x264_cli_csp_t *x264_cli_get_csp( int csp );
-
-typedef struct
-{
-    int align_mask;
-#ifdef _WIN32
-    BOOL (WINAPI *prefetch_virtual_memory)( HANDLE, ULONG_PTR, PVOID, ULONG );
-    HANDLE process_handle;
-    HANDLE map_handle;
-#elif HAVE_MMAP
-    int fd;
-#endif
-} cli_mmap_t;
-
-int x264_cli_mmap_init( cli_mmap_t *h, FILE *fh );
-void *x264_cli_mmap( cli_mmap_t *h, int64_t offset, size_t size );
-int x264_cli_munmap( cli_mmap_t *h, void *addr, size_t size );
-void x264_cli_mmap_close( cli_mmap_t *h );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/input/lavf.c b/android/src/main/libenc/jni/libx264/input/lavf.c
deleted file mode 100755
index 14ed174..0000000
--- a/android/src/main/libenc/jni/libx264/input/lavf.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*****************************************************************************
- * lavf.c: libavformat input
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
- *          Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "lavf", __VA_ARGS__ )
-#undef DECLARE_ALIGNED
-#include <libavformat/avformat.h>
-#include <libavutil/mem.h>
-#include <libavutil/pixdesc.h>
-#include <libavutil/dict.h>
-
-typedef struct
-{
-    AVFormatContext *lavf;
-    AVFrame *frame;
-    int stream_id;
-    int next_frame;
-    int vfr_input;
-    cli_pic_t *first_pic;
-} lavf_hnd_t;
-
-/* handle the deprecated jpeg pixel formats */
-static int handle_jpeg( int csp, int *fullrange )
-{
-    switch( csp )
-    {
-        case AV_PIX_FMT_YUVJ420P: *fullrange = 1; return AV_PIX_FMT_YUV420P;
-        case AV_PIX_FMT_YUVJ422P: *fullrange = 1; return AV_PIX_FMT_YUV422P;
-        case AV_PIX_FMT_YUVJ444P: *fullrange = 1; return AV_PIX_FMT_YUV444P;
-        default:                               return csp;
-    }
-}
-
-static int read_frame_internal( cli_pic_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info )
-{
-    if( h->first_pic && !info )
-    {
-        /* see if the frame we are requesting is the frame we have already read and stored.
-         * if so, retrieve the pts and image data before freeing it. */
-        if( !i_frame )
-        {
-            XCHG( cli_image_t, p_pic->img, h->first_pic->img );
-            p_pic->pts = h->first_pic->pts;
-        }
-        lavf_input.picture_clean( h->first_pic, h );
-        free( h->first_pic );
-        h->first_pic = NULL;
-        if( !i_frame )
-            return 0;
-    }
-
-    AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
-
-    AVPacket pkt;
-    av_init_packet( &pkt );
-    pkt.data = NULL;
-    pkt.size = 0;
-
-    while( i_frame >= h->next_frame )
-    {
-        int finished = 0;
-        int ret = 0;
-        do
-        {
-            ret = av_read_frame( h->lavf, &pkt );
-
-            if( ret < 0 )
-            {
-                av_init_packet( &pkt );
-                pkt.data = NULL;
-                pkt.size = 0;
-            }
-
-            if( ret < 0 || pkt.stream_index == h->stream_id )
-            {
-                if( avcodec_decode_video2( c, h->frame, &finished, &pkt ) < 0 )
-                    x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
-            }
-
-            if( ret >= 0 )
-                av_free_packet( &pkt );
-        } while( !finished && ret >= 0 );
-
-        if( !finished )
-            return -1;
-
-        h->next_frame++;
-    }
-
-    memcpy( p_pic->img.stride, h->frame->linesize, sizeof(p_pic->img.stride) );
-    memcpy( p_pic->img.plane, h->frame->data, sizeof(p_pic->img.plane) );
-    int is_fullrange   = 0;
-    p_pic->img.width   = c->width;
-    p_pic->img.height  = c->height;
-    p_pic->img.csp     = handle_jpeg( c->pix_fmt, &is_fullrange ) | X264_CSP_OTHER;
-
-    if( info )
-    {
-        info->fullrange  = is_fullrange;
-        info->interlaced = h->frame->interlaced_frame;
-        info->tff        = h->frame->top_field_first;
-    }
-
-    if( h->vfr_input )
-    {
-        p_pic->pts = p_pic->duration = 0;
-        if( h->frame->pkt_pts != AV_NOPTS_VALUE )
-            p_pic->pts = h->frame->pkt_pts;
-        else if( h->frame->pkt_dts != AV_NOPTS_VALUE )
-            p_pic->pts = h->frame->pkt_dts; // for AVI files
-        else if( info )
-        {
-            h->vfr_input = info->vfr = 0;
-            return 0;
-        }
-    }
-
-    return 0;
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    lavf_hnd_t *h = calloc( 1, sizeof(lavf_hnd_t) );
-    if( !h )
-        return -1;
-    av_register_all();
-    if( !strcmp( psz_filename, "-" ) )
-        psz_filename = "pipe:";
-
-    h->frame = av_frame_alloc();
-    if( !h->frame )
-        return -1;
-
-    /* if resolution was passed in, place it and colorspace into options. this allows raw video support */
-    AVDictionary *options = NULL;
-    if( opt->resolution )
-    {
-        av_dict_set( &options, "video_size", opt->resolution, 0 );
-        const char *csp = opt->colorspace ? opt->colorspace : av_get_pix_fmt_name( AV_PIX_FMT_YUV420P );
-        av_dict_set( &options, "pixel_format", csp, 0 );
-    }
-
-    /* specify the input format. this is helpful when lavf fails to guess */
-    AVInputFormat *format = NULL;
-    if( opt->format )
-        FAIL_IF_ERROR( !(format = av_find_input_format( opt->format )), "unknown file format: %s\n", opt->format );
-
-    FAIL_IF_ERROR( avformat_open_input( &h->lavf, psz_filename, format, &options ), "could not open input file\n" );
-    if( options )
-        av_dict_free( &options );
-    FAIL_IF_ERROR( avformat_find_stream_info( h->lavf, NULL ) < 0, "could not find input stream info\n" );
-
-    int i = 0;
-    while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != AVMEDIA_TYPE_VIDEO )
-        i++;
-    FAIL_IF_ERROR( i == h->lavf->nb_streams, "could not find video stream\n" );
-    h->stream_id       = i;
-    h->next_frame      = 0;
-    AVCodecContext *c  = h->lavf->streams[i]->codec;
-    info->fps_num      = h->lavf->streams[i]->avg_frame_rate.num;
-    info->fps_den      = h->lavf->streams[i]->avg_frame_rate.den;
-    info->timebase_num = h->lavf->streams[i]->time_base.num;
-    info->timebase_den = h->lavf->streams[i]->time_base.den;
-    /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */
-    info->thread_safe  = 0;
-    h->vfr_input       = info->vfr;
-    FAIL_IF_ERROR( avcodec_open2( c, avcodec_find_decoder( c->codec_id ), NULL ),
-                   "could not find decoder for video stream\n" );
-
-    /* prefetch the first frame and set/confirm flags */
-    h->first_pic = malloc( sizeof(cli_pic_t) );
-    FAIL_IF_ERROR( !h->first_pic || lavf_input.picture_alloc( h->first_pic, h, X264_CSP_OTHER, info->width, info->height ),
-                   "malloc failed\n" );
-    if( read_frame_internal( h->first_pic, h, 0, info ) )
-        return -1;
-
-    info->width      = c->width;
-    info->height     = c->height;
-    info->csp        = h->first_pic->img.csp;
-    info->num_frames = h->lavf->streams[i]->nb_frames;
-    info->sar_height = c->sample_aspect_ratio.den;
-    info->sar_width  = c->sample_aspect_ratio.num;
-    info->fullrange |= c->color_range == AVCOL_RANGE_JPEG;
-
-    /* avisynth stores rgb data vertically flipped. */
-    if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) &&
-        (c->pix_fmt == AV_PIX_FMT_BGRA || c->pix_fmt == AV_PIX_FMT_BGR24) )
-        info->csp |= X264_CSP_VFLIP;
-
-    *p_handle = h;
-
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
-        return -1;
-    pic->img.csp = csp;
-    pic->img.planes = 4;
-    return 0;
-}
-
-static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
-{
-    return read_frame_internal( pic, handle, i_frame, NULL );
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    memset( pic, 0, sizeof(cli_pic_t) );
-}
-
-static int close_file( hnd_t handle )
-{
-    lavf_hnd_t *h = handle;
-    avcodec_close( h->lavf->streams[h->stream_id]->codec );
-    avformat_close_input( &h->lavf );
-    av_frame_free( &h->frame );
-    free( h );
-    return 0;
-}
-
-const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/input/raw.c b/android/src/main/libenc/jni/libx264/input/raw.c
deleted file mode 100755
index 53a22f5..0000000
--- a/android/src/main/libenc/jni/libx264/input/raw.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*****************************************************************************
- * raw.c: raw input
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Steven Walters <kemuri9@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "raw", __VA_ARGS__ )
-
-typedef struct
-{
-    FILE *fh;
-    int next_frame;
-    uint64_t plane_size[4];
-    uint64_t frame_size;
-    int bit_depth;
-    cli_mmap_t mmap;
-    int use_mmap;
-} raw_hnd_t;
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    raw_hnd_t *h = calloc( 1, sizeof(raw_hnd_t) );
-    if( !h )
-        return -1;
-
-    if( !opt->resolution )
-    {
-        /* try to parse the file name */
-        for( char *p = psz_filename; *p; p++ )
-            if( *p >= '0' && *p <= '9' && sscanf( p, "%dx%d", &info->width, &info->height ) == 2 )
-                break;
-    }
-    else
-        sscanf( opt->resolution, "%dx%d", &info->width, &info->height );
-    FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" );
-    if( opt->colorspace )
-    {
-        for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- )
-        {
-            if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) )
-                break;
-        }
-        FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace );
-    }
-    else /* default */
-        info->csp = X264_CSP_I420;
-
-    h->bit_depth = opt->bit_depth;
-    FAIL_IF_ERROR( h->bit_depth < 8 || h->bit_depth > 16, "unsupported bit depth `%d'\n", h->bit_depth );
-    if( h->bit_depth > 8 )
-        info->csp |= X264_CSP_HIGH_DEPTH;
-
-    if( !strcmp( psz_filename, "-" ) )
-        h->fh = stdin;
-    else
-        h->fh = x264_fopen( psz_filename, "rb" );
-    if( h->fh == NULL )
-        return -1;
-
-    info->thread_safe = 1;
-    info->num_frames  = 0;
-    info->vfr         = 0;
-
-    const x264_cli_csp_t *csp = x264_cli_get_csp( info->csp );
-    for( int i = 0; i < csp->planes; i++ )
-    {
-        h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i );
-        h->frame_size += h->plane_size[i];
-        /* x264_cli_pic_plane_size returns the size in bytes, we need the value in pixels from here on */
-        h->plane_size[i] /= x264_cli_csp_depth_factor( info->csp );
-    }
-
-    if( x264_is_regular_file( h->fh ) )
-    {
-        fseek( h->fh, 0, SEEK_END );
-        uint64_t size = ftell( h->fh );
-        fseek( h->fh, 0, SEEK_SET );
-        info->num_frames = size / h->frame_size;
-
-        /* Attempt to use memory-mapped input frames if possible */
-        if( !(h->bit_depth & 7) )
-            h->use_mmap = !x264_cli_mmap_init( &h->mmap, h->fh );
-    }
-
-    *p_handle = h;
-    return 0;
-}
-
-static int read_frame_internal( cli_pic_t *pic, raw_hnd_t *h, int bit_depth_uc )
-{
-    int pixel_depth = x264_cli_csp_depth_factor( pic->img.csp );
-
-    for( int i = 0; i < pic->img.planes; i++ )
-    {
-        if( h->use_mmap )
-        {
-            if( i )
-                pic->img.plane[i] = pic->img.plane[i-1] + pixel_depth * h->plane_size[i-1];
-        }
-        else if( fread( pic->img.plane[i], pixel_depth, h->plane_size[i], h->fh ) != h->plane_size[i] )
-            return -1;
-
-        if( bit_depth_uc )
-        {
-            /* upconvert non 16bit high depth planes to 16bit using the same
-             * algorithm as used in the depth filter. */
-            uint16_t *plane = (uint16_t*)pic->img.plane[i];
-            uint64_t pixel_count = h->plane_size[i];
-            int lshift = 16 - h->bit_depth;
-            for( uint64_t j = 0; j < pixel_count; j++ )
-                plane[j] = plane[j] << lshift;
-        }
-    }
-    return 0;
-}
-
-static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
-{
-    raw_hnd_t *h = handle;
-
-    if( h->use_mmap )
-    {
-        pic->img.plane[0] = x264_cli_mmap( &h->mmap, i_frame * h->frame_size, h->frame_size );
-        if( !pic->img.plane[0] )
-            return -1;
-    }
-    else if( i_frame > h->next_frame )
-    {
-        if( x264_is_regular_file( h->fh ) )
-            fseek( h->fh, i_frame * h->frame_size, SEEK_SET );
-        else
-            while( i_frame > h->next_frame )
-            {
-                if( read_frame_internal( pic, h, 0 ) )
-                    return -1;
-                h->next_frame++;
-            }
-    }
-
-    if( read_frame_internal( pic, h, h->bit_depth & 7 ) )
-        return -1;
-
-    h->next_frame = i_frame+1;
-    return 0;
-}
-
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    raw_hnd_t *h = handle;
-    if( h->use_mmap )
-        return x264_cli_munmap( &h->mmap, pic->img.plane[0], h->frame_size );
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    raw_hnd_t *h = handle;
-    return (h->use_mmap ? x264_cli_pic_init_noalloc : x264_cli_pic_alloc)( pic, csp, width, height );
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    raw_hnd_t *h = handle;
-    if( h->use_mmap )
-        memset( pic, 0, sizeof(cli_pic_t) );
-    else
-        x264_cli_pic_clean( pic );
-}
-
-static int close_file( hnd_t handle )
-{
-    raw_hnd_t *h = handle;
-    if( !h || !h->fh )
-        return 0;
-    if( h->use_mmap )
-        x264_cli_mmap_close( &h->mmap );
-    fclose( h->fh );
-    free( h );
-    return 0;
-}
-
-const cli_input_t raw_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/input/thread.c b/android/src/main/libenc/jni/libx264/input/thread.c
deleted file mode 100755
index f600744..0000000
--- a/android/src/main/libenc/jni/libx264/input/thread.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*****************************************************************************
- * thread.c: threaded input
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-
-typedef struct
-{
-    cli_input_t input;
-    hnd_t p_handle;
-    cli_pic_t pic;
-    x264_threadpool_t *pool;
-    int next_frame;
-    int frame_total;
-    struct thread_input_arg_t *next_args;
-} thread_hnd_t;
-
-typedef struct thread_input_arg_t
-{
-    thread_hnd_t *h;
-    cli_pic_t *pic;
-    int i_frame;
-    int status;
-} thread_input_arg_t;
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    thread_hnd_t *h = malloc( sizeof(thread_hnd_t) );
-    FAIL_IF_ERR( !h || cli_input.picture_alloc( &h->pic, *p_handle, info->csp, info->width, info->height ),
-                 "x264", "malloc failed\n" );
-    h->input = cli_input;
-    h->p_handle = *p_handle;
-    h->next_frame = -1;
-    h->next_args = malloc( sizeof(thread_input_arg_t) );
-    if( !h->next_args )
-        return -1;
-    h->next_args->h = h;
-    h->next_args->status = 0;
-    h->frame_total = info->num_frames;
-
-    if( x264_threadpool_init( &h->pool, 1, NULL, NULL ) )
-        return -1;
-
-    *p_handle = h;
-    return 0;
-}
-
-static void read_frame_thread_int( thread_input_arg_t *i )
-{
-    i->status = i->h->input.read_frame( i->pic, i->h->p_handle, i->i_frame );
-}
-
-static int read_frame( cli_pic_t *p_pic, hnd_t handle, int i_frame )
-{
-    thread_hnd_t *h = handle;
-    int ret = 0;
-
-    if( h->next_frame >= 0 )
-    {
-        x264_threadpool_wait( h->pool, h->next_args );
-        ret |= h->next_args->status;
-    }
-
-    if( h->next_frame == i_frame )
-        XCHG( cli_pic_t, *p_pic, h->pic );
-    else
-    {
-        if( h->next_frame >= 0 )
-            thread_input.release_frame( &h->pic, handle );
-        ret |= h->input.read_frame( p_pic, h->p_handle, i_frame );
-    }
-
-    if( !h->frame_total || i_frame+1 < h->frame_total )
-    {
-        h->next_frame =
-        h->next_args->i_frame = i_frame+1;
-        h->next_args->pic = &h->pic;
-        x264_threadpool_run( h->pool, (void*)read_frame_thread_int, h->next_args );
-    }
-    else
-        h->next_frame = -1;
-
-    return ret;
-}
-
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    thread_hnd_t *h = handle;
-    if( h->input.release_frame )
-        return h->input.release_frame( pic, h->p_handle );
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    thread_hnd_t *h = handle;
-    return h->input.picture_alloc( pic, h->p_handle, csp, width, height );
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    thread_hnd_t *h = handle;
-    h->input.picture_clean( pic, h->p_handle );
-}
-
-static int close_file( hnd_t handle )
-{
-    thread_hnd_t *h = handle;
-    x264_threadpool_delete( h->pool );
-    h->input.picture_clean( &h->pic, h->p_handle );
-    h->input.close_file( h->p_handle );
-    free( h->next_args );
-    free( h );
-    return 0;
-}
-
-const cli_input_t thread_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/input/timecode.c b/android/src/main/libenc/jni/libx264/input/timecode.c
deleted file mode 100755
index 9a94cca..0000000
--- a/android/src/main/libenc/jni/libx264/input/timecode.c
+++ /dev/null
@@ -1,457 +0,0 @@
-/*****************************************************************************
- * timecode.c: timecode file input
- *****************************************************************************
- * Copyright (C) 2010-2016 x264 project
- *
- * Authors: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "timecode", __VA_ARGS__ )
-
-typedef struct
-{
-    cli_input_t input;
-    hnd_t p_handle;
-    int auto_timebase_num;
-    int auto_timebase_den;
-    uint64_t timebase_num;
-    uint64_t timebase_den;
-    int stored_pts_num;
-    int64_t *pts;
-    double assume_fps;
-    double last_timecode;
-} timecode_hnd_t;
-
-static inline double sigexp10( double value, double *exponent )
-{
-    /* This function separates significand and exp10 from double floating point. */
-    *exponent = pow( 10, floor( log10( value ) ) );
-    return value / *exponent;
-}
-
-#define DOUBLE_EPSILON 5e-6
-#define MKV_TIMEBASE_DEN 1000000000
-
-static double correct_fps( double fps, timecode_hnd_t *h )
-{
-    int i = 1;
-    uint64_t fps_num, fps_den;
-    double exponent;
-    double fps_sig = sigexp10( fps, &exponent );
-    while( 1 )
-    {
-        fps_den = i * h->timebase_num;
-        fps_num = round( fps_den * fps_sig ) * exponent;
-        FAIL_IF_ERROR( fps_num > UINT32_MAX, "tcfile fps correction failed.\n"
-                       "                  Specify an appropriate timebase manually or remake tcfile.\n" );
-        if( fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
-            break;
-        ++i;
-    }
-    if( h->auto_timebase_den )
-    {
-        h->timebase_den = h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
-        if( h->timebase_den > UINT32_MAX )
-            h->auto_timebase_den = 0;
-    }
-    return (double)fps_num / fps_den;
-}
-
-static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num )
-{
-    h->timebase_num = 0;
-    h->timebase_den = MKV_TIMEBASE_DEN;
-    for( int num = 0; num < loop_num; num++ )
-    {
-        uint64_t fps_den;
-        double exponent;
-        double fps_sig = sigexp10( fpss[num], &exponent );
-        fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent;
-        h->timebase_num = fps_den && h->timebase_num ? gcd( h->timebase_num, fps_den ) : fps_den;
-        FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || !h->timebase_num, "automatic timebase generation failed.\n"
-                       "                  Specify timebase manually.\n" );
-    }
-    return 0;
-}
-
-static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info )
-{
-    char buff[256];
-    int ret, tcfv, num, seq_num, timecodes_num;
-    double *timecodes = NULL;
-    double *fpss = NULL;
-
-    ret = fscanf( tcfile_in, "# timecode format v%d", &tcfv );
-    FAIL_IF_ERROR( ret != 1 || (tcfv != 1 && tcfv != 2), "unsupported timecode format\n" );
-#define NO_TIMECODE_LINE (buff[0] == '#' || buff[0] == '\n' || buff[0] == '\r')
-    if( tcfv == 1 )
-    {
-        uint64_t file_pos;
-        double assume_fps, seq_fps;
-        int start, end = -1;
-        int prev_start = -1, prev_end = -1;
-
-        h->assume_fps = 0;
-        for( num = 2; fgets( buff, sizeof(buff), tcfile_in ) != NULL; num++ )
-        {
-            if( NO_TIMECODE_LINE )
-                continue;
-            FAIL_IF_ERROR( sscanf( buff, "assume %lf", &h->assume_fps ) != 1 && sscanf( buff, "Assume %lf", &h->assume_fps ) != 1,
-                           "tcfile parsing error: assumed fps not found\n" );
-            break;
-        }
-        FAIL_IF_ERROR( h->assume_fps <= 0, "invalid assumed fps %.6f\n", h->assume_fps );
-
-        file_pos = ftell( tcfile_in );
-        h->stored_pts_num = 0;
-        for( seq_num = 0; fgets( buff, sizeof(buff), tcfile_in ) != NULL; num++ )
-        {
-            if( NO_TIMECODE_LINE )
-            {
-                if( sscanf( buff, "# TDecimate Mode 3:  Last Frame = %d", &end ) == 1 )
-                    h->stored_pts_num = end + 1;
-                continue;
-            }
-            ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
-            FAIL_IF_ERROR( ret != 3 && ret != EOF, "invalid input tcfile\n" );
-            FAIL_IF_ERROR( start > end || start <= prev_start || end <= prev_end || seq_fps <= 0,
-                           "invalid input tcfile at line %d: %s\n", num, buff );
-            prev_start = start;
-            prev_end = end;
-            if( h->auto_timebase_den || h->auto_timebase_num )
-                ++seq_num;
-        }
-        if( !h->stored_pts_num )
-            h->stored_pts_num = end + 2;
-        timecodes_num = h->stored_pts_num;
-        fseek( tcfile_in, file_pos, SEEK_SET );
-
-        timecodes = malloc( timecodes_num * sizeof(double) );
-        if( !timecodes )
-            return -1;
-        if( h->auto_timebase_den || h->auto_timebase_num )
-        {
-            fpss = malloc( (seq_num + 1) * sizeof(double) );
-            if( !fpss )
-                goto fail;
-        }
-
-        assume_fps = correct_fps( h->assume_fps, h );
-        if( assume_fps < 0 )
-            goto fail;
-        timecodes[0] = 0;
-        for( num = seq_num = 0; num < timecodes_num - 1 && fgets( buff, sizeof(buff), tcfile_in ) != NULL; )
-        {
-            if( NO_TIMECODE_LINE )
-                continue;
-            ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
-            if( ret != 3 )
-                start = end = timecodes_num - 1;
-            for( ; num < start && num < timecodes_num - 1; num++ )
-                timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
-            if( num < timecodes_num - 1 )
-            {
-                if( h->auto_timebase_den || h->auto_timebase_num )
-                    fpss[seq_num++] = seq_fps;
-                seq_fps = correct_fps( seq_fps, h );
-                if( seq_fps < 0 )
-                    goto fail;
-                for( num = start; num <= end && num < timecodes_num - 1; num++ )
-                    timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
-            }
-        }
-        for( ; num < timecodes_num - 1; num++ )
-            timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
-        if( h->auto_timebase_den || h->auto_timebase_num )
-            fpss[seq_num] = h->assume_fps;
-
-        if( h->auto_timebase_num && !h->auto_timebase_den )
-        {
-            double exponent;
-            double assume_fps_sig, seq_fps_sig;
-            if( try_mkv_timebase_den( fpss, h, seq_num + 1 ) < 0 )
-                goto fail;
-            fseek( tcfile_in, file_pos, SEEK_SET );
-            assume_fps_sig = sigexp10( h->assume_fps, &exponent );
-            assume_fps = MKV_TIMEBASE_DEN / ( round( MKV_TIMEBASE_DEN / assume_fps_sig ) / exponent );
-            for( num = 0; num < timecodes_num - 1 && fgets( buff, sizeof(buff), tcfile_in ) != NULL; )
-            {
-                if( NO_TIMECODE_LINE )
-                    continue;
-                ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
-                if( ret != 3 )
-                    start = end = timecodes_num - 1;
-                seq_fps_sig = sigexp10( seq_fps, &exponent );
-                seq_fps = MKV_TIMEBASE_DEN / ( round( MKV_TIMEBASE_DEN / seq_fps_sig ) / exponent );
-                for( ; num < start && num < timecodes_num - 1; num++ )
-                    timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
-                for( num = start; num <= end && num < timecodes_num - 1; num++ )
-                    timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
-            }
-            for( ; num < timecodes_num - 1; num++ )
-                timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
-        }
-        if( fpss )
-        {
-            free( fpss );
-            fpss = NULL;
-        }
-
-        h->assume_fps = assume_fps;
-        h->last_timecode = timecodes[timecodes_num - 1];
-    }
-    else    /* tcfv == 2 */
-    {
-        uint64_t file_pos = ftell( tcfile_in );
-
-        h->stored_pts_num = 0;
-        while( fgets( buff, sizeof(buff), tcfile_in ) != NULL )
-        {
-            if( NO_TIMECODE_LINE )
-            {
-                if( !h->stored_pts_num )
-                    file_pos = ftell( tcfile_in );
-                continue;
-            }
-            h->stored_pts_num++;
-        }
-        timecodes_num = h->stored_pts_num;
-        FAIL_IF_ERROR( !timecodes_num, "input tcfile doesn't have any timecodes!\n" );
-        fseek( tcfile_in, file_pos, SEEK_SET );
-
-        timecodes = malloc( timecodes_num * sizeof(double) );
-        if( !timecodes )
-            return -1;
-
-        num = 0;
-        if( fgets( buff, sizeof(buff), tcfile_in ) != NULL )
-        {
-            ret = sscanf( buff, "%lf", &timecodes[0] );
-            timecodes[0] *= 1e-3;         /* Timecode format v2 is expressed in milliseconds. */
-            FAIL_IF_ERROR( ret != 1, "invalid input tcfile for frame 0\n" );
-            for( num = 1; num < timecodes_num && fgets( buff, sizeof(buff), tcfile_in ) != NULL; )
-            {
-                if( NO_TIMECODE_LINE )
-                    continue;
-                ret = sscanf( buff, "%lf", &timecodes[num] );
-                timecodes[num] *= 1e-3;         /* Timecode format v2 is expressed in milliseconds. */
-                FAIL_IF_ERROR( ret != 1 || timecodes[num] <= timecodes[num - 1],
-                               "invalid input tcfile for frame %d\n", num );
-                ++num;
-            }
-        }
-        FAIL_IF_ERROR( num < timecodes_num, "failed to read input tcfile for frame %d", num );
-
-        if( timecodes_num == 1 )
-            h->timebase_den = info->fps_num;
-        else if( h->auto_timebase_den )
-        {
-            fpss = malloc( (timecodes_num - 1) * sizeof(double) );
-            if( !fpss )
-                goto fail;
-            for( num = 0; num < timecodes_num - 1; num++ )
-            {
-                fpss[num] = 1 / (timecodes[num + 1] - timecodes[num]);
-                if( h->auto_timebase_den )
-                {
-                    int i = 1;
-                    uint64_t fps_num, fps_den;
-                    double exponent;
-                    double fps_sig = sigexp10( fpss[num], &exponent );
-                    while( 1 )
-                    {
-                        fps_den = i * h->timebase_num;
-                        fps_num = round( fps_den * fps_sig ) * exponent;
-                        if( fps_num > UINT32_MAX || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
-                            break;
-                        ++i;
-                    }
-                    h->timebase_den = fps_num && h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
-                    if( h->timebase_den > UINT32_MAX )
-                    {
-                        h->auto_timebase_den = 0;
-                        continue;
-                    }
-                }
-            }
-            if( h->auto_timebase_num && !h->auto_timebase_den )
-                if( try_mkv_timebase_den( fpss, h, timecodes_num - 1 ) < 0 )
-                    goto fail;
-            free( fpss );
-            fpss = NULL;
-        }
-
-        if( timecodes_num > 1 )
-            h->assume_fps = 1 / (timecodes[timecodes_num - 1] - timecodes[timecodes_num - 2]);
-        else
-            h->assume_fps = (double)info->fps_num / info->fps_den;
-        h->last_timecode = timecodes[timecodes_num - 1];
-    }
-#undef NO_TIMECODE_LINE
-    if( h->auto_timebase_den || h->auto_timebase_num )
-    {
-        uint64_t i = gcd( h->timebase_num, h->timebase_den );
-        h->timebase_num /= i;
-        h->timebase_den /= i;
-        x264_cli_log( "timecode", X264_LOG_INFO, "automatic timebase generation %"PRIu64"/%"PRIu64"\n", h->timebase_num, h->timebase_den );
-    }
-    else FAIL_IF_ERROR( h->timebase_den > UINT32_MAX || !h->timebase_den, "automatic timebase generation failed.\n"
-                        "                  Specify an appropriate timebase manually.\n" );
-
-    h->pts = malloc( h->stored_pts_num * sizeof(int64_t) );
-    if( !h->pts )
-        goto fail;
-    for( num = 0; num < h->stored_pts_num; num++ )
-    {
-        h->pts[num] = timecodes[num] * ((double)h->timebase_den / h->timebase_num) + 0.5;
-        FAIL_IF_ERROR( num > 0 && h->pts[num] <= h->pts[num - 1], "invalid timebase or timecode for frame %d\n", num );
-    }
-
-    free( timecodes );
-    return 0;
-
-fail:
-    if( timecodes )
-        free( timecodes );
-    if( fpss )
-        free( fpss );
-    return -1;
-}
-
-#undef DOUBLE_EPSILON
-#undef MKV_TIMEBASE_DEN
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    int ret = 0;
-    FILE *tcfile_in;
-    timecode_hnd_t *h = malloc( sizeof(timecode_hnd_t) );
-    FAIL_IF_ERROR( !h, "malloc failed\n" );
-    h->input = cli_input;
-    h->p_handle = *p_handle;
-    h->pts = NULL;
-    if( opt->timebase )
-    {
-        ret = sscanf( opt->timebase, "%"SCNu64"/%"SCNu64, &h->timebase_num, &h->timebase_den );
-        if( ret == 1 )
-        {
-            h->timebase_num = strtoul( opt->timebase, NULL, 10 );
-            h->timebase_den = 0; /* set later by auto timebase generation */
-        }
-        FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || h->timebase_den > UINT32_MAX,
-                       "timebase you specified exceeds H.264 maximum\n" );
-    }
-    h->auto_timebase_num = !ret;
-    h->auto_timebase_den = ret < 2;
-    if( h->auto_timebase_num )
-        h->timebase_num = info->fps_den; /* can be changed later by auto timebase generation */
-    if( h->auto_timebase_den )
-        h->timebase_den = 0;             /* set later by auto timebase generation */
-
-    tcfile_in = x264_fopen( psz_filename, "rb" );
-    FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename );
-    if( !x264_is_regular_file( tcfile_in ) )
-    {
-        x264_cli_log( "timecode", X264_LOG_ERROR, "tcfile input incompatible with non-regular file `%s'\n", psz_filename );
-        fclose( tcfile_in );
-        return -1;
-    }
-
-    if( parse_tcfile( tcfile_in, h, info ) < 0 )
-    {
-        if( h->pts )
-            free( h->pts );
-        fclose( tcfile_in );
-        return -1;
-    }
-    fclose( tcfile_in );
-
-    info->timebase_num = h->timebase_num;
-    info->timebase_den = h->timebase_den;
-    info->vfr = 1;
-
-    *p_handle = h;
-    return 0;
-}
-
-static int64_t get_frame_pts( timecode_hnd_t *h, int frame, int real_frame )
-{
-    if( frame < h->stored_pts_num )
-        return h->pts[frame];
-    else
-    {
-        if( h->pts && real_frame )
-        {
-            x264_cli_log( "timecode", X264_LOG_INFO, "input timecode file missing data for frame %d and later\n"
-                          "                 assuming constant fps %.6f\n", frame, h->assume_fps );
-            free( h->pts );
-            h->pts = NULL;
-        }
-        double timecode = h->last_timecode + 1 / h->assume_fps;
-        if( real_frame )
-            h->last_timecode = timecode;
-        return timecode * ((double)h->timebase_den / h->timebase_num) + 0.5;
-    }
-}
-
-static int read_frame( cli_pic_t *pic, hnd_t handle, int frame )
-{
-    timecode_hnd_t *h = handle;
-    if( h->input.read_frame( pic, h->p_handle, frame ) )
-        return -1;
-
-    pic->pts = get_frame_pts( h, frame, 1 );
-    pic->duration = get_frame_pts( h, frame + 1, 0 ) - pic->pts;
-
-    return 0;
-}
-
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    timecode_hnd_t *h = handle;
-    if( h->input.release_frame )
-        return h->input.release_frame( pic, h->p_handle );
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    timecode_hnd_t *h = handle;
-    return h->input.picture_alloc( pic, h->p_handle, csp, width, height );
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    timecode_hnd_t *h = handle;
-    h->input.picture_clean( pic, h->p_handle );
-}
-
-static int close_file( hnd_t handle )
-{
-    timecode_hnd_t *h = handle;
-    if( h->pts )
-        free( h->pts );
-    h->input.close_file( h->p_handle );
-    free( h );
-    return 0;
-}
-
-const cli_input_t timecode_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/input/y4m.c b/android/src/main/libenc/jni/libx264/input/y4m.c
deleted file mode 100755
index 46a9ec3..0000000
--- a/android/src/main/libenc/jni/libx264/input/y4m.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*****************************************************************************
- * y4m.c: y4m input
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "input.h"
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "y4m", __VA_ARGS__ )
-
-typedef struct
-{
-    FILE *fh;
-    int next_frame;
-    int seq_header_len;
-    int frame_header_len;
-    uint64_t frame_size;
-    uint64_t plane_size[3];
-    int bit_depth;
-    cli_mmap_t mmap;
-    int use_mmap;
-} y4m_hnd_t;
-
-#define Y4M_MAGIC "YUV4MPEG2"
-#define MAX_YUV4_HEADER 80
-#define Y4M_FRAME_MAGIC "FRAME"
-#define MAX_FRAME_HEADER 80
-
-static int parse_csp_and_depth( char *csp_name, int *bit_depth )
-{
-    int csp    = X264_CSP_MAX;
-
-    /* Set colorspace from known variants */
-    if( !strncmp( "420", csp_name, 3 ) )
-        csp = X264_CSP_I420;
-    else if( !strncmp( "422", csp_name, 3 ) )
-        csp = X264_CSP_I422;
-    else if( !strncmp( "444", csp_name, 3 ) && strncmp( "444alpha", csp_name, 8 ) ) // only accept alphaless 4:4:4
-        csp = X264_CSP_I444;
-
-    /* Set high bit depth from known extensions */
-    if( sscanf( csp_name, "%*d%*[pP]%d", bit_depth ) != 1 )
-        *bit_depth = 8;
-
-    return csp;
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    y4m_hnd_t *h = calloc( 1, sizeof(y4m_hnd_t) );
-    int i;
-    uint32_t n, d;
-    char header[MAX_YUV4_HEADER+10];
-    char *tokend, *header_end;
-    int colorspace = X264_CSP_NONE;
-    int alt_colorspace = X264_CSP_NONE;
-    int alt_bit_depth  = 8;
-    if( !h )
-        return -1;
-
-    info->vfr = 0;
-
-    if( !strcmp( psz_filename, "-" ) )
-        h->fh = stdin;
-    else
-        h->fh = x264_fopen(psz_filename, "rb");
-    if( h->fh == NULL )
-        return -1;
-
-    /* Read header */
-    for( i = 0; i < MAX_YUV4_HEADER; i++ )
-    {
-        header[i] = fgetc( h->fh );
-        if( header[i] == '\n' )
-        {
-            /* Add a space after last option. Makes parsing "444" vs
-               "444alpha" easier. */
-            header[i+1] = 0x20;
-            header[i+2] = 0;
-            break;
-        }
-    }
-    FAIL_IF_ERROR( strncmp( header, Y4M_MAGIC, sizeof(Y4M_MAGIC)-1 ), "bad sequence header magic\n" );
-    FAIL_IF_ERROR( i == MAX_YUV4_HEADER, "bad sequence header length\n" );
-
-    /* Scan properties */
-    header_end = &header[i+1]; /* Include space */
-    h->seq_header_len = i+1;
-    for( char *tokstart = header + sizeof(Y4M_MAGIC); tokstart < header_end; tokstart++ )
-    {
-        if( *tokstart == 0x20 )
-            continue;
-        switch( *tokstart++ )
-        {
-            case 'W': /* Width. Required. */
-                info->width = strtol( tokstart, &tokend, 10 );
-                tokstart=tokend;
-                break;
-            case 'H': /* Height. Required. */
-                info->height = strtol( tokstart, &tokend, 10 );
-                tokstart=tokend;
-                break;
-            case 'C': /* Color space */
-                colorspace = parse_csp_and_depth( tokstart, &h->bit_depth );
-                tokstart = strchr( tokstart, 0x20 );
-                break;
-            case 'I': /* Interlace type */
-                switch( *tokstart++ )
-                {
-                    case 't':
-                        info->interlaced = 1;
-                        info->tff = 1;
-                        break;
-                    case 'b':
-                        info->interlaced = 1;
-                        info->tff = 0;
-                        break;
-                    case 'm':
-                        info->interlaced = 1;
-                        break;
-                    //case '?':
-                    //case 'p':
-                    default:
-                        break;
-                }
-                break;
-            case 'F': /* Frame rate - 0:0 if unknown */
-                if( sscanf( tokstart, "%u:%u", &n, &d ) == 2 && n && d )
-                {
-                    x264_reduce_fraction( &n, &d );
-                    info->fps_num = n;
-                    info->fps_den = d;
-                }
-                tokstart = strchr( tokstart, 0x20 );
-                break;
-            case 'A': /* Pixel aspect - 0:0 if unknown */
-                /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
-                if( sscanf( tokstart, "%u:%u", &n, &d ) == 2 && n && d )
-                {
-                    x264_reduce_fraction( &n, &d );
-                    info->sar_width  = n;
-                    info->sar_height = d;
-                }
-                tokstart = strchr( tokstart, 0x20 );
-                break;
-            case 'X': /* Vendor extensions */
-                if( !strncmp( "YSCSS=", tokstart, 6 ) )
-                {
-                    /* Older nonstandard pixel format representation */
-                    tokstart += 6;
-                    alt_colorspace = parse_csp_and_depth( tokstart, &alt_bit_depth );
-                }
-                tokstart = strchr( tokstart, 0x20 );
-                break;
-        }
-    }
-
-    if( colorspace == X264_CSP_NONE )
-    {
-        colorspace   = alt_colorspace;
-        h->bit_depth = alt_bit_depth;
-    }
-
-    // default to 8bit 4:2:0 if nothing is specified
-    if( colorspace == X264_CSP_NONE )
-    {
-        colorspace    = X264_CSP_I420;
-        h->bit_depth  = 8;
-    }
-
-    FAIL_IF_ERROR( colorspace <= X264_CSP_NONE || colorspace >= X264_CSP_MAX, "colorspace unhandled\n" );
-    FAIL_IF_ERROR( h->bit_depth < 8 || h->bit_depth > 16, "unsupported bit depth `%d'\n", h->bit_depth );
-
-    info->thread_safe = 1;
-    info->num_frames  = 0;
-    info->csp         = colorspace;
-
-    if( h->bit_depth > 8 )
-        info->csp |= X264_CSP_HIGH_DEPTH;
-
-    const x264_cli_csp_t *csp = x264_cli_get_csp( info->csp );
-
-    for( i = 0; i < csp->planes; i++ )
-    {
-        h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i );
-        h->frame_size += h->plane_size[i];
-        /* x264_cli_pic_plane_size returns the size in bytes, we need the value in pixels from here on */
-        h->plane_size[i] /= x264_cli_csp_depth_factor( info->csp );
-    }
-
-    if( x264_is_regular_file( h->fh ) )
-    {
-        uint64_t init_pos = ftell( h->fh );
-
-        /* Find out the length of the frame header */
-        int len = 1;
-        while( len <= MAX_FRAME_HEADER && fgetc( h->fh ) != '\n' )
-            len++;
-        FAIL_IF_ERROR( len > MAX_FRAME_HEADER || len < sizeof(Y4M_FRAME_MAGIC), "bad frame header length\n" );
-        h->frame_header_len = len;
-        h->frame_size += len;
-
-        fseek( h->fh, 0, SEEK_END );
-        uint64_t i_size = ftell( h->fh );
-        fseek( h->fh, init_pos, SEEK_SET );
-        info->num_frames = (i_size - h->seq_header_len) / h->frame_size;
-
-        /* Attempt to use memory-mapped input frames if possible */
-        if( !(h->bit_depth & 7) )
-            h->use_mmap = !x264_cli_mmap_init( &h->mmap, h->fh );
-    }
-
-    *p_handle = h;
-    return 0;
-}
-
-static int read_frame_internal( cli_pic_t *pic, y4m_hnd_t *h, int bit_depth_uc )
-{
-    static const size_t slen = sizeof(Y4M_FRAME_MAGIC)-1;
-    int pixel_depth = x264_cli_csp_depth_factor( pic->img.csp );
-    int i = sizeof(Y4M_FRAME_MAGIC);
-    char header_buf[16];
-    char *header;
-
-    /* Verify that the frame header is valid */
-    if( h->use_mmap )
-    {
-        header = (char*)pic->img.plane[0];
-        pic->img.plane[0] += h->frame_header_len;
-
-        /* If the header length has changed between frames the size of the mapping will be invalid.
-         * It might be possible to work around it, but I'm not aware of any tool beside fuzzers that
-         * produces y4m files with variable-length frame headers so just error out if that happens. */
-        while( i <= h->frame_header_len && header[i-1] != '\n' )
-            i++;
-        FAIL_IF_ERROR( i != h->frame_header_len, "bad frame header length\n" );
-    }
-    else
-    {
-        header = header_buf;
-        if( fread( header, 1, slen, h->fh ) != slen )
-            return -1;
-        while( i <= MAX_FRAME_HEADER && fgetc( h->fh ) != '\n' )
-            i++;
-        FAIL_IF_ERROR( i > MAX_FRAME_HEADER, "bad frame header length\n" );
-    }
-    FAIL_IF_ERROR( memcmp( header, Y4M_FRAME_MAGIC, slen ), "bad frame header magic\n" );
-
-    for( i = 0; i < pic->img.planes; i++ )
-    {
-        if( h->use_mmap )
-        {
-            if( i )
-                pic->img.plane[i] = pic->img.plane[i-1] + pixel_depth * h->plane_size[i-1];
-        }
-        else if( fread( pic->img.plane[i], pixel_depth, h->plane_size[i], h->fh ) != h->plane_size[i] )
-            return -1;
-
-        if( bit_depth_uc )
-        {
-            /* upconvert non 16bit high depth planes to 16bit using the same
-             * algorithm as used in the depth filter. */
-            uint16_t *plane = (uint16_t*)pic->img.plane[i];
-            uint64_t pixel_count = h->plane_size[i];
-            int lshift = 16 - h->bit_depth;
-            for( uint64_t j = 0; j < pixel_count; j++ )
-                plane[j] = plane[j] << lshift;
-        }
-    }
-    return 0;
-}
-
-static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame )
-{
-    y4m_hnd_t *h = handle;
-
-    if( h->use_mmap )
-    {
-        pic->img.plane[0] = x264_cli_mmap( &h->mmap, h->frame_size * i_frame + h->seq_header_len, h->frame_size );
-        if( !pic->img.plane[0] )
-            return -1;
-    }
-    else if( i_frame > h->next_frame )
-    {
-        if( x264_is_regular_file( h->fh ) )
-            fseek( h->fh, h->frame_size * i_frame + h->seq_header_len, SEEK_SET );
-        else
-            while( i_frame > h->next_frame )
-            {
-                if( read_frame_internal( pic, h, 0 ) )
-                    return -1;
-                h->next_frame++;
-            }
-    }
-
-    if( read_frame_internal( pic, h, h->bit_depth & 7 ) )
-        return -1;
-
-    h->next_frame = i_frame+1;
-    return 0;
-}
-
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    y4m_hnd_t *h = handle;
-    if( h->use_mmap )
-        return x264_cli_munmap( &h->mmap, pic->img.plane[0] - h->frame_header_len, h->frame_size );
-    return 0;
-}
-
-static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height )
-{
-    y4m_hnd_t *h = handle;
-    return (h->use_mmap ? x264_cli_pic_init_noalloc : x264_cli_pic_alloc)( pic, csp, width, height );
-}
-
-static void picture_clean( cli_pic_t *pic, hnd_t handle )
-{
-    y4m_hnd_t *h = handle;
-    if( h->use_mmap )
-        memset( pic, 0, sizeof(cli_pic_t) );
-    else
-        x264_cli_pic_clean( pic );
-}
-
-static int close_file( hnd_t handle )
-{
-    y4m_hnd_t *h = handle;
-    if( !h || !h->fh )
-        return 0;
-    if( h->use_mmap )
-        x264_cli_mmap_close( &h->mmap );
-    fclose( h->fh );
-    free( h );
-    return 0;
-}
-
-const cli_input_t y4m_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/android/src/main/libenc/jni/libx264/output/flv.c b/android/src/main/libenc/jni/libx264/output/flv.c
deleted file mode 100755
index 54647dd..0000000
--- a/android/src/main/libenc/jni/libx264/output/flv.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/*****************************************************************************
- * flv.c: flv muxer
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Kieran Kunhya <kieran@kunhya.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-#include "flv_bytestream.h"
-
-#define CHECK(x)\
-do {\
-    if( (x) < 0 )\
-        return -1;\
-} while( 0 )
-
-typedef struct
-{
-    flv_buffer *c;
-
-    uint8_t *sei;
-    int sei_len;
-
-    int64_t i_fps_num;
-    int64_t i_fps_den;
-    int64_t i_framenum;
-
-    uint64_t i_framerate_pos;
-    uint64_t i_duration_pos;
-    uint64_t i_filesize_pos;
-    uint64_t i_bitrate_pos;
-
-    uint8_t b_write_length;
-    int64_t i_prev_dts;
-    int64_t i_prev_cts;
-    int64_t i_delay_time;
-    int64_t i_init_delta;
-    int i_delay_frames;
-
-    double d_timebase;
-    int b_vfr_input;
-    int b_dts_compress;
-
-    unsigned start;
-} flv_hnd_t;
-
-static int write_header( flv_buffer *c )
-{
-    flv_put_tag( c, "FLV" ); // Signature
-    flv_put_byte( c, 1 );    // Version
-    flv_put_byte( c, 1 );    // Video Only
-    flv_put_be32( c, 9 );    // DataOffset
-    flv_put_be32( c, 0 );    // PreviousTagSize0
-
-    return flv_flush_data( c );
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
-{
-    flv_hnd_t *p_flv = calloc( 1, sizeof(flv_hnd_t) );
-    if( p_flv )
-    {
-        flv_buffer *c = flv_create_writer( psz_filename );
-        if( c )
-        {
-            if( !write_header( c ) )
-            {
-                p_flv->c = c;
-                p_flv->b_dts_compress = opt->use_dts_compress;
-                *p_handle = p_flv;
-                return 0;
-            }
-
-            fclose( c->fp );
-            free( c->data );
-            free( c );
-        }
-        free( p_flv );
-    }
-
-    *p_handle = NULL;
-    return -1;
-}
-
-static int set_param( hnd_t handle, x264_param_t *p_param )
-{
-    flv_hnd_t *p_flv = handle;
-    flv_buffer *c = p_flv->c;
-
-    flv_put_byte( c, FLV_TAG_TYPE_META ); // Tag Type "script data"
-
-    int start = c->d_cur;
-    flv_put_be24( c, 0 ); // data length
-    flv_put_be24( c, 0 ); // timestamp
-    flv_put_be32( c, 0 ); // reserved
-
-    flv_put_byte( c, AMF_DATA_TYPE_STRING );
-    flv_put_amf_string( c, "onMetaData" );
-
-    flv_put_byte( c, AMF_DATA_TYPE_MIXEDARRAY );
-    flv_put_be32( c, 7 );
-
-    flv_put_amf_string( c, "width" );
-    flv_put_amf_double( c, p_param->i_width );
-
-    flv_put_amf_string( c, "height" );
-    flv_put_amf_double( c, p_param->i_height );
-
-    flv_put_amf_string( c, "framerate" );
-
-    if( !p_param->b_vfr_input )
-        flv_put_amf_double( c, (double)p_param->i_fps_num / p_param->i_fps_den );
-    else
-    {
-        p_flv->i_framerate_pos = c->d_cur + c->d_total + 1;
-        flv_put_amf_double( c, 0 ); // written at end of encoding
-    }
-
-    flv_put_amf_string( c, "videocodecid" );
-    flv_put_amf_double( c, FLV_CODECID_H264 );
-
-    flv_put_amf_string( c, "duration" );
-    p_flv->i_duration_pos = c->d_cur + c->d_total + 1;
-    flv_put_amf_double( c, 0 ); // written at end of encoding
-
-    flv_put_amf_string( c, "filesize" );
-    p_flv->i_filesize_pos = c->d_cur + c->d_total + 1;
-    flv_put_amf_double( c, 0 ); // written at end of encoding
-
-    flv_put_amf_string( c, "videodatarate" );
-    p_flv->i_bitrate_pos = c->d_cur + c->d_total + 1;
-    flv_put_amf_double( c, 0 ); // written at end of encoding
-
-    flv_put_amf_string( c, "" );
-    flv_put_byte( c, AMF_END_OF_OBJECT );
-
-    unsigned length = c->d_cur - start;
-    flv_rewrite_amf_be24( c, length - 10, start );
-
-    flv_put_be32( c, length + 1 ); // tag length
-
-    p_flv->i_fps_num = p_param->i_fps_num;
-    p_flv->i_fps_den = p_param->i_fps_den;
-    p_flv->d_timebase = (double)p_param->i_timebase_num / p_param->i_timebase_den;
-    p_flv->b_vfr_input = p_param->b_vfr_input;
-    p_flv->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0;
-
-    return 0;
-}
-
-static int write_headers( hnd_t handle, x264_nal_t *p_nal )
-{
-    flv_hnd_t *p_flv = handle;
-    flv_buffer *c = p_flv->c;
-
-    int sps_size = p_nal[0].i_payload;
-    int pps_size = p_nal[1].i_payload;
-    int sei_size = p_nal[2].i_payload;
-
-    // SEI
-    /* It is within the spec to write this as-is but for
-     * mplayer/ffmpeg playback this is deferred until before the first frame */
-
-    p_flv->sei = malloc( sei_size );
-    if( !p_flv->sei )
-        return -1;
-    p_flv->sei_len = sei_size;
-
-    memcpy( p_flv->sei, p_nal[2].p_payload, sei_size );
-
-    // SPS
-    uint8_t *sps = p_nal[0].p_payload + 4;
-
-    flv_put_byte( c, FLV_TAG_TYPE_VIDEO );
-    flv_put_be24( c, 0 ); // rewrite later
-    flv_put_be24( c, 0 ); // timestamp
-    flv_put_byte( c, 0 ); // timestamp extended
-    flv_put_be24( c, 0 ); // StreamID - Always 0
-    p_flv->start = c->d_cur; // needed for overwriting length
-
-    flv_put_byte( c, 7 | FLV_FRAME_KEY ); // Frametype and CodecID
-    flv_put_byte( c, 0 ); // AVC sequence header
-    flv_put_be24( c, 0 ); // composition time
-
-    flv_put_byte( c, 1 );      // version
-    flv_put_byte( c, sps[1] ); // profile
-    flv_put_byte( c, sps[2] ); // profile
-    flv_put_byte( c, sps[3] ); // level
-    flv_put_byte( c, 0xff );   // 6 bits reserved (111111) + 2 bits nal size length - 1 (11)
-    flv_put_byte( c, 0xe1 );   // 3 bits reserved (111) + 5 bits number of sps (00001)
-
-    flv_put_be16( c, sps_size - 4 );
-    flv_append_data( c, sps, sps_size - 4 );
-
-    // PPS
-    flv_put_byte( c, 1 ); // number of pps
-    flv_put_be16( c, pps_size - 4 );
-    flv_append_data( c, p_nal[1].p_payload + 4, pps_size - 4 );
-
-    // rewrite data length info
-    unsigned length = c->d_cur - p_flv->start;
-    flv_rewrite_amf_be24( c, length, p_flv->start - 10 );
-    flv_put_be32( c, length + 11 ); // Last tag size
-    CHECK( flv_flush_data( c ) );
-
-    return sei_size + sps_size + pps_size;
-}
-
-static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
-{
-    flv_hnd_t *p_flv = handle;
-    flv_buffer *c = p_flv->c;
-
-#define convert_timebase_ms( timestamp, timebase ) (int64_t)((timestamp) * (timebase) * 1000 + 0.5)
-
-    if( !p_flv->i_framenum )
-    {
-        p_flv->i_delay_time = p_picture->i_dts * -1;
-        if( !p_flv->b_dts_compress && p_flv->i_delay_time )
-            x264_cli_log( "flv", X264_LOG_INFO, "initial delay %"PRId64" ms\n",
-                          convert_timebase_ms( p_picture->i_pts + p_flv->i_delay_time, p_flv->d_timebase ) );
-    }
-
-    int64_t dts;
-    int64_t cts;
-    int64_t offset;
-
-    if( p_flv->b_dts_compress )
-    {
-        if( p_flv->i_framenum == 1 )
-            p_flv->i_init_delta = convert_timebase_ms( p_picture->i_dts + p_flv->i_delay_time, p_flv->d_timebase );
-        dts = p_flv->i_framenum > p_flv->i_delay_frames
-            ? convert_timebase_ms( p_picture->i_dts, p_flv->d_timebase )
-            : p_flv->i_framenum * p_flv->i_init_delta / (p_flv->i_delay_frames + 1);
-        cts = convert_timebase_ms( p_picture->i_pts, p_flv->d_timebase );
-    }
-    else
-    {
-        dts = convert_timebase_ms( p_picture->i_dts + p_flv->i_delay_time, p_flv->d_timebase );
-        cts = convert_timebase_ms( p_picture->i_pts + p_flv->i_delay_time, p_flv->d_timebase );
-    }
-    offset = cts - dts;
-
-    if( p_flv->i_framenum )
-    {
-        if( p_flv->i_prev_dts == dts )
-            x264_cli_log( "flv", X264_LOG_WARNING, "duplicate DTS %"PRId64" generated by rounding\n"
-                          "               decoding framerate cannot exceed 1000fps\n", dts );
-        if( p_flv->i_prev_cts == cts )
-            x264_cli_log( "flv", X264_LOG_WARNING, "duplicate CTS %"PRId64" generated by rounding\n"
-                          "               composition framerate cannot exceed 1000fps\n", cts );
-    }
-    p_flv->i_prev_dts = dts;
-    p_flv->i_prev_cts = cts;
-
-    // A new frame - write packet header
-    flv_put_byte( c, FLV_TAG_TYPE_VIDEO );
-    flv_put_be24( c, 0 ); // calculated later
-    flv_put_be24( c, dts );
-    flv_put_byte( c, dts >> 24 );
-    flv_put_be24( c, 0 );
-
-    p_flv->start = c->d_cur;
-    flv_put_byte( c, p_picture->b_keyframe ? FLV_FRAME_KEY : FLV_FRAME_INTER );
-    flv_put_byte( c, 1 ); // AVC NALU
-    flv_put_be24( c, offset );
-
-    if( p_flv->sei )
-    {
-        flv_append_data( c, p_flv->sei, p_flv->sei_len );
-        free( p_flv->sei );
-        p_flv->sei = NULL;
-    }
-    flv_append_data( c, p_nalu, i_size );
-
-    unsigned length = c->d_cur - p_flv->start;
-    flv_rewrite_amf_be24( c, length, p_flv->start - 10 );
-    flv_put_be32( c, 11 + length ); // Last tag size
-    CHECK( flv_flush_data( c ) );
-
-    p_flv->i_framenum++;
-
-    return i_size;
-}
-
-static int rewrite_amf_double( FILE *fp, uint64_t position, double value )
-{
-    uint64_t x = endian_fix64( flv_dbl2int( value ) );
-    return !fseek( fp, position, SEEK_SET ) && fwrite( &x, 8, 1, fp ) == 1 ? 0 : -1;
-}
-
-#undef CHECK
-#define CHECK(x)\
-do {\
-    if( (x) < 0 )\
-        goto error;\
-} while( 0 )
-
-static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
-{
-    int ret = -1;
-    flv_hnd_t *p_flv = handle;
-    flv_buffer *c = p_flv->c;
-
-    CHECK( flv_flush_data( c ) );
-
-    double total_duration = (2 * largest_pts - second_largest_pts) * p_flv->d_timebase;
-
-    if( x264_is_regular_file( c->fp ) && total_duration > 0 )
-    {
-        double framerate;
-        uint64_t filesize = ftell( c->fp );
-
-        if( p_flv->i_framerate_pos )
-        {
-            framerate = (double)p_flv->i_framenum / total_duration;
-            CHECK( rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ) );
-        }
-
-        CHECK( rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ) );
-        CHECK( rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ) );
-        CHECK( rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ) );
-    }
-    ret = 0;
-
-error:
-    fclose( c->fp );
-    free( c->data );
-    free( c );
-    free( p_flv );
-
-    return ret;
-}
-
-const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/android/src/main/libenc/jni/libx264/output/flv_bytestream.c b/android/src/main/libenc/jni/libx264/output/flv_bytestream.c
deleted file mode 100755
index 96f5377..0000000
--- a/android/src/main/libenc/jni/libx264/output/flv_bytestream.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*****************************************************************************
- * flv_bytestream.c: flv muxer utilities
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Kieran Kunhya <kieran@kunhya.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-#include "flv_bytestream.h"
-
-uint64_t flv_dbl2int( double value )
-{
-    return (union {double f; uint64_t i;}){value}.i;
-}
-
-/* Put functions  */
-
-void flv_put_byte( flv_buffer *c, uint8_t b )
-{
-    flv_append_data( c, &b, 1 );
-}
-
-void flv_put_be32( flv_buffer *c, uint32_t val )
-{
-    flv_put_byte( c, val >> 24 );
-    flv_put_byte( c, val >> 16 );
-    flv_put_byte( c, val >> 8 );
-    flv_put_byte( c, val );
-}
-
-void flv_put_be64( flv_buffer *c, uint64_t val )
-{
-    flv_put_be32( c, val >> 32 );
-    flv_put_be32( c, val );
-}
-
-void flv_put_be16( flv_buffer *c, uint16_t val )
-{
-    flv_put_byte( c, val >> 8 );
-    flv_put_byte( c, val );
-}
-
-void flv_put_be24( flv_buffer *c, uint32_t val )
-{
-    flv_put_be16( c, val >> 8 );
-    flv_put_byte( c, val );
-}
-
-void flv_put_tag( flv_buffer *c, const char *tag )
-{
-    while( *tag )
-        flv_put_byte( c, *tag++ );
-}
-
-void flv_put_amf_string( flv_buffer *c, const char *str )
-{
-    uint16_t len = strlen( str );
-    flv_put_be16( c, len );
-    flv_append_data( c, (uint8_t*)str, len );
-}
-
-void flv_put_amf_double( flv_buffer *c, double d )
-{
-    flv_put_byte( c, AMF_DATA_TYPE_NUMBER );
-    flv_put_be64( c, flv_dbl2int( d ) );
-}
-
-/* flv writing functions */
-
-flv_buffer *flv_create_writer( const char *filename )
-{
-    flv_buffer *c = calloc( 1, sizeof(flv_buffer) );
-    if( !c )
-        return NULL;
-
-    if( !strcmp( filename, "-" ) )
-        c->fp = stdout;
-    else
-        c->fp = x264_fopen( filename, "wb" );
-    if( !c->fp )
-    {
-        free( c );
-        return NULL;
-    }
-
-    return c;
-}
-
-int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size )
-{
-    unsigned ns = c->d_cur + size;
-
-    if( ns > c->d_max )
-    {
-        void *dp;
-        unsigned dn = 16;
-        while( ns > dn )
-            dn <<= 1;
-
-        dp = realloc( c->data, dn );
-        if( !dp )
-            return -1;
-
-        c->data = dp;
-        c->d_max = dn;
-    }
-
-    memcpy( c->data + c->d_cur, data, size );
-
-    c->d_cur = ns;
-
-    return 0;
-}
-
-void flv_rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start )
-{
-     *(c->data + start + 0) = length >> 16;
-     *(c->data + start + 1) = length >> 8;
-     *(c->data + start + 2) = length >> 0;
-}
-
-int flv_flush_data( flv_buffer *c )
-{
-    if( !c->d_cur )
-        return 0;
-
-    if( fwrite( c->data, c->d_cur, 1, c->fp ) != 1 )
-        return -1;
-
-    c->d_total += c->d_cur;
-
-    c->d_cur = 0;
-
-    return 0;
-}
diff --git a/android/src/main/libenc/jni/libx264/output/flv_bytestream.h b/android/src/main/libenc/jni/libx264/output/flv_bytestream.h
deleted file mode 100755
index c70c086..0000000
--- a/android/src/main/libenc/jni/libx264/output/flv_bytestream.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*****************************************************************************
- * flv_bytestream.h: flv muxer utilities
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Kieran Kunhya <kieran@kunhya.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_FLV_BYTESTREAM_H
-#define X264_FLV_BYTESTREAM_H
-
-/* offsets for packed values */
-#define FLV_AUDIO_SAMPLESSIZE_OFFSET 1
-#define FLV_AUDIO_SAMPLERATE_OFFSET  2
-#define FLV_AUDIO_CODECID_OFFSET     4
-
-#define FLV_VIDEO_FRAMETYPE_OFFSET   4
-
-/* bitmasks to isolate specific values */
-#define FLV_AUDIO_CHANNEL_MASK    0x01
-#define FLV_AUDIO_SAMPLESIZE_MASK 0x02
-#define FLV_AUDIO_SAMPLERATE_MASK 0x0c
-#define FLV_AUDIO_CODECID_MASK    0xf0
-
-#define FLV_VIDEO_CODECID_MASK    0x0f
-#define FLV_VIDEO_FRAMETYPE_MASK  0xf0
-
-#define AMF_END_OF_OBJECT         0x09
-
-enum
-{
-    FLV_HEADER_FLAG_HASVIDEO = 1,
-    FLV_HEADER_FLAG_HASAUDIO = 4,
-};
-
-enum
-{
-    FLV_TAG_TYPE_AUDIO = 0x08,
-    FLV_TAG_TYPE_VIDEO = 0x09,
-    FLV_TAG_TYPE_META  = 0x12,
-};
-
-enum
-{
-    FLV_MONO   = 0,
-    FLV_STEREO = 1,
-};
-
-enum
-{
-    FLV_SAMPLESSIZE_8BIT  = 0,
-    FLV_SAMPLESSIZE_16BIT = 1 << FLV_AUDIO_SAMPLESSIZE_OFFSET,
-};
-
-enum
-{
-    FLV_SAMPLERATE_SPECIAL = 0, /**< signifies 5512Hz and 8000Hz in the case of NELLYMOSER */
-    FLV_SAMPLERATE_11025HZ = 1 << FLV_AUDIO_SAMPLERATE_OFFSET,
-    FLV_SAMPLERATE_22050HZ = 2 << FLV_AUDIO_SAMPLERATE_OFFSET,
-    FLV_SAMPLERATE_44100HZ = 3 << FLV_AUDIO_SAMPLERATE_OFFSET,
-};
-
-enum
-{
-    FLV_CODECID_MP3 = 2 << FLV_AUDIO_CODECID_OFFSET,
-    FLV_CODECID_AAC = 10<< FLV_AUDIO_CODECID_OFFSET,
-};
-
-enum
-{
-    FLV_CODECID_H264 = 7,
-};
-
-enum
-{
-    FLV_FRAME_KEY   = 1 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
-    FLV_FRAME_INTER = 2 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
-};
-
-typedef enum
-{
-    AMF_DATA_TYPE_NUMBER      = 0x00,
-    AMF_DATA_TYPE_BOOL        = 0x01,
-    AMF_DATA_TYPE_STRING      = 0x02,
-    AMF_DATA_TYPE_OBJECT      = 0x03,
-    AMF_DATA_TYPE_NULL        = 0x05,
-    AMF_DATA_TYPE_UNDEFINED   = 0x06,
-    AMF_DATA_TYPE_REFERENCE   = 0x07,
-    AMF_DATA_TYPE_MIXEDARRAY  = 0x08,
-    AMF_DATA_TYPE_OBJECT_END  = 0x09,
-    AMF_DATA_TYPE_ARRAY       = 0x0a,
-    AMF_DATA_TYPE_DATE        = 0x0b,
-    AMF_DATA_TYPE_LONG_STRING = 0x0c,
-    AMF_DATA_TYPE_UNSUPPORTED = 0x0d,
-} AMFDataType;
-
-typedef struct flv_buffer
-{
-    uint8_t *data;
-    unsigned d_cur;
-    unsigned d_max;
-    FILE *fp;
-    uint64_t d_total;
-} flv_buffer;
-
-flv_buffer *flv_create_writer( const char *filename );
-int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size );
-int flv_write_byte( flv_buffer *c, uint8_t *byte );
-int flv_flush_data( flv_buffer *c );
-void flv_rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start );
-
-uint64_t flv_dbl2int( double value );
-void flv_put_byte( flv_buffer *c, uint8_t b );
-void flv_put_be32( flv_buffer *c, uint32_t val );
-void flv_put_be64( flv_buffer *c, uint64_t val );
-void flv_put_be16( flv_buffer *c, uint16_t val );
-void flv_put_be24( flv_buffer *c, uint32_t val );
-void flv_put_tag( flv_buffer *c, const char *tag );
-void flv_put_amf_string( flv_buffer *c, const char *str );
-void flv_put_amf_double( flv_buffer *c, double d );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/output/matroska.c b/android/src/main/libenc/jni/libx264/output/matroska.c
deleted file mode 100755
index 91f42e5..0000000
--- a/android/src/main/libenc/jni/libx264/output/matroska.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*****************************************************************************
- * matroska.c: matroska muxer
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Mike Matsnev <mike@haali.su>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-#include "matroska_ebml.h"
-
-typedef struct
-{
-    mk_writer *w;
-
-    int width, height, d_width, d_height;
-
-    int display_size_units;
-    int stereo_mode;
-
-    int64_t frame_duration;
-
-    char b_writing_frame;
-    uint32_t i_timebase_num;
-    uint32_t i_timebase_den;
-
-} mkv_hnd_t;
-
-static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
-{
-    *p_handle = NULL;
-    mkv_hnd_t *p_mkv = calloc( 1, sizeof(mkv_hnd_t) );
-    if( !p_mkv )
-        return -1;
-
-    p_mkv->w = mk_create_writer( psz_filename );
-    if( !p_mkv->w )
-    {
-        free( p_mkv );
-        return -1;
-    }
-
-    *p_handle = p_mkv;
-
-    return 0;
-}
-
-#define STEREO_COUNT 7
-static const uint8_t stereo_modes[STEREO_COUNT] = {5,9,7,1,3,13,0};
-static const uint8_t stereo_w_div[STEREO_COUNT] = {1,2,1,2,1,1,1};
-static const uint8_t stereo_h_div[STEREO_COUNT] = {1,1,2,1,2,1,1};
-
-static int set_param( hnd_t handle, x264_param_t *p_param )
-{
-    mkv_hnd_t *p_mkv = handle;
-    int64_t dw, dh;
-
-    if( p_param->i_fps_num > 0 && !p_param->b_vfr_input )
-    {
-        p_mkv->frame_duration = (int64_t)p_param->i_fps_den *
-                                (int64_t)1000000000 / p_param->i_fps_num;
-    }
-    else
-    {
-        p_mkv->frame_duration = 0;
-    }
-
-    dw = p_mkv->width = p_param->i_width;
-    dh = p_mkv->height = p_param->i_height;
-    p_mkv->display_size_units = DS_PIXELS;
-    p_mkv->stereo_mode = -1;
-    if( p_param->i_frame_packing >= 0 && p_param->i_frame_packing < STEREO_COUNT )
-    {
-        p_mkv->stereo_mode = stereo_modes[p_param->i_frame_packing];
-        dw /= stereo_w_div[p_param->i_frame_packing];
-        dh /= stereo_h_div[p_param->i_frame_packing];
-    }
-    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height
-        && p_param->vui.i_sar_width != p_param->vui.i_sar_height )
-    {
-        if ( p_param->vui.i_sar_width > p_param->vui.i_sar_height ) {
-            dw = dw * p_param->vui.i_sar_width / p_param->vui.i_sar_height;
-        } else {
-            dh = dh * p_param->vui.i_sar_height / p_param->vui.i_sar_width;
-        }
-    }
-    p_mkv->d_width = (int)dw;
-    p_mkv->d_height = (int)dh;
-
-    p_mkv->i_timebase_num = p_param->i_timebase_num;
-    p_mkv->i_timebase_den = p_param->i_timebase_den;
-
-    return 0;
-}
-
-static int write_headers( hnd_t handle, x264_nal_t *p_nal )
-{
-    mkv_hnd_t *p_mkv = handle;
-
-    int sps_size = p_nal[0].i_payload - 4;
-    int pps_size = p_nal[1].i_payload - 4;
-    int sei_size = p_nal[2].i_payload;
-
-    uint8_t *sps = p_nal[0].p_payload + 4;
-    uint8_t *pps = p_nal[1].p_payload + 4;
-    uint8_t *sei = p_nal[2].p_payload;
-
-    int ret;
-    uint8_t *avcC;
-    int avcC_len;
-
-    if( !p_mkv->width || !p_mkv->height ||
-        !p_mkv->d_width || !p_mkv->d_height )
-        return -1;
-
-    avcC_len = 5 + 1 + 2 + sps_size + 1 + 2 + pps_size;
-    avcC = malloc( avcC_len );
-    if( !avcC )
-        return -1;
-
-    avcC[0] = 1;
-    avcC[1] = sps[1];
-    avcC[2] = sps[2];
-    avcC[3] = sps[3];
-    avcC[4] = 0xff; // nalu size length is four bytes
-    avcC[5] = 0xe1; // one sps
-
-    avcC[6] = sps_size >> 8;
-    avcC[7] = sps_size;
-
-    memcpy( avcC+8, sps, sps_size );
-
-    avcC[8+sps_size] = 1; // one pps
-    avcC[9+sps_size] = pps_size >> 8;
-    avcC[10+sps_size] = pps_size;
-
-    memcpy( avcC+11+sps_size, pps, pps_size );
-
-    ret = mk_write_header( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
-                           avcC, avcC_len, p_mkv->frame_duration, 50000,
-                           p_mkv->width, p_mkv->height,
-                           p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode );
-    free( avcC );
-
-    if( ret < 0 )
-        return ret;
-
-    // SEI
-
-    if( !p_mkv->b_writing_frame )
-    {
-        if( mk_start_frame( p_mkv->w ) < 0 )
-            return -1;
-        p_mkv->b_writing_frame = 1;
-    }
-    if( mk_add_frame_data( p_mkv->w, sei, sei_size ) < 0 )
-        return -1;
-
-    return sei_size + sps_size + pps_size;
-}
-
-static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
-{
-    mkv_hnd_t *p_mkv = handle;
-
-    if( !p_mkv->b_writing_frame )
-    {
-        if( mk_start_frame( p_mkv->w ) < 0 )
-            return -1;
-        p_mkv->b_writing_frame = 1;
-    }
-
-    if( mk_add_frame_data( p_mkv->w, p_nalu, i_size ) < 0 )
-        return -1;
-
-    int64_t i_stamp = (int64_t)((p_picture->i_pts * 1e9 * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5);
-
-    p_mkv->b_writing_frame = 0;
-
-    if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
-        return -1;
-
-    return i_size;
-}
-
-static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
-{
-    mkv_hnd_t *p_mkv = handle;
-    int ret;
-    int64_t i_last_delta;
-
-    i_last_delta = p_mkv->i_timebase_den ? (int64_t)(((largest_pts - second_largest_pts) * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5) : 0;
-
-    ret = mk_close( p_mkv->w, i_last_delta );
-
-    free( p_mkv );
-
-    return ret;
-}
-
-const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/android/src/main/libenc/jni/libx264/output/matroska_ebml.c b/android/src/main/libenc/jni/libx264/output/matroska_ebml.c
deleted file mode 100755
index f2b04c7..0000000
--- a/android/src/main/libenc/jni/libx264/output/matroska_ebml.c
+++ /dev/null
@@ -1,513 +0,0 @@
-/*****************************************************************************
- * matroska_ebml.c: matroska muxer utilities
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Mike Matsnev <mike@haali.su>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-#include "matroska_ebml.h"
-
-#define CLSIZE 1048576
-#define CHECK(x)\
-do {\
-    if( (x) < 0 )\
-        return -1;\
-} while( 0 )
-
-struct mk_context
-{
-    struct mk_context *next, **prev, *parent;
-    mk_writer *owner;
-    unsigned id;
-
-    void *data;
-    unsigned d_cur, d_max;
-};
-
-typedef struct mk_context mk_context;
-
-struct mk_writer
-{
-    FILE *fp;
-
-    unsigned duration_ptr;
-
-    mk_context *root, *cluster, *frame;
-    mk_context *freelist;
-    mk_context *actlist;
-
-    int64_t def_duration;
-    int64_t timescale;
-    int64_t cluster_tc_scaled;
-    int64_t frame_tc, max_frame_tc;
-
-    char wrote_header, in_frame, keyframe, skippable;
-};
-
-static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
-{
-    mk_context *c;
-
-    if( w->freelist )
-    {
-        c = w->freelist;
-        w->freelist = w->freelist->next;
-    }
-    else
-    {
-        c = calloc( 1, sizeof(mk_context) );
-        if( !c )
-            return NULL;
-    }
-
-    c->parent = parent;
-    c->owner = w;
-    c->id = id;
-
-    if( c->owner->actlist )
-        c->owner->actlist->prev = &c->next;
-    c->next = c->owner->actlist;
-    c->prev = &c->owner->actlist;
-    c->owner->actlist = c;
-
-    return c;
-}
-
-static int mk_append_context_data( mk_context *c, const void *data, unsigned size )
-{
-    unsigned ns = c->d_cur + size;
-
-    if( ns > c->d_max )
-    {
-        void *dp;
-        unsigned dn = c->d_max ? c->d_max << 1 : 16;
-        while( ns > dn )
-            dn <<= 1;
-
-        dp = realloc( c->data, dn );
-        if( !dp )
-            return -1;
-
-        c->data = dp;
-        c->d_max = dn;
-    }
-
-    memcpy( (char*)c->data + c->d_cur, data, size );
-
-    c->d_cur = ns;
-
-    return 0;
-}
-
-static int mk_write_id( mk_context *c, unsigned id )
-{
-    unsigned char c_id[4] = { id >> 24, id >> 16, id >> 8, id };
-
-    if( c_id[0] )
-        return mk_append_context_data( c, c_id, 4 );
-    if( c_id[1] )
-        return mk_append_context_data( c, c_id+1, 3 );
-    if( c_id[2] )
-        return mk_append_context_data( c, c_id+2, 2 );
-    return mk_append_context_data( c, c_id+3, 1 );
-}
-
-static int mk_write_size( mk_context *c, unsigned size )
-{
-    unsigned char c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };
-
-    if( size < 0x7f )
-    {
-        c_size[4] |= 0x80;
-        return mk_append_context_data( c, c_size+4, 1 );
-    }
-    if( size < 0x3fff )
-    {
-        c_size[3] |= 0x40;
-        return mk_append_context_data( c, c_size+3, 2 );
-    }
-    if( size < 0x1fffff )
-    {
-        c_size[2] |= 0x20;
-        return mk_append_context_data( c, c_size+2, 3 );
-    }
-    if( size < 0x0fffffff )
-    {
-        c_size[1] |= 0x10;
-        return mk_append_context_data( c, c_size+1, 4 );
-    }
-    return mk_append_context_data( c, c_size, 5 );
-}
-
-static int mk_flush_context_id( mk_context *c )
-{
-    unsigned char ff = 0xff;
-
-    if( !c->id )
-        return 0;
-
-    CHECK( mk_write_id( c->parent, c->id ) );
-    CHECK( mk_append_context_data( c->parent, &ff, 1 ) );
-
-    c->id = 0;
-
-    return 0;
-}
-
-static int mk_flush_context_data( mk_context *c )
-{
-    if( !c->d_cur )
-        return 0;
-
-    if( c->parent )
-        CHECK( mk_append_context_data( c->parent, c->data, c->d_cur ) );
-    else if( fwrite( c->data, c->d_cur, 1, c->owner->fp ) != 1 )
-        return -1;
-
-    c->d_cur = 0;
-
-    return 0;
-}
-
-static int mk_close_context( mk_context *c, unsigned *off )
-{
-    if( c->id )
-    {
-        CHECK( mk_write_id( c->parent, c->id ) );
-        CHECK( mk_write_size( c->parent, c->d_cur ) );
-    }
-
-    if( c->parent && off )
-        *off += c->parent->d_cur;
-
-    CHECK( mk_flush_context_data( c ) );
-
-    if( c->next )
-        c->next->prev = c->prev;
-    *(c->prev) = c->next;
-    c->next = c->owner->freelist;
-    c->owner->freelist = c;
-
-    return 0;
-}
-
-static void mk_destroy_contexts( mk_writer *w )
-{
-    mk_context *next;
-
-    for( mk_context *cur = w->freelist; cur; cur = next )
-    {
-        next = cur->next;
-        free( cur->data );
-        free( cur );
-    }
-
-    for( mk_context *cur = w->actlist; cur; cur = next )
-    {
-        next = cur->next;
-        free( cur->data );
-        free( cur );
-    }
-
-    w->freelist = w->actlist = w->root = NULL;
-}
-
-static int mk_write_string( mk_context *c, unsigned id, const char *str )
-{
-    size_t len = strlen( str );
-
-    CHECK( mk_write_id( c, id ) );
-    CHECK( mk_write_size( c, len ) );
-    CHECK( mk_append_context_data( c, str, len ) );
-    return 0;
-}
-
-static int mk_write_bin( mk_context *c, unsigned id, const void *data, unsigned size )
-{
-    CHECK( mk_write_id( c, id ) );
-    CHECK( mk_write_size( c, size ) );
-    CHECK( mk_append_context_data( c, data, size ) );
-    return 0;
-}
-
-static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
-{
-    unsigned char c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
-    unsigned i = 0;
-
-    CHECK( mk_write_id( c, id ) );
-    while( i < 7 && !c_ui[i] )
-        ++i;
-    CHECK( mk_write_size( c, 8 - i ) );
-    CHECK( mk_append_context_data( c, c_ui+i, 8 - i ) );
-    return 0;
-}
-
-static int mk_write_float_raw( mk_context *c, float f )
-{
-    union
-    {
-        float f;
-        unsigned u;
-    } u;
-    unsigned char c_f[4];
-
-    u.f = f;
-    c_f[0] = u.u >> 24;
-    c_f[1] = u.u >> 16;
-    c_f[2] = u.u >> 8;
-    c_f[3] = u.u;
-
-    return mk_append_context_data( c, c_f, 4 );
-}
-
-static int mk_write_float( mk_context *c, unsigned id, float f )
-{
-    CHECK( mk_write_id( c, id ) );
-    CHECK( mk_write_size( c, 4 ) );
-    CHECK( mk_write_float_raw( c, f ) );
-    return 0;
-}
-
-mk_writer *mk_create_writer( const char *filename )
-{
-    mk_writer *w = calloc( 1, sizeof(mk_writer) );
-    if( !w )
-        return NULL;
-
-    w->root = mk_create_context( w, NULL, 0 );
-    if( !w->root )
-    {
-        free( w );
-        return NULL;
-    }
-
-    if( !strcmp( filename, "-" ) )
-        w->fp = stdout;
-    else
-        w->fp = x264_fopen( filename, "wb" );
-    if( !w->fp )
-    {
-        mk_destroy_contexts( w );
-        free( w );
-        return NULL;
-    }
-
-    w->timescale = 1000000;
-
-    return w;
-}
-
-int mk_write_header( mk_writer *w, const char *writing_app,
-                     const char *codec_id,
-                     const void *codec_private, unsigned codec_private_size,
-                     int64_t default_frame_duration,
-                     int64_t timescale,
-                     unsigned width, unsigned height,
-                     unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode )
-{
-    mk_context  *c, *ti, *v;
-
-    if( w->wrote_header )
-        return -1;
-
-    w->timescale = timescale;
-    w->def_duration = default_frame_duration;
-
-    if( !(c = mk_create_context( w, w->root, 0x1a45dfa3 )) ) // EBML
-        return -1;
-    CHECK( mk_write_uint( c, 0x4286, 1 ) ); // EBMLVersion
-    CHECK( mk_write_uint( c, 0x42f7, 1 ) ); // EBMLReadVersion
-    CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
-    CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
-    CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
-    CHECK( mk_write_uint( c, 0x4287, stereo_mode >= 0 ? 3 : 2 ) ); // DocTypeVersion
-    CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadVersion
-    CHECK( mk_close_context( c, 0 ) );
-
-    if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment
-        return -1;
-    CHECK( mk_flush_context_id( c ) );
-    CHECK( mk_close_context( c, 0 ) );
-
-    if( !(c = mk_create_context( w, w->root, 0x1549a966 )) ) // SegmentInfo
-        return -1;
-    CHECK( mk_write_string( c, 0x4d80, "Haali Matroska Writer b0" ) ); // MuxingApp
-    CHECK( mk_write_string( c, 0x5741, writing_app ) ); // WritingApp
-    CHECK( mk_write_uint( c, 0x2ad7b1, w->timescale ) ); // TimecodeScale
-    CHECK( mk_write_float( c, 0x4489, 0) ); // Duration
-    w->duration_ptr = c->d_cur - 4;
-    CHECK( mk_close_context( c, &w->duration_ptr ) );
-
-    if( !(c = mk_create_context( w, w->root, 0x1654ae6b )) ) // Tracks
-        return -1;
-    if( !(ti = mk_create_context( w, c, 0xae )) ) // TrackEntry
-        return -1;
-    CHECK( mk_write_uint( ti, 0xd7, 1 ) ); // TrackNumber
-    CHECK( mk_write_uint( ti, 0x73c5, 1 ) ); // TrackUID
-    CHECK( mk_write_uint( ti, 0x83, 1 ) ); // TrackType
-    CHECK( mk_write_uint( ti, 0x9c, 0 ) ); // FlagLacing
-    CHECK( mk_write_string( ti, 0x86, codec_id ) ); // CodecID
-    if( codec_private_size )
-        CHECK( mk_write_bin( ti, 0x63a2, codec_private, codec_private_size ) ); // CodecPrivate
-    if( default_frame_duration )
-        CHECK( mk_write_uint( ti, 0x23e383, default_frame_duration ) ); // DefaultDuration
-
-    if( !(v = mk_create_context( w, ti, 0xe0 ) ) ) // Video
-        return -1;
-    CHECK( mk_write_uint( v, 0xb0, width ) ); // PixelWidth
-    CHECK( mk_write_uint( v, 0xba, height ) ); // PixelHeight
-    CHECK( mk_write_uint( v, 0x54b2, display_size_units ) ); // DisplayUnit
-    CHECK( mk_write_uint( v, 0x54b0, d_width ) ); // DisplayWidth
-    CHECK( mk_write_uint( v, 0x54ba, d_height ) ); // DisplayHeight
-    if( stereo_mode >= 0 )
-        CHECK( mk_write_uint( v, 0x53b8, stereo_mode ) ); // StereoMode
-    CHECK( mk_close_context( v, 0 ) );
-
-    CHECK( mk_close_context( ti, 0 ) );
-
-    CHECK( mk_close_context( c, 0 ) );
-
-    CHECK( mk_flush_context_data( w->root ) );
-
-    w->wrote_header = 1;
-
-    return 0;
-}
-
-static int mk_close_cluster( mk_writer *w )
-{
-    if( w->cluster == NULL )
-        return 0;
-    CHECK( mk_close_context( w->cluster, 0 ) );
-    w->cluster = NULL;
-    CHECK( mk_flush_context_data( w->root ) );
-    return 0;
-}
-
-static int mk_flush_frame( mk_writer *w )
-{
-    int64_t delta;
-    unsigned fsize;
-    unsigned char c_delta_flags[3];
-
-    if( !w->in_frame )
-        return 0;
-
-    delta = w->frame_tc/w->timescale - w->cluster_tc_scaled;
-    if( delta > 32767ll || delta < -32768ll )
-        CHECK( mk_close_cluster( w ) );
-
-    if( !w->cluster )
-    {
-        w->cluster_tc_scaled = w->frame_tc / w->timescale;
-        w->cluster = mk_create_context( w, w->root, 0x1f43b675 ); // Cluster
-        if( !w->cluster )
-            return -1;
-
-        CHECK( mk_write_uint( w->cluster, 0xe7, w->cluster_tc_scaled ) ); // Timecode
-
-        delta = 0;
-    }
-
-    fsize = w->frame ? w->frame->d_cur : 0;
-
-    CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
-    CHECK( mk_write_size( w->cluster, fsize + 4 ) ); // Size
-    CHECK( mk_write_size( w->cluster, 1 ) ); // TrackNumber
-
-    c_delta_flags[0] = delta >> 8;
-    c_delta_flags[1] = delta;
-    c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
-    CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) ); // Timecode, Flags
-    if( w->frame )
-    {
-        CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) ); // Data
-        w->frame->d_cur = 0;
-    }
-
-    w->in_frame = 0;
-
-    if( w->cluster->d_cur > CLSIZE )
-        CHECK( mk_close_cluster( w ) );
-
-    return 0;
-}
-
-int mk_start_frame( mk_writer *w )
-{
-    if( mk_flush_frame( w ) < 0 )
-        return -1;
-
-    w->in_frame  = 1;
-    w->keyframe  = 0;
-    w->skippable = 0;
-
-    return 0;
-}
-
-int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
-{
-    if( !w->in_frame )
-        return -1;
-
-    w->frame_tc  = timestamp;
-    w->keyframe  = keyframe  != 0;
-    w->skippable = skippable != 0;
-
-    if( w->max_frame_tc < timestamp )
-        w->max_frame_tc = timestamp;
-
-    return 0;
-}
-
-int mk_add_frame_data( mk_writer *w, const void *data, unsigned size )
-{
-    if( !w->in_frame )
-        return -1;
-
-    if( !w->frame )
-        if( !(w->frame = mk_create_context( w, NULL, 0 )) )
-        return -1;
-
-    return mk_append_context_data( w->frame, data, size );
-}
-
-int mk_close( mk_writer *w, int64_t last_delta )
-{
-    int ret = 0;
-    if( mk_flush_frame( w ) < 0 || mk_close_cluster( w ) < 0 )
-        ret = -1;
-    if( w->wrote_header && x264_is_regular_file( w->fp ) )
-    {
-        int64_t last_frametime = w->def_duration ? w->def_duration : last_delta;
-        int64_t total_duration = w->max_frame_tc + last_frametime;
-        if( fseek( w->fp, w->duration_ptr, SEEK_SET ) ||
-            mk_write_float_raw( w->root, (float)((double)total_duration / w->timescale) ) < 0 ||
-            mk_flush_context_data( w->root ) < 0 )
-            ret = -1;
-    }
-    mk_destroy_contexts( w );
-    fclose( w->fp );
-    free( w );
-    return ret;
-}
diff --git a/android/src/main/libenc/jni/libx264/output/matroska_ebml.h b/android/src/main/libenc/jni/libx264/output/matroska_ebml.h
deleted file mode 100755
index 7d354c2..0000000
--- a/android/src/main/libenc/jni/libx264/output/matroska_ebml.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*****************************************************************************
- * matroska_ebml.h: matroska muxer utilities
- *****************************************************************************
- * Copyright (C) 2005-2016 x264 project
- *
- * Authors: Mike Matsnev <mike@haali.su>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_MATROSKA_EBML_H
-#define X264_MATROSKA_EBML_H
-
-/* Matroska display size units from the spec */
-#define DS_PIXELS        0
-#define DS_CM            1
-#define DS_INCHES        2
-#define DS_ASPECT_RATIO  3
-
-typedef struct mk_writer mk_writer;
-
-mk_writer *mk_create_writer( const char *filename );
-
-int mk_write_header( mk_writer *w, const char *writing_app,
-                     const char *codec_id,
-                     const void *codec_private, unsigned codec_private_size,
-                     int64_t default_frame_duration,
-                     int64_t timescale,
-                     unsigned width, unsigned height,
-                     unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode );
-
-int mk_start_frame( mk_writer *w );
-int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
-int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
-int mk_close( mk_writer *w, int64_t last_delta );
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/output/mp4.c b/android/src/main/libenc/jni/libx264/output/mp4.c
deleted file mode 100755
index f21619a..0000000
--- a/android/src/main/libenc/jni/libx264/output/mp4.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/*****************************************************************************
- * mp4.c: mp4 muxer
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-#include <gpac/isomedia.h>
-
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-typedef struct
-{
-    GF_ISOFile *p_file;
-    GF_AVCConfig *p_config;
-    GF_ISOSample *p_sample;
-    int i_track;
-    uint32_t i_descidx;
-    uint64_t i_time_res;
-    int64_t i_time_inc;
-    int64_t i_delay_time;
-    int64_t i_init_delta;
-    int i_numframe;
-    int i_delay_frames;
-    int b_dts_compress;
-    int i_dts_compress_multiplier;
-    int i_data_size;
-} mp4_hnd_t;
-
-static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track )
-{
-    u32 count, di, timescale, time_wnd, rate;
-    u64 offset;
-    Double br;
-    GF_ESD *esd;
-
-    esd = gf_isom_get_esd( p_file, i_track, 1 );
-    if( !esd )
-        return;
-
-    esd->decoderConfig->avgBitrate = 0;
-    esd->decoderConfig->maxBitrate = 0;
-    rate = time_wnd = 0;
-
-    timescale = gf_isom_get_media_timescale( p_file, i_track );
-    count = gf_isom_get_sample_count( p_file, i_track );
-    for( u32 i = 0; i < count; i++ )
-    {
-        GF_ISOSample *samp = gf_isom_get_sample_info( p_file, i_track, i+1, &di, &offset );
-        if( !samp )
-        {
-            x264_cli_log( "mp4", X264_LOG_ERROR, "failure reading back frame %u\n", i );
-            break;
-        }
-
-        if( esd->decoderConfig->bufferSizeDB < samp->dataLength )
-            esd->decoderConfig->bufferSizeDB = samp->dataLength;
-
-        esd->decoderConfig->avgBitrate += samp->dataLength;
-        rate += samp->dataLength;
-        if( samp->DTS > time_wnd + timescale )
-        {
-            if( rate > esd->decoderConfig->maxBitrate )
-                esd->decoderConfig->maxBitrate = rate;
-            time_wnd = samp->DTS;
-            rate = 0;
-        }
-
-        gf_isom_sample_del( &samp );
-    }
-
-    br = (Double)(s64)gf_isom_get_media_duration( p_file, i_track );
-    br /= timescale;
-    esd->decoderConfig->avgBitrate = (u32)(esd->decoderConfig->avgBitrate / br);
-    /*move to bps*/
-    esd->decoderConfig->avgBitrate *= 8;
-    esd->decoderConfig->maxBitrate *= 8;
-
-    gf_isom_change_mpeg4_description( p_file, i_track, 1, esd );
-    gf_odf_desc_del( (GF_Descriptor*)esd );
-}
-
-static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
-{
-    mp4_hnd_t *p_mp4 = handle;
-
-    if( !p_mp4 )
-        return 0;
-
-    if( p_mp4->p_config )
-        gf_odf_avc_cfg_del( p_mp4->p_config );
-
-    if( p_mp4->p_sample )
-    {
-        if( p_mp4->p_sample->data )
-            free( p_mp4->p_sample->data );
-
-        p_mp4->p_sample->dataLength = 0;
-        gf_isom_sample_del( &p_mp4->p_sample );
-    }
-
-    if( p_mp4->p_file )
-    {
-        if( p_mp4->i_track )
-        {
-            /* The mdhd duration is defined as CTS[final] - CTS[0] + duration of last frame.
-             * The mdhd duration (in seconds) should be able to be longer than the tkhd duration since the track is managed by edts.
-             * So, if mdhd duration is equal to the last DTS or less, we give the last composition time delta to the last sample duration.
-             * And then, the mdhd duration is updated, but it time-wise doesn't give the actual duration.
-             * The tkhd duration is the actual track duration. */
-            uint64_t mdhd_duration = (2 * largest_pts - second_largest_pts) * p_mp4->i_time_inc;
-            if( mdhd_duration != gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track ) )
-            {
-                uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
-                uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
-                gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
-            }
-
-            /* Write an Edit Box if the first CTS offset is positive.
-             * A media_time is given by not the mvhd timescale but rather the mdhd timescale.
-             * The reason is that an Edit Box maps the presentation time-line to the media time-line.
-             * Any demuxers should follow the Edit Box if it exists. */
-            GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
-            if( sample && sample->CTS_Offset > 0 )
-            {
-                uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
-                uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
-                gf_isom_append_edit_segment( p_mp4->p_file, p_mp4->i_track, tkhd_duration, sample->CTS_Offset, GF_ISOM_EDIT_NORMAL );
-            }
-            gf_isom_sample_del( &sample );
-
-            recompute_bitrate_mp4( p_mp4->p_file, p_mp4->i_track );
-        }
-        gf_isom_set_pl_indication( p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15 );
-        gf_isom_set_storage_mode( p_mp4->p_file, GF_ISOM_STORE_FLAT );
-        gf_isom_close( p_mp4->p_file );
-    }
-
-    free( p_mp4 );
-
-    return 0;
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
-{
-    *p_handle = NULL;
-    FILE *fh = x264_fopen( psz_filename, "w" );
-    if( !fh )
-        return -1;
-    int b_regular = x264_is_regular_file( fh );
-    fclose( fh );
-    FAIL_IF_ERR( !b_regular, "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename );
-
-    mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) );
-    if( !p_mp4 )
-        return -1;
-
-#ifdef _WIN32
-    /* GPAC doesn't support Unicode filenames. */
-    char ansi_filename[MAX_PATH];
-    FAIL_IF_ERR( !x264_ansi_filename( psz_filename, ansi_filename, MAX_PATH, 1 ), "mp4", "invalid ansi filename\n" );
-    p_mp4->p_file = gf_isom_open( ansi_filename, GF_ISOM_OPEN_WRITE, NULL );
-#else
-    p_mp4->p_file = gf_isom_open( psz_filename, GF_ISOM_OPEN_WRITE, NULL );
-#endif
-
-    p_mp4->b_dts_compress = opt->use_dts_compress;
-
-    if( !(p_mp4->p_sample = gf_isom_sample_new()) )
-    {
-        close_file( p_mp4, 0, 0 );
-        return -1;
-    }
-
-    gf_isom_set_brand_info( p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0 );
-
-    *p_handle = p_mp4;
-
-    return 0;
-}
-
-static int set_param( hnd_t handle, x264_param_t *p_param )
-{
-    mp4_hnd_t *p_mp4 = handle;
-
-    p_mp4->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0;
-    p_mp4->i_dts_compress_multiplier = p_mp4->b_dts_compress * p_mp4->i_delay_frames + 1;
-
-    p_mp4->i_time_res = (uint64_t)p_param->i_timebase_den * p_mp4->i_dts_compress_multiplier;
-    p_mp4->i_time_inc = (uint64_t)p_param->i_timebase_num * p_mp4->i_dts_compress_multiplier;
-    FAIL_IF_ERR( p_mp4->i_time_res > UINT32_MAX, "mp4", "MP4 media timescale %"PRIu64" exceeds maximum\n", p_mp4->i_time_res );
-
-    p_mp4->i_track = gf_isom_new_track( p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL,
-                                        p_mp4->i_time_res );
-
-    p_mp4->p_config = gf_odf_avc_cfg_new();
-    gf_isom_avc_config_new( p_mp4->p_file, p_mp4->i_track, p_mp4->p_config,
-                            NULL, NULL, &p_mp4->i_descidx );
-
-    gf_isom_set_track_enabled( p_mp4->p_file, p_mp4->i_track, 1 );
-
-    gf_isom_set_visual_info( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx,
-                             p_param->i_width, p_param->i_height );
-
-    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
-    {
-        uint64_t dw = p_param->i_width << 16;
-        uint64_t dh = p_param->i_height << 16;
-        double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
-        if( sar > 1.0 )
-            dw *= sar;
-        else
-            dh /= sar;
-        gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
-        gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
-    }
-
-    p_mp4->i_data_size = p_param->i_width * p_param->i_height * 3 / 2;
-    p_mp4->p_sample->data = malloc( p_mp4->i_data_size );
-    if( !p_mp4->p_sample->data )
-    {
-        p_mp4->i_data_size = 0;
-        return -1;
-    }
-
-    return 0;
-}
-
-static int check_buffer( mp4_hnd_t *p_mp4, int needed_size )
-{
-    if( needed_size > p_mp4->i_data_size )
-    {
-        void *ptr = realloc( p_mp4->p_sample->data, needed_size );
-        if( !ptr )
-            return -1;
-        p_mp4->p_sample->data = ptr;
-        p_mp4->i_data_size = needed_size;
-    }
-    return 0;
-}
-
-static int write_headers( hnd_t handle, x264_nal_t *p_nal )
-{
-    mp4_hnd_t *p_mp4 = handle;
-    GF_AVCConfigSlot *p_slot;
-
-    int sps_size = p_nal[0].i_payload - 4;
-    int pps_size = p_nal[1].i_payload - 4;
-    int sei_size = p_nal[2].i_payload;
-
-    uint8_t *sps = p_nal[0].p_payload + 4;
-    uint8_t *pps = p_nal[1].p_payload + 4;
-    uint8_t *sei = p_nal[2].p_payload;
-
-    // SPS
-
-    p_mp4->p_config->configurationVersion = 1;
-    p_mp4->p_config->AVCProfileIndication = sps[1];
-    p_mp4->p_config->profile_compatibility = sps[2];
-    p_mp4->p_config->AVCLevelIndication = sps[3];
-    p_slot = malloc( sizeof(GF_AVCConfigSlot) );
-    if( !p_slot )
-        return -1;
-    p_slot->size = sps_size;
-    p_slot->data = malloc( p_slot->size );
-    if( !p_slot->data )
-        return -1;
-    memcpy( p_slot->data, sps, sps_size );
-    gf_list_add( p_mp4->p_config->sequenceParameterSets, p_slot );
-
-    // PPS
-
-    p_slot = malloc( sizeof(GF_AVCConfigSlot) );
-    if( !p_slot )
-        return -1;
-    p_slot->size = pps_size;
-    p_slot->data = malloc( p_slot->size );
-    if( !p_slot->data )
-        return -1;
-    memcpy( p_slot->data, pps, pps_size );
-    gf_list_add( p_mp4->p_config->pictureParameterSets, p_slot );
-    gf_isom_avc_config_update( p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config );
-
-    // SEI
-
-    if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + sei_size ) )
-        return -1;
-    memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, sei, sei_size );
-    p_mp4->p_sample->dataLength += sei_size;
-
-    return sei_size + sps_size + pps_size;
-}
-
-static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
-{
-    mp4_hnd_t *p_mp4 = handle;
-    int64_t dts;
-    int64_t cts;
-
-    if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + i_size ) )
-        return -1;
-    memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size );
-    p_mp4->p_sample->dataLength += i_size;
-
-    if( !p_mp4->i_numframe )
-        p_mp4->i_delay_time = p_picture->i_dts * -1;
-
-    if( p_mp4->b_dts_compress )
-    {
-        if( p_mp4->i_numframe == 1 )
-            p_mp4->i_init_delta = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
-        dts = p_mp4->i_numframe > p_mp4->i_delay_frames
-            ? p_picture->i_dts * p_mp4->i_time_inc
-            : p_mp4->i_numframe * (p_mp4->i_init_delta / p_mp4->i_dts_compress_multiplier);
-        cts = p_picture->i_pts * p_mp4->i_time_inc;
-    }
-    else
-    {
-        dts = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
-        cts = (p_picture->i_pts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
-    }
-
-    p_mp4->p_sample->IsRAP = p_picture->b_keyframe;
-    p_mp4->p_sample->DTS = dts;
-    p_mp4->p_sample->CTS_Offset = (uint32_t)(cts - dts);
-    gf_isom_add_sample( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_mp4->p_sample );
-
-    p_mp4->p_sample->dataLength = 0;
-    p_mp4->i_numframe++;
-
-    return i_size;
-}
-
-const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/android/src/main/libenc/jni/libx264/output/mp4_lsmash.c b/android/src/main/libenc/jni/libx264/output/mp4_lsmash.c
deleted file mode 100755
index 6a10a51..0000000
--- a/android/src/main/libenc/jni/libx264/output/mp4_lsmash.c
+++ /dev/null
@@ -1,429 +0,0 @@
-/*****************************************************************************
- * mp4_lsmash.c: mp4 muxer using L-SMASH
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
- *          Takashi Hirata <silverfilain@gmail.com>
- *          golgol7777 <golgol7777@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-#include <lsmash.h>
-
-#define H264_NALU_LENGTH_SIZE 4
-
-/*******************/
-
-#define MP4_LOG_ERROR( ... )                x264_cli_log( "mp4", X264_LOG_ERROR, __VA_ARGS__ )
-#define MP4_LOG_WARNING( ... )              x264_cli_log( "mp4", X264_LOG_WARNING, __VA_ARGS__ )
-#define MP4_LOG_INFO( ... )                 x264_cli_log( "mp4", X264_LOG_INFO, __VA_ARGS__ )
-#define MP4_FAIL_IF_ERR( cond, ... )        FAIL_IF_ERR( cond, "mp4", __VA_ARGS__ )
-
-/* For close_file() */
-#define MP4_LOG_IF_ERR( cond, ... )\
-do\
-{\
-    if( cond )\
-    {\
-        MP4_LOG_ERROR( __VA_ARGS__ );\
-    }\
-} while( 0 )
-
-/* For open_file() */
-#define MP4_FAIL_IF_ERR_EX( cond, ... )\
-do\
-{\
-    if( cond )\
-    {\
-        remove_mp4_hnd( p_mp4 );\
-        MP4_LOG_ERROR( __VA_ARGS__ );\
-        return -1;\
-    }\
-} while( 0 )
-
-/*******************/
-
-typedef struct
-{
-    lsmash_root_t *p_root;
-    lsmash_video_summary_t *summary;
-    int b_stdout;
-    uint32_t i_movie_timescale;
-    uint32_t i_video_timescale;
-    uint32_t i_track;
-    uint32_t i_sample_entry;
-    uint64_t i_time_inc;
-    int64_t i_start_offset;
-    uint64_t i_first_cts;
-    uint64_t i_prev_dts;
-    uint32_t i_sei_size;
-    uint8_t *p_sei_buffer;
-    int i_numframe;
-    int64_t i_init_delta;
-    int i_delay_frames;
-    int b_dts_compress;
-    int i_dts_compress_multiplier;
-    int b_use_recovery;
-    int b_fragments;
-    lsmash_file_parameters_t file_param;
-} mp4_hnd_t;
-
-/*******************/
-
-static void remove_mp4_hnd( hnd_t handle )
-{
-    mp4_hnd_t *p_mp4 = handle;
-    if( !p_mp4 )
-        return;
-    lsmash_cleanup_summary( (lsmash_summary_t *)p_mp4->summary );
-    lsmash_close_file( &p_mp4->file_param );
-    lsmash_destroy_root( p_mp4->p_root );
-    free( p_mp4->p_sei_buffer );
-    free( p_mp4 );
-}
-
-/*******************/
-
-static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
-{
-    mp4_hnd_t *p_mp4 = handle;
-
-    if( !p_mp4 )
-        return 0;
-
-    if( p_mp4->p_root )
-    {
-        double actual_duration = 0;
-        if( p_mp4->i_track )
-        {
-            /* Flush the rest of samples and add the last sample_delta. */
-            uint32_t last_delta = largest_pts - second_largest_pts;
-            MP4_LOG_IF_ERR( lsmash_flush_pooled_samples( p_mp4->p_root, p_mp4->i_track, (last_delta ? last_delta : 1) * p_mp4->i_time_inc ),
-                            "failed to flush the rest of samples.\n" );
-
-            if( p_mp4->i_movie_timescale != 0 && p_mp4->i_video_timescale != 0 )    /* avoid zero division */
-                actual_duration = ((double)((largest_pts + last_delta) * p_mp4->i_time_inc) / p_mp4->i_video_timescale) * p_mp4->i_movie_timescale;
-            else
-                MP4_LOG_ERROR( "timescale is broken.\n" );
-
-            /*
-             * Declare the explicit time-line mapping.
-             * A segment_duration is given by movie timescale, while a media_time that is the start time of this segment
-             * is given by not the movie timescale but rather the media timescale.
-             * The reason is that ISO media have two time-lines, presentation and media time-line,
-             * and an edit maps the presentation time-line to the media time-line.
-             * According to QuickTime file format specification and the actual playback in QuickTime Player,
-             * if the Edit Box doesn't exist in the track, the ratio of the summation of sample durations and track's duration becomes
-             * the track's media_rate so that the entire media can be used by the track.
-             * So, we add Edit Box here to avoid this implicit media_rate could distort track's presentation timestamps slightly.
-             * Note: Any demuxers should follow the Edit List Box if it exists.
-             */
-            lsmash_edit_t edit;
-            edit.duration   = actual_duration;
-            edit.start_time = p_mp4->i_first_cts;
-            edit.rate       = ISOM_EDIT_MODE_NORMAL;
-            if( !p_mp4->b_fragments )
-            {
-                MP4_LOG_IF_ERR( lsmash_create_explicit_timeline_map( p_mp4->p_root, p_mp4->i_track, edit ),
-                                "failed to set timeline map for video.\n" );
-            }
-            else if( !p_mp4->b_stdout )
-                MP4_LOG_IF_ERR( lsmash_modify_explicit_timeline_map( p_mp4->p_root, p_mp4->i_track, 1, edit ),
-                                "failed to update timeline map for video.\n" );
-        }
-
-        MP4_LOG_IF_ERR( lsmash_finish_movie( p_mp4->p_root, NULL ), "failed to finish movie.\n" );
-    }
-
-    remove_mp4_hnd( p_mp4 ); /* including lsmash_destroy_root( p_mp4->p_root ); */
-
-    return 0;
-}
-
-static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
-{
-    *p_handle = NULL;
-
-    int b_regular = strcmp( psz_filename, "-" );
-    b_regular = b_regular && x264_is_regular_file_path( psz_filename );
-    if( b_regular )
-    {
-        FILE *fh = x264_fopen( psz_filename, "wb" );
-        MP4_FAIL_IF_ERR( !fh, "cannot open output file `%s'.\n", psz_filename );
-        b_regular = x264_is_regular_file( fh );
-        fclose( fh );
-    }
-
-    mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) );
-    MP4_FAIL_IF_ERR( !p_mp4, "failed to allocate memory for muxer information.\n" );
-
-    p_mp4->b_dts_compress = opt->use_dts_compress;
-    p_mp4->b_use_recovery = 0; // we don't really support recovery
-    p_mp4->b_fragments    = !b_regular;
-    p_mp4->b_stdout       = !strcmp( psz_filename, "-" );
-
-    p_mp4->p_root = lsmash_create_root();
-    MP4_FAIL_IF_ERR_EX( !p_mp4->p_root, "failed to create root.\n" );
-
-    MP4_FAIL_IF_ERR_EX( lsmash_open_file( psz_filename, 0, &p_mp4->file_param ) < 0, "failed to open an output file.\n" );
-    if( p_mp4->b_fragments )
-        p_mp4->file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED;
-
-    p_mp4->summary = (lsmash_video_summary_t *)lsmash_create_summary( LSMASH_SUMMARY_TYPE_VIDEO );
-    MP4_FAIL_IF_ERR_EX( !p_mp4->summary,
-                        "failed to allocate memory for summary information of video.\n" );
-    p_mp4->summary->sample_type = ISOM_CODEC_TYPE_AVC1_VIDEO;
-
-    *p_handle = p_mp4;
-
-    return 0;
-}
-
-static int set_param( hnd_t handle, x264_param_t *p_param )
-{
-    mp4_hnd_t *p_mp4 = handle;
-    uint64_t i_media_timescale;
-
-    p_mp4->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0;
-    p_mp4->i_dts_compress_multiplier = p_mp4->b_dts_compress * p_mp4->i_delay_frames + 1;
-
-    i_media_timescale = (uint64_t)p_param->i_timebase_den * p_mp4->i_dts_compress_multiplier;
-    p_mp4->i_time_inc = (uint64_t)p_param->i_timebase_num * p_mp4->i_dts_compress_multiplier;
-    MP4_FAIL_IF_ERR( i_media_timescale > UINT32_MAX, "MP4 media timescale %"PRIu64" exceeds maximum\n", i_media_timescale );
-
-    /* Select brands. */
-    lsmash_brand_type brands[6] = { 0 };
-    uint32_t brand_count = 0;
-    brands[brand_count++] = ISOM_BRAND_TYPE_MP42;
-    brands[brand_count++] = ISOM_BRAND_TYPE_MP41;
-    brands[brand_count++] = ISOM_BRAND_TYPE_ISOM;
-    if( p_mp4->b_use_recovery )
-    {
-        brands[brand_count++] = ISOM_BRAND_TYPE_AVC1;   /* sdtp, sgpd, sbgp and visual roll recovery grouping */
-        if( p_param->b_open_gop )
-            brands[brand_count++] = ISOM_BRAND_TYPE_ISO6;   /* cslg and visual random access grouping */
-    }
-
-    /* Set file */
-    lsmash_file_parameters_t *file_param = &p_mp4->file_param;
-    file_param->major_brand   = brands[0];
-    file_param->brands        = brands;
-    file_param->brand_count   = brand_count;
-    file_param->minor_version = 0;
-    MP4_FAIL_IF_ERR( !lsmash_set_file( p_mp4->p_root, file_param ), "failed to add an output file into a ROOT.\n" );
-
-    /* Set movie parameters. */
-    lsmash_movie_parameters_t movie_param;
-    lsmash_initialize_movie_parameters( &movie_param );
-    MP4_FAIL_IF_ERR( lsmash_set_movie_parameters( p_mp4->p_root, &movie_param ),
-                     "failed to set movie parameters.\n" );
-    p_mp4->i_movie_timescale = lsmash_get_movie_timescale( p_mp4->p_root );
-    MP4_FAIL_IF_ERR( !p_mp4->i_movie_timescale, "movie timescale is broken.\n" );
-
-    /* Create a video track. */
-    p_mp4->i_track = lsmash_create_track( p_mp4->p_root, ISOM_MEDIA_HANDLER_TYPE_VIDEO_TRACK );
-    MP4_FAIL_IF_ERR( !p_mp4->i_track, "failed to create a video track.\n" );
-
-    p_mp4->summary->width = p_param->i_width;
-    p_mp4->summary->height = p_param->i_height;
-    uint32_t i_display_width = p_param->i_width << 16;
-    uint32_t i_display_height = p_param->i_height << 16;
-    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
-    {
-        double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
-        if( sar > 1.0 )
-            i_display_width *= sar;
-        else
-            i_display_height /= sar;
-        p_mp4->summary->par_h = p_param->vui.i_sar_width;
-        p_mp4->summary->par_v = p_param->vui.i_sar_height;
-    }
-    p_mp4->summary->color.primaries_index = p_param->vui.i_colorprim;
-    p_mp4->summary->color.transfer_index  = p_param->vui.i_transfer;
-    p_mp4->summary->color.matrix_index    = p_param->vui.i_colmatrix >= 0 ? p_param->vui.i_colmatrix : ISOM_MATRIX_INDEX_UNSPECIFIED;
-    p_mp4->summary->color.full_range      = p_param->vui.b_fullrange >= 0 ? p_param->vui.b_fullrange : 0;
-
-    /* Set video track parameters. */
-    lsmash_track_parameters_t track_param;
-    lsmash_initialize_track_parameters( &track_param );
-    lsmash_track_mode track_mode = ISOM_TRACK_ENABLED | ISOM_TRACK_IN_MOVIE | ISOM_TRACK_IN_PREVIEW;
-    track_param.mode = track_mode;
-    track_param.display_width = i_display_width;
-    track_param.display_height = i_display_height;
-    MP4_FAIL_IF_ERR( lsmash_set_track_parameters( p_mp4->p_root, p_mp4->i_track, &track_param ),
-                     "failed to set track parameters for video.\n" );
-
-    /* Set video media parameters. */
-    lsmash_media_parameters_t media_param;
-    lsmash_initialize_media_parameters( &media_param );
-    media_param.timescale = i_media_timescale;
-    media_param.media_handler_name = "L-SMASH Video Media Handler";
-    if( p_mp4->b_use_recovery )
-    {
-        media_param.roll_grouping = p_param->b_intra_refresh;
-        media_param.rap_grouping = p_param->b_open_gop;
-    }
-    MP4_FAIL_IF_ERR( lsmash_set_media_parameters( p_mp4->p_root, p_mp4->i_track, &media_param ),
-                     "failed to set media parameters for video.\n" );
-    p_mp4->i_video_timescale = lsmash_get_media_timescale( p_mp4->p_root, p_mp4->i_track );
-    MP4_FAIL_IF_ERR( !p_mp4->i_video_timescale, "media timescale for video is broken.\n" );
-
-    return 0;
-}
-
-static int write_headers( hnd_t handle, x264_nal_t *p_nal )
-{
-    mp4_hnd_t *p_mp4 = handle;
-
-    uint32_t sps_size = p_nal[0].i_payload - H264_NALU_LENGTH_SIZE;
-    uint32_t pps_size = p_nal[1].i_payload - H264_NALU_LENGTH_SIZE;
-    uint32_t sei_size = p_nal[2].i_payload;
-
-    uint8_t *sps = p_nal[0].p_payload + H264_NALU_LENGTH_SIZE;
-    uint8_t *pps = p_nal[1].p_payload + H264_NALU_LENGTH_SIZE;
-    uint8_t *sei = p_nal[2].p_payload;
-
-    lsmash_codec_specific_t *cs = lsmash_create_codec_specific_data( LSMASH_CODEC_SPECIFIC_DATA_TYPE_ISOM_VIDEO_H264,
-                                                                     LSMASH_CODEC_SPECIFIC_FORMAT_STRUCTURED );
-
-    lsmash_h264_specific_parameters_t *param = (lsmash_h264_specific_parameters_t *)cs->data.structured;
-    param->lengthSizeMinusOne = H264_NALU_LENGTH_SIZE - 1;
-
-    /* SPS
-     * The remaining parameters are automatically set by SPS. */
-    if( lsmash_append_h264_parameter_set( param, H264_PARAMETER_SET_TYPE_SPS, sps, sps_size ) )
-    {
-        MP4_LOG_ERROR( "failed to append SPS.\n" );
-        return -1;
-    }
-
-    /* PPS */
-    if( lsmash_append_h264_parameter_set( param, H264_PARAMETER_SET_TYPE_PPS, pps, pps_size ) )
-    {
-        MP4_LOG_ERROR( "failed to append PPS.\n" );
-        return -1;
-    }
-
-    if( lsmash_add_codec_specific_data( (lsmash_summary_t *)p_mp4->summary, cs ) )
-    {
-        MP4_LOG_ERROR( "failed to add H.264 specific info.\n" );
-        return -1;
-    }
-
-    lsmash_destroy_codec_specific_data( cs );
-
-    /* Additional extensions */
-    /* Bitrate info */
-    cs = lsmash_create_codec_specific_data( LSMASH_CODEC_SPECIFIC_DATA_TYPE_ISOM_VIDEO_H264_BITRATE,
-                                            LSMASH_CODEC_SPECIFIC_FORMAT_STRUCTURED );
-    if( cs )
-        lsmash_add_codec_specific_data( (lsmash_summary_t *)p_mp4->summary, cs );
-    lsmash_destroy_codec_specific_data( cs );
-
-    p_mp4->i_sample_entry = lsmash_add_sample_entry( p_mp4->p_root, p_mp4->i_track, p_mp4->summary );
-    MP4_FAIL_IF_ERR( !p_mp4->i_sample_entry,
-                     "failed to add sample entry for video.\n" );
-
-    /* SEI */
-    p_mp4->p_sei_buffer = malloc( sei_size );
-    MP4_FAIL_IF_ERR( !p_mp4->p_sei_buffer,
-                     "failed to allocate sei transition buffer.\n" );
-    memcpy( p_mp4->p_sei_buffer, sei, sei_size );
-    p_mp4->i_sei_size = sei_size;
-
-    return sei_size + sps_size + pps_size;
-}
-
-static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
-{
-    mp4_hnd_t *p_mp4 = handle;
-    uint64_t dts, cts;
-
-    if( !p_mp4->i_numframe )
-    {
-        p_mp4->i_start_offset = p_picture->i_dts * -1;
-        p_mp4->i_first_cts = p_mp4->b_dts_compress ? 0 : p_mp4->i_start_offset * p_mp4->i_time_inc;
-        if( p_mp4->b_fragments )
-        {
-            lsmash_edit_t edit;
-            edit.duration   = ISOM_EDIT_DURATION_UNKNOWN32;     /* QuickTime doesn't support 64bit duration. */
-            edit.start_time = p_mp4->i_first_cts;
-            edit.rate       = ISOM_EDIT_MODE_NORMAL;
-            MP4_LOG_IF_ERR( lsmash_create_explicit_timeline_map( p_mp4->p_root, p_mp4->i_track, edit ),
-                            "failed to set timeline map for video.\n" );
-        }
-    }
-
-    lsmash_sample_t *p_sample = lsmash_create_sample( i_size + p_mp4->i_sei_size );
-    MP4_FAIL_IF_ERR( !p_sample,
-                     "failed to create a video sample data.\n" );
-
-    if( p_mp4->p_sei_buffer )
-    {
-        memcpy( p_sample->data, p_mp4->p_sei_buffer, p_mp4->i_sei_size );
-        free( p_mp4->p_sei_buffer );
-        p_mp4->p_sei_buffer = NULL;
-    }
-
-    memcpy( p_sample->data + p_mp4->i_sei_size, p_nalu, i_size );
-    p_mp4->i_sei_size = 0;
-
-    if( p_mp4->b_dts_compress )
-    {
-        if( p_mp4->i_numframe == 1 )
-            p_mp4->i_init_delta = (p_picture->i_dts + p_mp4->i_start_offset) * p_mp4->i_time_inc;
-        dts = p_mp4->i_numframe > p_mp4->i_delay_frames
-            ? p_picture->i_dts * p_mp4->i_time_inc
-            : p_mp4->i_numframe * (p_mp4->i_init_delta / p_mp4->i_dts_compress_multiplier);
-        cts = p_picture->i_pts * p_mp4->i_time_inc;
-    }
-    else
-    {
-        dts = (p_picture->i_dts + p_mp4->i_start_offset) * p_mp4->i_time_inc;
-        cts = (p_picture->i_pts + p_mp4->i_start_offset) * p_mp4->i_time_inc;
-    }
-
-    p_sample->dts = dts;
-    p_sample->cts = cts;
-    p_sample->index = p_mp4->i_sample_entry;
-    p_sample->prop.ra_flags = p_picture->b_keyframe ? ISOM_SAMPLE_RANDOM_ACCESS_FLAG_SYNC : ISOM_SAMPLE_RANDOM_ACCESS_FLAG_NONE;
-
-    if( p_mp4->b_fragments && p_mp4->i_numframe && p_sample->prop.ra_flags != ISOM_SAMPLE_RANDOM_ACCESS_FLAG_NONE )
-    {
-        MP4_FAIL_IF_ERR( lsmash_flush_pooled_samples( p_mp4->p_root, p_mp4->i_track, p_sample->dts - p_mp4->i_prev_dts ),
-                         "failed to flush the rest of samples.\n" );
-        MP4_FAIL_IF_ERR( lsmash_create_fragment_movie( p_mp4->p_root ),
-                         "failed to create a movie fragment.\n" );
-    }
-
-    /* Append data per sample. */
-    MP4_FAIL_IF_ERR( lsmash_append_sample( p_mp4->p_root, p_mp4->i_track, p_sample ),
-                     "failed to append a video frame.\n" );
-
-    p_mp4->i_prev_dts = dts;
-    p_mp4->i_numframe++;
-
-    return i_size;
-}
-
-const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/android/src/main/libenc/jni/libx264/output/output.h b/android/src/main/libenc/jni/libx264/output/output.h
deleted file mode 100755
index 2230241..0000000
--- a/android/src/main/libenc/jni/libx264/output/output.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*****************************************************************************
- * output.h: x264 file output modules
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_OUTPUT_H
-#define X264_OUTPUT_H
-
-#include "x264cli.h"
-
-typedef struct
-{
-    int use_dts_compress;
-} cli_output_opt_t;
-
-typedef struct
-{
-    int (*open_file)( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt );
-    int (*set_param)( hnd_t handle, x264_param_t *p_param );
-    int (*write_headers)( hnd_t handle, x264_nal_t *p_nal );
-    int (*write_frame)( hnd_t handle, uint8_t *p_nal, int i_size, x264_picture_t *p_picture );
-    int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
-} cli_output_t;
-
-extern const cli_output_t raw_output;
-extern const cli_output_t mkv_output;
-extern const cli_output_t mp4_output;
-extern const cli_output_t flv_output;
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/output/raw.c b/android/src/main/libenc/jni/libx264/output/raw.c
deleted file mode 100755
index a6f37d4..0000000
--- a/android/src/main/libenc/jni/libx264/output/raw.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*****************************************************************************
- * raw.c: raw muxer
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "output.h"
-
-static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt )
-{
-    if( !strcmp( psz_filename, "-" ) )
-        *p_handle = stdout;
-    else if( !(*p_handle = x264_fopen( psz_filename, "w+b" )) )
-        return -1;
-
-    return 0;
-}
-
-static int set_param( hnd_t handle, x264_param_t *p_param )
-{
-    return 0;
-}
-
-static int write_headers( hnd_t handle, x264_nal_t *p_nal )
-{
-    int size = p_nal[0].i_payload + p_nal[1].i_payload + p_nal[2].i_payload;
-
-    if( fwrite( p_nal[0].p_payload, size, 1, (FILE*)handle ) )
-        return size;
-    return -1;
-}
-
-static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
-{
-    if( fwrite( p_nalu, i_size, 1, (FILE*)handle ) )
-        return i_size;
-    return -1;
-}
-
-static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
-{
-    if( !handle || handle == stdout )
-        return 0;
-
-    return fclose( (FILE*)handle );
-}
-
-const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
-
diff --git a/android/src/main/libenc/jni/libx264/tools/checkasm-a.asm b/android/src/main/libenc/jni/libx264/tools/checkasm-a.asm
deleted file mode 100755
index 1f3cf04..0000000
--- a/android/src/main/libenc/jni/libx264/tools/checkasm-a.asm
+++ /dev/null
@@ -1,223 +0,0 @@
-;*****************************************************************************
-;* checkasm-a.asm: assembly check tool
-;*****************************************************************************
-;* Copyright (C) 2008-2016 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Henrik Gramner <henrik@gramner.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing@x264.com.
-;*****************************************************************************
-
-%include "x86inc.asm"
-
-SECTION_RODATA
-
-error_message: db "failed to preserve register", 0
-
-%if ARCH_X86_64
-; just random numbers to reduce the chance of incidental match
-ALIGN 16
-x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
-x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
-x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
-x9:  dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
-x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
-x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
-x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
-x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
-x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
-x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
-n7:  dq 0x21f86d66c8ca00ce
-n8:  dq 0x75b6ba21077c48ad
-n9:  dq 0xed56bb2dcb3c7736
-n10: dq 0x8bda43d3fd1a7e06
-n11: dq 0xb64a9c9e5d318408
-n12: dq 0xdf9a54b303f1d3a3
-n13: dq 0x4a75479abd64e097
-n14: dq 0x249214109d5d1c88
-%endif
-
-SECTION .text
-
-cextern_naked puts
-
-; max number of args used by any x264 asm function.
-; (max_args % 4) must equal 3 for stack alignment
-%define max_args 15
-
-%if ARCH_X86_64
-
-;-----------------------------------------------------------------------------
-; void x264_checkasm_stack_clobber( uint64_t clobber, ... )
-;-----------------------------------------------------------------------------
-cglobal checkasm_stack_clobber, 1,2
-    ; Clobber the stack with junk below the stack pointer
-    %define argsize (max_args+6)*8
-    SUB  rsp, argsize
-    mov   r1, argsize-8
-.loop:
-    mov [rsp+r1], r0
-    sub   r1, 8
-    jge .loop
-    ADD  rsp, argsize
-    RET
-
-%if WIN64
-    %assign free_regs 7
-%else
-    %assign free_regs 9
-%endif
-
-;-----------------------------------------------------------------------------
-; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
-;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal checkasm_call, 2,15,16,max_args*8+8
-    mov  r6, r0
-    mov  [rsp+max_args*8], r1
-
-    ; All arguments have been pushed on the stack instead of registers in order to
-    ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
-    mov  r0, r6mp
-    mov  r1, r7mp
-    mov  r2, r8mp
-    mov  r3, r9mp
-%if UNIX64
-    mov  r4, r10mp
-    mov  r5, r11mp
-    %assign i 6
-    %rep max_args-6
-        mov  r9, [rsp+stack_offset+(i+1)*8]
-        mov  [rsp+(i-6)*8], r9
-        %assign i i+1
-    %endrep
-%else
-    %assign i 4
-    %rep max_args-4
-        mov  r9, [rsp+stack_offset+(i+7)*8]
-        mov  [rsp+i*8], r9
-        %assign i i+1
-    %endrep
-%endif
-
-%if WIN64
-    %assign i 6
-    %rep 16-6
-        mova m %+ i, [x %+ i]
-        %assign i i+1
-    %endrep
-%endif
-
-%assign i 14
-%rep 15-free_regs
-    mov  r %+ i, [n %+ i]
-    %assign i i-1
-%endrep
-    call r6
-%assign i 14
-%rep 15-free_regs
-    xor  r %+ i, [n %+ i]
-    or  r14, r %+ i
-    %assign i i-1
-%endrep
-
-%if WIN64
-    %assign i 6
-    %rep 16-6
-        pxor m %+ i, [x %+ i]
-        por  m6, m %+ i
-        %assign i i+1
-    %endrep
-    packsswb m6, m6
-    movq r5, m6
-    or  r14, r5
-%endif
-
-    jz .ok
-    mov  r9, rax
-    mov r10, rdx
-    lea  r0, [error_message]
-    call puts
-    mov  r1, [rsp+max_args*8]
-    mov  dword [r1], 0
-    mov  rdx, r10
-    mov  rax, r9
-.ok:
-    RET
-
-%else
-
-; just random numbers to reduce the chance of incidental match
-%define n3 dword 0x6549315c
-%define n4 dword 0xe02f3e23
-%define n5 dword 0xb78d0d1d
-%define n6 dword 0x33627ba7
-
-;-----------------------------------------------------------------------------
-; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
-;-----------------------------------------------------------------------------
-cglobal checkasm_call, 1,7
-    mov  r3, n3
-    mov  r4, n4
-    mov  r5, n5
-    mov  r6, n6
-%rep max_args
-    push dword [esp+24+max_args*4]
-%endrep
-    call r0
-    add  esp, max_args*4
-    xor  r3, n3
-    xor  r4, n4
-    xor  r5, n5
-    xor  r6, n6
-    or   r3, r4
-    or   r5, r6
-    or   r3, r5
-    jz .ok
-    mov  r3, eax
-    mov  r4, edx
-    lea  r1, [error_message]
-    push r1
-    call puts
-    add  esp, 4
-    mov  r1, r1m
-    mov  dword [r1], 0
-    mov  edx, r4
-    mov  eax, r3
-.ok:
-    REP_RET
-
-%endif ; ARCH_X86_64
-
-;-----------------------------------------------------------------------------
-; int x264_stack_pagealign( int (*func)(), int align )
-;-----------------------------------------------------------------------------
-cglobal stack_pagealign, 2,2
-    movsxdifnidn r1, r1d
-    push rbp
-    mov  rbp, rsp
-%if WIN64
-    sub  rsp, 32 ; shadow space
-%endif
-    and  rsp, ~0xfff
-    sub  rsp, r1
-    call r0
-    leave
-    RET
-
diff --git a/android/src/main/libenc/jni/libx264/tools/checkasm-aarch64.S b/android/src/main/libenc/jni/libx264/tools/checkasm-aarch64.S
deleted file mode 100755
index f96576b..0000000
--- a/android/src/main/libenc/jni/libx264/tools/checkasm-aarch64.S
+++ /dev/null
@@ -1,156 +0,0 @@
-/****************************************************************************
- * checkasm-aarch64.S: assembly check tool
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Martin Storsjo <martin@martin.st>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "../common/aarch64/asm.S"
-
-.section .rodata
-.align 4
-register_init:
-.quad 0x21f86d66c8ca00ce
-.quad 0x75b6ba21077c48ad
-.quad 0xed56bb2dcb3c7736
-.quad 0x8bda43d3fd1a7e06
-.quad 0xb64a9c9e5d318408
-.quad 0xdf9a54b303f1d3a3
-.quad 0x4a75479abd64e097
-.quad 0x249214109d5d1c88
-.quad 0x1a1b2550a612b48c
-.quad 0x79445c159ce79064
-.quad 0x2eed899d5a28ddcd
-.quad 0x86b2536fcd8cf636
-.quad 0xb0856806085e7943
-.quad 0x3f2bf84fc0fcca4e
-.quad 0xacbd382dcf5b8de2
-.quad 0xd229e1f5b281303f
-.quad 0x71aeaff20b095fd9
-.quad 0xab63e2e11fa38ed9
-
-
-error_message:
-.asciz "failed to preserve register"
-
-.text
-
-// max number of args used by any x264 asm function.
-#define MAX_ARGS 15
-
-#define ARG_STACK ((8*(MAX_ARGS - 6) + 15) & ~15)
-
-function x264_checkasm_call, export=1
-    stp         x29, x30, [sp, #-16]!
-    mov         x29, sp
-    stp         x19, x20, [sp, #-16]!
-    stp         x21, x22, [sp, #-16]!
-    stp         x23, x24, [sp, #-16]!
-    stp         x25, x26, [sp, #-16]!
-    stp         x27, x28, [sp, #-16]!
-    stp         d8,  d9,  [sp, #-16]!
-    stp         d10, d11, [sp, #-16]!
-    stp         d12, d13, [sp, #-16]!
-    stp         d14, d15, [sp, #-16]!
-
-    movrel      x9, register_init
-    ldp         d8,  d9,  [x9], #16
-    ldp         d10, d11, [x9], #16
-    ldp         d12, d13, [x9], #16
-    ldp         d14, d15, [x9], #16
-    ldp         x19, x20, [x9], #16
-    ldp         x21, x22, [x9], #16
-    ldp         x23, x24, [x9], #16
-    ldp         x25, x26, [x9], #16
-    ldp         x27, x28, [x9], #16
-
-    str         x1,  [sp, #-16]!
-
-    sub         sp,  sp,  #ARG_STACK
-.equ pos, 0
-// first two stacked args are copied to x6, x7
-.rept MAX_ARGS-6
-    ldr         x9, [x29, #16 + 16 + pos]
-    str         x9, [sp, #pos]
-.equ pos, pos + 8
-.endr
-
-    mov         x12, x0
-    mov         x0,  x2
-    mov         x1,  x3
-    mov         x2,  x4
-    mov         x3,  x5
-    mov         x4,  x6
-    mov         x5,  x7
-    ldp         x6,  x7,  [x29, #16]
-    blr         x12
-    add         sp,  sp,  #ARG_STACK
-    ldr         x2,  [sp]
-    stp         x0,  x1, [sp]
-    movrel      x9, register_init
-    movi        v3.8h,  #0
-
-.macro check_reg_neon reg1, reg2
-    ldr         q0,  [x9], #16
-    uzp1        v1.2d,  v\reg1\().2d, v\reg2\().2d
-    eor         v0.16b, v0.16b, v1.16b
-    orr         v3.16b, v3.16b, v0.16b
-.endm
-    check_reg_neon  8,  9
-    check_reg_neon  10, 11
-    check_reg_neon  12, 13
-    check_reg_neon  14, 15
-    uqxtn       v3.8b,  v3.8h
-    umov        x3,  v3.d[0]
-
-.macro check_reg reg1, reg2
-    ldp         x0,  x1,  [x9], #16
-    eor         x0,  x0,  \reg1
-    eor         x1,  x1,  \reg2
-    orr         x3,  x3,  x0
-    orr         x3,  x3,  x1
-.endm
-    check_reg   x19, x20
-    check_reg   x21, x22
-    check_reg   x23, x24
-    check_reg   x25, x26
-    check_reg   x27, x28
-
-    cbz         x3,  0f
-
-    mov         w9,  #0
-    str         w9,  [x2]
-    movrel      x0, error_message
-    bl          puts
-0:
-    ldp         x0,  x1,  [sp], #16
-    ldp         d14, d15, [sp], #16
-    ldp         d12, d13, [sp], #16
-    ldp         d10, d11, [sp], #16
-    ldp         d8,  d9,  [sp], #16
-    ldp         x27, x28, [sp], #16
-    ldp         x25, x26, [sp], #16
-    ldp         x23, x24, [sp], #16
-    ldp         x21, x22, [sp], #16
-    ldp         x19, x20, [sp], #16
-    ldp         x29, x30, [sp], #16
-    ret
-endfunc
diff --git a/android/src/main/libenc/jni/libx264/tools/checkasm-arm.S b/android/src/main/libenc/jni/libx264/tools/checkasm-arm.S
deleted file mode 100755
index 57b4079..0000000
--- a/android/src/main/libenc/jni/libx264/tools/checkasm-arm.S
+++ /dev/null
@@ -1,139 +0,0 @@
-/****************************************************************************
- * checkasm-arm.S: assembly check tool
- *****************************************************************************
- * Copyright (C) 2015-2016 x264 project
- *
- * Authors: Martin Storsjo <martin@martin.st>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "../common/arm/asm.S"
-
-.section .rodata
-.align 4
-register_init:
-.quad 0x21f86d66c8ca00ce
-.quad 0x75b6ba21077c48ad
-.quad 0xed56bb2dcb3c7736
-.quad 0x8bda43d3fd1a7e06
-.quad 0xb64a9c9e5d318408
-.quad 0xdf9a54b303f1d3a3
-.quad 0x4a75479abd64e097
-.quad 0x249214109d5d1c88
-
-error_message:
-.asciz "failed to preserve register"
-
-.text
-
-@ max number of args used by any x264 asm function.
-#define MAX_ARGS 15
-
-#define ARG_STACK 4*(MAX_ARGS - 2)
-
-.macro clobbercheck variant
-.equ pushed, 4*10
-function x264_checkasm_call_\variant
-    push        {r4-r11, lr}
-.ifc \variant, neon
-    vpush       {q4-q7}
-.equ pushed, pushed + 16*4
-.endif
-
-    movrel      r12, register_init
-.ifc \variant, neon
-    vldm        r12, {q4-q7}
-.endif
-    ldm         r12, {r4-r11}
-
-    push        {r1}
-
-    sub         sp,  sp,  #ARG_STACK
-.equ pos, 0
-.rept MAX_ARGS-2
-    ldr         r12, [sp, #ARG_STACK + pushed + 8 + pos]
-    str         r12, [sp, #pos]
-.equ pos, pos + 4
-.endr
-
-    mov         r12, r0
-    mov         r0,  r2
-    mov         r1,  r3
-    ldrd        r2,  r3,  [sp, #ARG_STACK + pushed]
-    blx         r12
-    add         sp,  sp,  #ARG_STACK
-    pop         {r2}
-
-    push        {r0, r1}
-    movrel      r12, register_init
-.ifc \variant, neon
-    vldm        r12, {q0-q3}
-    veor        q0,  q0,  q4
-    veor        q1,  q1,  q5
-    veor        q2,  q2,  q6
-    veor        q3,  q3,  q7
-    vorr        q0,  q0,  q1
-    vorr        q0,  q0,  q2
-    vorr        q0,  q0,  q3
-    vorr        d0,  d0,  d1
-    vrev64.32   d1,  d0
-    vorr        d0,  d0,  d1
-    vmov.32     r3,  d0[0]
-.else
-    mov         r3,  #0
-.endif
-
-.macro check_reg reg1, reg2=
-    ldrd        r0,  r1,  [r12], #8
-    eor         r0,  r0, \reg1
-    orr         r3,  r3, r0
-.ifnb \reg2
-    eor         r1,  r1, \reg2
-    orr         r3,  r3, r1
-.endif
-.endm
-    check_reg   r4,  r5
-    check_reg   r6,  r7
-@ r9 is a volatile register in the ios ABI
-#if SYS_MACOSX
-    check_reg   r8
-#else
-    check_reg   r8,  r9
-#endif
-    check_reg   r10, r11
-.purgem check_reg
-
-    cmp         r3,  #0
-    beq         0f
-
-    mov         r12, #0
-    str         r12, [r2]
-    movrel      r0, error_message
-    blx         X(puts)
-0:
-    pop         {r0, r1}
-.ifc \variant, neon
-    vpop        {q4-q7}
-.endif
-    pop         {r4-r11, pc}
-endfunc
-.endm
-
-clobbercheck neon
-clobbercheck noneon
diff --git a/android/src/main/libenc/jni/libx264/tools/checkasm.c b/android/src/main/libenc/jni/libx264/tools/checkasm.c
deleted file mode 100755
index 8785cc8..0000000
--- a/android/src/main/libenc/jni/libx264/tools/checkasm.c
+++ /dev/null
@@ -1,2955 +0,0 @@
-/*****************************************************************************
- * checkasm.c: assembly check tool
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include <ctype.h>
-#include "common/common.h"
-#include "common/cpu.h"
-
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-// GCC doesn't align stack variables on ARM, so use .bss
-#if ARCH_ARM
-#undef ALIGNED_16
-#define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
-#endif
-
-/* buf1, buf2: initialised to random data and shouldn't write into them */
-uint8_t *buf1, *buf2;
-/* buf3, buf4: used to store output */
-uint8_t *buf3, *buf4;
-/* pbuf1, pbuf2: initialised to random pixel data and shouldn't write into them. */
-pixel *pbuf1, *pbuf2;
-/* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */
-pixel *pbuf3, *pbuf4;
-
-int quiet = 0;
-
-#define report( name ) { \
-    if( used_asm && !quiet ) \
-        fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
-    if( !ok ) ret = -1; \
-}
-
-#define BENCH_RUNS 100  // tradeoff between accuracy and speed
-#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
-#define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
-#define MAX_CPUS 30     // number of different combinations of cpu flags
-
-typedef struct
-{
-    void *pointer; // just for detecting duplicates
-    uint32_t cpu;
-    uint64_t cycles;
-    uint32_t den;
-} bench_t;
-
-typedef struct
-{
-    char *name;
-    bench_t vers[MAX_CPUS];
-} bench_func_t;
-
-int do_bench = 0;
-int bench_pattern_len = 0;
-const char *bench_pattern = "";
-char func_name[100];
-static bench_func_t benchs[MAX_FUNCS];
-
-static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" };
-static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
-static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
-static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
-static const char **intra_predict_8x8_names = intra_predict_4x4_names;
-static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;
-
-#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
-
-static inline uint32_t read_time(void)
-{
-    uint32_t a = 0;
-#if HAVE_X86_INLINE_ASM
-    asm volatile( "lfence \n"
-                  "rdtsc  \n"
-                  : "=a"(a) :: "edx", "memory" );
-#elif ARCH_PPC
-    asm volatile( "mftb %0" : "=r"(a) :: "memory" );
-#elif ARCH_ARM     // ARMv7 only
-    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
-#elif ARCH_AARCH64
-    uint64_t b = 0;
-    asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" );
-    a = b;
-#elif ARCH_MIPS
-    asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" );
-#endif
-    return a;
-}
-
-static bench_t* get_bench( const char *name, int cpu )
-{
-    int i, j;
-    for( i = 0; benchs[i].name && strcmp(name, benchs[i].name); i++ )
-        assert( i < MAX_FUNCS );
-    if( !benchs[i].name )
-        benchs[i].name = strdup( name );
-    if( !cpu )
-        return &benchs[i].vers[0];
-    for( j = 1; benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ )
-        assert( j < MAX_CPUS );
-    benchs[i].vers[j].cpu = cpu;
-    return &benchs[i].vers[j];
-}
-
-static int cmp_nop( const void *a, const void *b )
-{
-    return *(uint16_t*)a - *(uint16_t*)b;
-}
-
-static int cmp_bench( const void *a, const void *b )
-{
-    // asciibetical sort except preserving numbers
-    const char *sa = ((bench_func_t*)a)->name;
-    const char *sb = ((bench_func_t*)b)->name;
-    for( ;; sa++, sb++ )
-    {
-        if( !*sa && !*sb )
-            return 0;
-        if( isdigit( *sa ) && isdigit( *sb ) && isdigit( sa[1] ) != isdigit( sb[1] ) )
-            return isdigit( sa[1] ) - isdigit( sb[1] );
-        if( *sa != *sb )
-            return *sa - *sb;
-    }
-}
-
-static void print_bench(void)
-{
-    uint16_t nops[10000];
-    int nfuncs, nop_time=0;
-
-    for( int i = 0; i < 10000; i++ )
-    {
-        uint32_t t = read_time();
-        nops[i] = read_time() - t;
-    }
-    qsort( nops, 10000, sizeof(uint16_t), cmp_nop );
-    for( int i = 500; i < 9500; i++ )
-        nop_time += nops[i];
-    nop_time /= 900;
-    printf( "nop: %d\n", nop_time );
-
-    for( nfuncs = 0; nfuncs < MAX_FUNCS && benchs[nfuncs].name; nfuncs++ );
-    qsort( benchs, nfuncs, sizeof(bench_func_t), cmp_bench );
-    for( int i = 0; i < nfuncs; i++ )
-        for( int j = 0; j < MAX_CPUS && (!j || benchs[i].vers[j].cpu); j++ )
-        {
-            int k;
-            bench_t *b = &benchs[i].vers[j];
-            if( !b->den )
-                continue;
-            for( k = 0; k < j && benchs[i].vers[k].pointer != b->pointer; k++ );
-            if( k < j )
-                continue;
-            printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
-#if HAVE_MMX
-                    b->cpu&X264_CPU_AVX2 ? "avx2" :
-                    b->cpu&X264_CPU_FMA3 ? "fma3" :
-                    b->cpu&X264_CPU_FMA4 ? "fma4" :
-                    b->cpu&X264_CPU_XOP ? "xop" :
-                    b->cpu&X264_CPU_AVX ? "avx" :
-                    b->cpu&X264_CPU_SSE42 ? "sse42" :
-                    b->cpu&X264_CPU_SSE4 ? "sse4" :
-                    b->cpu&X264_CPU_SSSE3 ? "ssse3" :
-                    b->cpu&X264_CPU_SSE3 ? "sse3" :
-                    /* print sse2slow only if there's also a sse2fast version of the same func */
-                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
-                    b->cpu&X264_CPU_SSE2 ? "sse2" :
-                    b->cpu&X264_CPU_SSE ? "sse" :
-                    b->cpu&X264_CPU_MMX ? "mmx" :
-#elif ARCH_PPC
-                    b->cpu&X264_CPU_ALTIVEC ? "altivec" :
-#elif ARCH_ARM
-                    b->cpu&X264_CPU_NEON ? "neon" :
-                    b->cpu&X264_CPU_ARMV6 ? "armv6" :
-#elif ARCH_AARCH64
-                    b->cpu&X264_CPU_NEON ? "neon" :
-                    b->cpu&X264_CPU_ARMV8 ? "armv8" :
-#elif ARCH_MIPS
-                    b->cpu&X264_CPU_MSA ? "msa" :
-#endif
-                    "c",
-#if HAVE_MMX
-                    b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
-                    b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
-                    b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
-                    b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
-                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
-                    b->cpu&X264_CPU_BMI2 ? "_bmi2" :
-                    b->cpu&X264_CPU_BMI1 ? "_bmi1" :
-                    b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
-                    b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
-#elif ARCH_ARM
-                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
-#endif
-                    "",
-                    (int64_t)(10*b->cycles/b->den - nop_time)/4 );
-        }
-}
-
-#if ARCH_X86 || ARCH_X86_64
-int x264_stack_pagealign( int (*func)(), int align );
-
-/* detect when callee-saved regs aren't saved
- * needs an explicit asm check because it only sometimes crashes in normal use. */
-intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
-#else
-#define x264_stack_pagealign( func, align ) func()
-#endif
-
-#if ARCH_AARCH64
-intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
-#endif
-
-#if ARCH_ARM
-intptr_t x264_checkasm_call_neon( intptr_t (*func)(), int *ok, ... );
-intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... );
-intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon;
-#endif
-
-#define call_c1(func,...) func(__VA_ARGS__)
-
-#if ARCH_X86_64
-/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
- * This is done by clobbering the stack with junk around the stack pointer and calling the
- * assembly function through x264_checkasm_call with added dummy arguments which forces all
- * real arguments to be passed on the stack and not in registers. For 32-bit argument the
- * upper half of the 64-bit register location on the stack will now contain junk. Note that
- * this is dependant on compiler behaviour and that interrupts etc. at the wrong time may
- * overwrite the junk written to the stack so there's no guarantee that it will always
- * detect all functions that assumes zero-extension.
- */
-void x264_checkasm_stack_clobber( uint64_t clobber, ... );
-#define call_a1(func,...) ({ \
-    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
-    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
-    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
-#elif ARCH_X86 || (ARCH_AARCH64 && !defined(__APPLE__)) || ARCH_ARM
-#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
-#else
-#define call_a1 call_c1
-#endif
-
-#if ARCH_ARM
-#define call_a1_64(func,...) ((uint64_t (*)(intptr_t(*)(), int*, ...))x264_checkasm_call)( (intptr_t(*)())func, &ok, __VA_ARGS__ )
-#else
-#define call_a1_64 call_a1
-#endif
-
-#define call_bench(func,cpu,...)\
-    if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\
-    {\
-        uint64_t tsum = 0;\
-        int tcount = 0;\
-        call_a1(func, __VA_ARGS__);\
-        for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
-        {\
-            uint32_t t = read_time();\
-            func(__VA_ARGS__);\
-            func(__VA_ARGS__);\
-            func(__VA_ARGS__);\
-            func(__VA_ARGS__);\
-            t = read_time() - t;\
-            if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\
-            {\
-                tsum += t;\
-                tcount++;\
-            }\
-        }\
-        bench_t *b = get_bench( func_name, cpu );\
-        b->cycles += tsum;\
-        b->den += tcount;\
-        b->pointer = func;\
-    }
-
-/* for most functions, run benchmark and correctness test at the same time.
- * for those that modify their inputs, run the above macros separately */
-#define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); })
-#define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
-#define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
-#define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })
-#define call_a64(func,...) ({ call_a2(func,__VA_ARGS__); call_a1_64(func,__VA_ARGS__); })
-
-
-static int check_pixel( int cpu_ref, int cpu_new )
-{
-    x264_pixel_function_t pixel_c;
-    x264_pixel_function_t pixel_ref;
-    x264_pixel_function_t pixel_asm;
-    x264_predict_t predict_4x4[12];
-    x264_predict8x8_t predict_8x8[12];
-    x264_predict_8x8_filter_t predict_8x8_filter;
-    ALIGNED_16( pixel edge[36] );
-    uint16_t cost_mv[32];
-    int ret = 0, ok, used_asm;
-
-    x264_pixel_init( 0, &pixel_c );
-    x264_pixel_init( cpu_ref, &pixel_ref );
-    x264_pixel_init( cpu_new, &pixel_asm );
-    x264_predict_4x4_init( 0, predict_4x4 );
-    x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
-    predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-
-    // maximize sum
-    for( int i = 0; i < 256; i++ )
-    {
-        int z = i|(i>>4);
-        z ^= z>>2;
-        z ^= z>>1;
-        pbuf4[i] = -(z&1) & PIXEL_MAX;
-        pbuf3[i] = ~pbuf4[i] & PIXEL_MAX;
-    }
-    // random pattern made of maxed pixel differences, in case an intermediate value overflows
-    for( int i = 256; i < 0x1000; i++ )
-    {
-        pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX;
-        pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX;
-    }
-
-#define TEST_PIXEL( name, align ) \
-    ok = 1, used_asm = 0; \
-    for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \
-    { \
-        int res_c, res_asm; \
-        if( pixel_asm.name[i] != pixel_ref.name[i] ) \
-        { \
-            set_func_name( "%s_%s", #name, pixel_names[i] ); \
-            used_asm = 1; \
-            for( int j = 0; j < 64; j++ ) \
-            { \
-                res_c   = call_c( pixel_c.name[i],   pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
-                res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
-                if( res_c != res_asm ) \
-                { \
-                    ok = 0; \
-                    fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
-                    break; \
-                } \
-            } \
-            for( int j = 0; j < 0x1000 && ok; j += 256 ) \
-            { \
-                res_c   = pixel_c  .name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
-                res_asm = pixel_asm.name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
-                if( res_c != res_asm ) \
-                { \
-                    ok = 0; \
-                    fprintf( stderr, #name "[%d]: overflow %d != %d\n", i, res_c, res_asm ); \
-                } \
-            } \
-        } \
-    } \
-    report( "pixel " #name " :" );
-
-    TEST_PIXEL( sad, 0 );
-    TEST_PIXEL( sad_aligned, 1 );
-    TEST_PIXEL( ssd, 1 );
-    TEST_PIXEL( satd, 0 );
-    TEST_PIXEL( sa8d, 1 );
-
-    ok = 1, used_asm = 0;
-    if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] )
-    {
-        set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] );
-        used_asm = 1;
-        for( int j = 0; j < 64; j++ )
-        {
-            uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
-            uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
-            uint64_t res_a = call_a64( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
-            uint32_t cost8_a = res_a;
-            uint32_t cost4_a = res_a >> 32;
-            if( cost8_a != cost8_c || cost4_a != cost4_c )
-            {
-                ok = 0;
-                fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
-                         cost8_c, cost4_c, cost8_a, cost4_a );
-                break;
-            }
-        }
-        for( int j = 0; j < 0x1000 && ok; j += 256 ) \
-        {
-            uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
-            uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
-            uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
-            uint32_t cost8_a = res_a;
-            uint32_t cost4_a = res_a >> 32;
-            if( cost8_a != cost8_c || cost4_a != cost4_c )
-            {
-                ok = 0;
-                fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
-                         cost8_c, cost4_c, cost8_a, cost4_a );
-            }
-        }
-    }
-    report( "pixel sa8d_satd :" );
-
-#define TEST_PIXEL_X( N ) \
-    ok = 1; used_asm = 0; \
-    for( int i = 0; i < 7; i++ ) \
-    { \
-        ALIGNED_16( int res_c[4] ) = {0}; \
-        ALIGNED_16( int res_asm[4] ) = {0}; \
-        if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
-        { \
-            set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
-            used_asm = 1; \
-            for( int j = 0; j < 64; j++ ) \
-            { \
-                pixel *pix2 = pbuf2+j; \
-                res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2,   64 ); \
-                res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
-                res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
-                if( N == 4 ) \
-                { \
-                    res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
-                    call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
-                } \
-                else \
-                    call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
-                if( memcmp(res_c, res_asm, N*sizeof(int)) ) \
-                { \
-                    ok = 0; \
-                    fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
-                             i, res_c[0], res_c[1], res_c[2], res_c[3], \
-                             res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
-                } \
-                if( N == 4 ) \
-                    call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
-                else \
-                    call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
-            } \
-        } \
-    } \
-    report( "pixel sad_x"#N" :" );
-
-    TEST_PIXEL_X(3);
-    TEST_PIXEL_X(4);
-
-#define TEST_PIXEL_VAR( i ) \
-    if( pixel_asm.var[i] != pixel_ref.var[i] ) \
-    { \
-        set_func_name( "%s_%s", "var", pixel_names[i] ); \
-        used_asm = 1; \
-        /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
-        call_c1( pixel_c.var[i],   pbuf1,           16 ); \
-        call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
-        uint64_t res_c   = pixel_c.var[i]( pbuf1, 16 ); \
-        uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
-        if( res_c != res_asm ) \
-        { \
-            ok = 0; \
-            fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
-        } \
-        call_c2( pixel_c.var[i],   pbuf1, (intptr_t)16 ); \
-        call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
-    }
-
-    ok = 1; used_asm = 0;
-    TEST_PIXEL_VAR( PIXEL_16x16 );
-    TEST_PIXEL_VAR( PIXEL_8x16 );
-    TEST_PIXEL_VAR( PIXEL_8x8 );
-    report( "pixel var :" );
-
-#define TEST_PIXEL_VAR2( i ) \
-    if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
-    { \
-        int res_c, res_asm, ssd_c, ssd_asm; \
-        set_func_name( "%s_%s", "var2", pixel_names[i] ); \
-        used_asm = 1; \
-        res_c   = call_c( pixel_c.var2[i],   pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c   ); \
-        res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
-        if( res_c != res_asm || ssd_c != ssd_asm ) \
-        { \
-            ok = 0; \
-            fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
-        } \
-    }
-
-    ok = 1; used_asm = 0;
-    TEST_PIXEL_VAR2( PIXEL_8x16 );
-    TEST_PIXEL_VAR2( PIXEL_8x8 );
-    report( "pixel var2 :" );
-
-    ok = 1; used_asm = 0;
-    for( int i = 0; i < 4; i++ )
-        if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
-        {
-            set_func_name( "hadamard_ac_%s", pixel_names[i] );
-            used_asm = 1;
-            for( int j = 0; j < 32; j++ )
-            {
-                pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
-                call_c1( pixel_c.hadamard_ac[i],   pbuf1, (intptr_t)16 );
-                call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
-                uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
-                uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
-                if( rc != ra )
-                {
-                    ok = 0;
-                    fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) );
-                    break;
-                }
-            }
-            call_c2( pixel_c.hadamard_ac[i],   pbuf1, (intptr_t)16 );
-            call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
-        }
-    report( "pixel hadamard_ac :" );
-
-    // maximize sum
-    for( int i = 0; i < 32; i++ )
-        for( int j = 0; j < 16; j++ )
-            pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX;
-    ok = 1; used_asm = 0;
-    if( pixel_asm.vsad != pixel_ref.vsad )
-    {
-        for( int h = 2; h <= 32; h += 2 )
-        {
-            int res_c, res_asm;
-            set_func_name( "vsad" );
-            used_asm = 1;
-            for( int j = 0; j < 2 && ok; j++ )
-            {
-                pixel *p = j ? pbuf4 : pbuf1;
-                res_c   = call_c( pixel_c.vsad,   p, (intptr_t)16, h );
-                res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h );
-                if( res_c != res_asm )
-                {
-                    ok = 0;
-                    fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
-                    break;
-                }
-            }
-        }
-    }
-    report( "pixel vsad :" );
-
-    ok = 1; used_asm = 0;
-    if( pixel_asm.asd8 != pixel_ref.asd8 )
-    {
-        set_func_name( "asd8" );
-        used_asm = 1;
-        int res_c = call_c( pixel_c.asd8,   pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
-        int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
-        if( res_c != res_a )
-        {
-            ok = 0;
-            fprintf( stderr, "asd: %d != %d\n", res_c, res_a );
-        }
-    }
-    report( "pixel asd :" );
-
-#define TEST_INTRA_X3( name, i8x8, ... ) \
-    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
-    { \
-        ALIGNED_16( int res_c[3] ); \
-        ALIGNED_16( int res_asm[3] ); \
-        set_func_name( #name ); \
-        used_asm = 1; \
-        call_c( pixel_c.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_c ); \
-        call_a( pixel_asm.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_asm ); \
-        if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
-        { \
-            ok = 0; \
-            fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
-                     res_c[0], res_c[1], res_c[2], \
-                     res_asm[0], res_asm[1], res_asm[2] ); \
-        } \
-    }
-
-#define TEST_INTRA_X9( name, cmp ) \
-    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
-    { \
-        set_func_name( #name ); \
-        used_asm = 1; \
-        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
-        for( int i=0; i<17; i++ ) \
-            bitcosts[i] = 9*(i!=8); \
-        memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
-        memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
-        for( int i=0; i<32; i++ ) \
-        { \
-            pixel *fenc = pbuf1+48+i*12; \
-            pixel *fdec1 = pbuf3+48+i*12; \
-            pixel *fdec2 = pbuf4+48+i*12; \
-            int pred_mode = i%9; \
-            int res_c = INT_MAX; \
-            for( int j=0; j<9; j++ ) \
-            { \
-                predict_4x4[j]( fdec1 ); \
-                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
-                if( cost < (uint16_t)res_c ) \
-                    res_c = cost + (j<<16); \
-            } \
-            predict_4x4[res_c>>16]( fdec1 ); \
-            int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
-            if( res_c != res_a ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
-                break; \
-            } \
-            if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name" [FAILED]\n" ); \
-                for( int j=0; j<16; j++ ) \
-                    fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
-                fprintf( stderr, "\n" ); \
-                for( int j=0; j<16; j++ ) \
-                    fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
-                fprintf( stderr, "\n" ); \
-                break; \
-            } \
-        } \
-    }
-
-#define TEST_INTRA8_X9( name, cmp ) \
-    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
-    { \
-        set_func_name( #name ); \
-        used_asm = 1; \
-        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
-        ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
-        ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
-        memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
-        memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
-        for( int i=0; i<17; i++ ) \
-            bitcosts[i] = 9*(i!=8); \
-        for( int i=0; i<32; i++ ) \
-        { \
-            pixel *fenc = pbuf1+48+i*12; \
-            pixel *fdec1 = pbuf3+48+i*12; \
-            pixel *fdec2 = pbuf4+48+i*12; \
-            int pred_mode = i%9; \
-            int res_c = INT_MAX; \
-            predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
-            for( int j=0; j<9; j++ ) \
-            { \
-                predict_8x8[j]( fdec1, edge ); \
-                satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
-                if( satds_c[j] < (uint16_t)res_c ) \
-                    res_c = satds_c[j] + (j<<16); \
-            } \
-            predict_8x8[res_c>>16]( fdec1, edge ); \
-            int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
-            if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
-                for( int j = 0; j < 9; j++ ) \
-                    fprintf( stderr, "%5d ", satds_c[j]); \
-                fprintf( stderr, "\n" ); \
-                for( int j = 0; j < 9; j++ ) \
-                    fprintf( stderr, "%5d ", satds_a[j]); \
-                fprintf( stderr, "\n" ); \
-                break; \
-            } \
-            for( int j=0; j<8; j++ ) \
-                if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
-                    ok = 0; \
-            if( !ok ) \
-            { \
-                fprintf( stderr, #name" [FAILED]\n" ); \
-                for( int j=0; j<8; j++ ) \
-                { \
-                    for( int k=0; k<8; k++ ) \
-                        fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
-                    fprintf( stderr, "\n" ); \
-                } \
-                fprintf( stderr, "\n" ); \
-                for( int j=0; j<8; j++ ) \
-                { \
-                    for( int k=0; k<8; k++ ) \
-                        fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
-                    fprintf( stderr, "\n" ); \
-                } \
-                fprintf( stderr, "\n" ); \
-                break; \
-            } \
-        } \
-    }
-
-    memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
-    ok = 1; used_asm = 0;
-    TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
-    TEST_INTRA_X3( intra_satd_x3_8x16c, 0 );
-    TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
-    TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
-    TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
-    report( "intra satd_x3 :" );
-    ok = 1; used_asm = 0;
-    TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
-    TEST_INTRA_X3( intra_sad_x3_8x16c, 0 );
-    TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
-    TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
-    TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
-    report( "intra sad_x3 :" );
-    ok = 1; used_asm = 0;
-    TEST_INTRA_X9( intra_satd_x9_4x4, satd );
-    TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
-    report( "intra satd_x9 :" );
-    ok = 1; used_asm = 0;
-    TEST_INTRA_X9( intra_sad_x9_4x4, sad );
-    TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
-    report( "intra sad_x9 :" );
-
-    ok = 1; used_asm = 0;
-    if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core )
-    {
-        used_asm = 1;
-        set_func_name( "ssd_nv12" );
-        uint64_t res_u_c, res_v_c, res_u_a, res_v_a;
-        for( int w = 8; w <= 360; w += 8 )
-        {
-            pixel_c.ssd_nv12_core(   pbuf1, 368, pbuf2, 368, w, 8, &res_u_c, &res_v_c );
-            pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, w, 8, &res_u_a, &res_v_a );
-            if( res_u_c != res_u_a || res_v_c != res_v_a )
-            {
-                ok = 0;
-                fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
-                         res_u_c, res_v_c, res_u_a, res_v_a );
-            }
-        }
-        call_c( pixel_c.ssd_nv12_core,   pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c );
-        call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a );
-    }
-    report( "ssd_nv12 :" );
-
-    if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
-        pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
-    {
-        int cnt;
-        float res_c, res_a;
-        ALIGNED_16( int sums[5][4] ) = {{0}};
-        used_asm = ok = 1;
-        x264_emms();
-        res_c = x264_pixel_ssim_wxh( &pixel_c,   pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
-        res_a = x264_pixel_ssim_wxh( &pixel_asm, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
-        if( fabs( res_c - res_a ) > 1e-6 )
-        {
-            ok = 0;
-            fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
-        }
-        set_func_name( "ssim_core" );
-        call_c( pixel_c.ssim_4x4x2_core,   pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
-        call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
-        set_func_name( "ssim_end" );
-        call_c2( pixel_c.ssim_end4,   sums, sums, 4 );
-        call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
-        /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */
-        call_c1( pixel_c.ssim_end4,   sums, sums, 3 );
-        call_a1( pixel_asm.ssim_end4, sums, sums, 3 );
-        report( "ssim :" );
-    }
-
-    ok = 1; used_asm = 0;
-    for( int i = 0; i < 32; i++ )
-        cost_mv[i] = i*10;
-    for( int i = 0; i < 100 && ok; i++ )
-        if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
-        {
-            ALIGNED_16( uint16_t sums[72] );
-            ALIGNED_16( int dc[4] );
-            ALIGNED_16( int16_t mvs_a[48] );
-            ALIGNED_16( int16_t mvs_c[48] );
-            int mvn_a, mvn_c;
-            int thresh = rand() & 0x3fff;
-            set_func_name( "esa_ads" );
-            for( int j = 0; j < 72; j++ )
-                sums[j] = rand() & 0x3fff;
-            for( int j = 0; j < 4; j++ )
-                dc[j] = rand() & 0x3fff;
-            used_asm = 1;
-            mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
-            mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
-            if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
-            {
-                ok = 0;
-                printf( "c%d: ", i&3 );
-                for( int j = 0; j < mvn_c; j++ )
-                    printf( "%d ", mvs_c[j] );
-                printf( "\na%d: ", i&3 );
-                for( int j = 0; j < mvn_a; j++ )
-                    printf( "%d ", mvs_a[j] );
-                printf( "\n\n" );
-            }
-        }
-    report( "esa ads:" );
-
-    return ret;
-}
-
-static int check_dct( int cpu_ref, int cpu_new )
-{
-    x264_dct_function_t dct_c;
-    x264_dct_function_t dct_ref;
-    x264_dct_function_t dct_asm;
-    x264_quant_function_t qf;
-    int ret = 0, ok, used_asm, interlace = 0;
-    ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
-    ALIGNED_16( dctcoef dctdc[2][8] );
-    x264_t h_buf;
-    x264_t *h = &h_buf;
-
-    x264_dct_init( 0, &dct_c );
-    x264_dct_init( cpu_ref, &dct_ref);
-    x264_dct_init( cpu_new, &dct_asm );
-
-    memset( h, 0, sizeof(*h) );
-    x264_param_default( &h->param );
-    h->sps->i_chroma_format_idc = 1;
-    h->chroma_qp_table = i_chroma_qp_table + 12;
-    h->param.analyse.i_luma_deadzone[0] = 0;
-    h->param.analyse.i_luma_deadzone[1] = 0;
-    h->param.analyse.b_transform_8x8 = 1;
-    for( int i = 0; i < 6; i++ )
-        h->pps->scaling_list[i] = x264_cqm_flat16;
-    x264_cqm_init( h );
-    x264_quant_init( h, 0, &qf );
-
-    /* overflow test cases */
-    for( int i = 0; i < 5; i++ )
-    {
-        pixel *enc = &pbuf3[16*i*FENC_STRIDE];
-        pixel *dec = &pbuf4[16*i*FDEC_STRIDE];
-
-        for( int j = 0; j < 16; j++ )
-        {
-            int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1));
-            int cond_b = (i == 0) ? 1 : !cond_a;
-            enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0;
-            enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? PIXEL_MAX : 0;
-
-            for( int k = 0; k < 4; k++ )
-                dec[k] = PIXEL_MAX - enc[k];
-
-            enc += FENC_STRIDE;
-            dec += FDEC_STRIDE;
-        }
-    }
-
-#define TEST_DCT( name, t1, t2, size ) \
-    if( dct_asm.name != dct_ref.name ) \
-    { \
-        set_func_name( #name ); \
-        used_asm = 1; \
-        pixel *enc = pbuf3; \
-        pixel *dec = pbuf4; \
-        for( int j = 0; j < 5; j++) \
-        { \
-            call_c( dct_c.name, t1, &pbuf1[j*64], &pbuf2[j*64] ); \
-            call_a( dct_asm.name, t2, &pbuf1[j*64], &pbuf2[j*64] ); \
-            if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name " [FAILED]\n" ); \
-                for( int k = 0; k < size; k++ )\
-                    printf( "%d ", ((dctcoef*)t1)[k] );\
-                printf("\n");\
-                for( int k = 0; k < size; k++ )\
-                    printf( "%d ", ((dctcoef*)t2)[k] );\
-                printf("\n");\
-                break; \
-            } \
-            call_c( dct_c.name, t1, enc, dec ); \
-            call_a( dct_asm.name, t2, enc, dec ); \
-            if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name " [FAILED] (overflow)\n" ); \
-                break; \
-            } \
-            enc += 16*FENC_STRIDE; \
-            dec += 16*FDEC_STRIDE; \
-        } \
-    }
-    ok = 1; used_asm = 0;
-    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
-    TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
-    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
-    TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 );
-    TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
-    report( "sub_dct4 :" );
-
-    ok = 1; used_asm = 0;
-    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 );
-    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 );
-    report( "sub_dct8 :" );
-#undef TEST_DCT
-
-    // fdct and idct are denormalized by different factors, so quant/dequant
-    // is needed to force the coefs into the right range.
-    dct_c.sub16x16_dct( dct4, pbuf1, pbuf2 );
-    dct_c.sub16x16_dct8( dct8, pbuf1, pbuf2 );
-    for( int i = 0; i < 16; i++ )
-    {
-        qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
-        qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 );
-    }
-    for( int i = 0; i < 4; i++ )
-    {
-        qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] );
-        qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 );
-    }
-    x264_cqm_delete( h );
-
-#define TEST_IDCT( name, src ) \
-    if( dct_asm.name != dct_ref.name ) \
-    { \
-        set_func_name( #name ); \
-        used_asm = 1; \
-        memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \
-        memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \
-        memcpy( dct1, src, 256 * sizeof(dctcoef) ); \
-        memcpy( dct2, src, 256 * sizeof(dctcoef) ); \
-        call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
-        call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
-        if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \
-        { \
-            ok = 0; \
-            fprintf( stderr, #name " [FAILED]\n" ); \
-        } \
-        call_c2( dct_c.name, pbuf3, (void*)dct1 ); \
-        call_a2( dct_asm.name, pbuf4, (void*)dct2 ); \
-    }
-    ok = 1; used_asm = 0;
-    TEST_IDCT( add4x4_idct, dct4 );
-    TEST_IDCT( add8x8_idct, dct4 );
-    TEST_IDCT( add8x8_idct_dc, dct4 );
-    TEST_IDCT( add16x16_idct, dct4 );
-    TEST_IDCT( add16x16_idct_dc, dct4 );
-    report( "add_idct4 :" );
-
-    ok = 1; used_asm = 0;
-    TEST_IDCT( add8x8_idct8, dct8 );
-    TEST_IDCT( add16x16_idct8, dct8 );
-    report( "add_idct8 :" );
-#undef TEST_IDCT
-
-#define TEST_DCTDC( name )\
-    ok = 1; used_asm = 0;\
-    if( dct_asm.name != dct_ref.name )\
-    {\
-        set_func_name( #name );\
-        used_asm = 1;\
-        uint16_t *p = (uint16_t*)buf1;\
-        for( int i = 0; i < 16 && ok; i++ )\
-        {\
-            for( int j = 0; j < 16; j++ )\
-                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
-                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
-                           : ((*p++)&0x1fff)-0x1000; /* general case */\
-            memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
-            call_c1( dct_c.name, dct1[0] );\
-            call_a1( dct_asm.name, dct2[0] );\
-            if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\
-                ok = 0;\
-        }\
-        call_c2( dct_c.name, dct1[0] );\
-        call_a2( dct_asm.name, dct2[0] );\
-    }\
-    report( #name " :" );
-
-    TEST_DCTDC(  dct4x4dc );
-    TEST_DCTDC( idct4x4dc );
-#undef TEST_DCTDC
-
-#define TEST_DCTDC_CHROMA( name )\
-    ok = 1; used_asm = 0;\
-    if( dct_asm.name != dct_ref.name )\
-    {\
-        set_func_name( #name );\
-        used_asm = 1;\
-        uint16_t *p = (uint16_t*)buf1;\
-        for( int i = 0; i < 16 && ok; i++ )\
-        {\
-            for( int j = 0; j < 8; j++ )\
-                dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
-                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
-                           : ((*p++)&0x1fff)-0x1000; /* general case */\
-            memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\
-            call_c1( dct_c.name, dctdc[0], dct1 );\
-            call_a1( dct_asm.name, dctdc[1], dct2 );\
-            if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\
-            {\
-                ok = 0;\
-                fprintf( stderr, #name " [FAILED]\n" ); \
-            }\
-        }\
-        call_c2( dct_c.name, dctdc[0], dct1 );\
-        call_a2( dct_asm.name, dctdc[1], dct2 );\
-    }\
-    report( #name " :" );
-
-    TEST_DCTDC_CHROMA( dct2x4dc );
-#undef TEST_DCTDC_CHROMA
-
-    x264_zigzag_function_t zigzag_c[2];
-    x264_zigzag_function_t zigzag_ref[2];
-    x264_zigzag_function_t zigzag_asm[2];
-
-    ALIGNED_ARRAY_16( dctcoef, level1,[64] );
-    ALIGNED_ARRAY_16( dctcoef, level2,[64] );
-
-#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
-    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
-    { \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
-        used_asm = 1; \
-        for( int i = 0; i < size*size; i++ ) \
-            dct[i] = i; \
-        call_c( zigzag_c[interlace].name, t1, dct ); \
-        call_a( zigzag_asm[interlace].name, t2, dct ); \
-        if( memcmp( t1, t2, size*size*sizeof(dctcoef) ) ) \
-        { \
-            ok = 0; \
-            for( int i = 0; i < 2; i++ ) \
-            { \
-                dctcoef *d = (dctcoef*)(i ? t2 : t1); \
-                for( int j = 0; j < size; j++ ) \
-                { \
-                    for( int k = 0; k < size; k++ ) \
-                        fprintf( stderr, "%2d ", d[k+j*8] ); \
-                    fprintf( stderr, "\n" ); \
-                } \
-                fprintf( stderr, "\n" ); \
-            } \
-            fprintf( stderr, #name " [FAILED]\n" ); \
-        } \
-    }
-
-#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
-    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
-    { \
-        int nz_a, nz_c; \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
-        used_asm = 1; \
-        memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
-        memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
-        nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
-        nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
-        if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
-        { \
-            ok = 0; \
-            fprintf( stderr, #name " [FAILED]\n" ); \
-        } \
-        call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
-        call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
-    }
-
-#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
-    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
-    { \
-        int nz_a, nz_c; \
-        dctcoef dc_a, dc_c; \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
-        used_asm = 1; \
-        for( int i = 0; i < 2; i++ ) \
-        { \
-            memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
-            memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
-            for( int j = 0; j < 4; j++ ) \
-            { \
-                memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
-                memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
-            } \
-            nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
-            nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
-            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name " [FAILED]\n" ); \
-                break; \
-            } \
-        } \
-        call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
-        call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
-    }
-
-#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
-    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
-    { \
-        for( int j = 0; j < 100; j++ ) \
-        { \
-            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
-            used_asm = 1; \
-            memcpy(dct, buf1, size*sizeof(dctcoef)); \
-            for( int i = 0; i < size; i++ ) \
-                dct[i] = rand()&0x1F ? 0 : dct[i]; \
-            memcpy(buf3, buf4, 10); \
-            call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \
-            call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \
-            if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \
-            { \
-                ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\
-            } \
-        } \
-    }
-
-    x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] );
-    x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] );
-    x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] );
-
-    ok = 1; used_asm = 0;
-    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 );
-    report( "zigzag_interleave :" );
-
-    for( interlace = 0; interlace <= 1; interlace++ )
-    {
-        ok = 1; used_asm = 0;
-        TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 );
-        TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 );
-        TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
-        TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 );
-        TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
-        report( interlace ? "zigzag_field :" : "zigzag_frame :" );
-    }
-#undef TEST_ZIGZAG_SCAN
-#undef TEST_ZIGZAG_SUB
-
-    return ret;
-}
-
-static int check_mc( int cpu_ref, int cpu_new )
-{
-    x264_mc_functions_t mc_c;
-    x264_mc_functions_t mc_ref;
-    x264_mc_functions_t mc_a;
-    x264_pixel_function_t pixf;
-
-    pixel *src     = &(pbuf1)[2*64+2];
-    pixel *src2[4] = { &(pbuf1)[3*64+2], &(pbuf1)[5*64+2],
-                       &(pbuf1)[7*64+2], &(pbuf1)[9*64+2] };
-    pixel *dst1    = pbuf3;
-    pixel *dst2    = pbuf4;
-
-    int ret = 0, ok, used_asm;
-
-    x264_mc_init( 0, &mc_c, 0 );
-    x264_mc_init( cpu_ref, &mc_ref, 0 );
-    x264_mc_init( cpu_new, &mc_a, 0 );
-    x264_pixel_init( 0, &pixf );
-
-#define MC_TEST_LUMA( w, h ) \
-        if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
-        { \
-            const x264_weight_t *weight = x264_weight_none; \
-            set_func_name( "mc_luma_%dx%d", w, h ); \
-            used_asm = 1; \
-            for( int i = 0; i < 1024; i++ ) \
-                pbuf3[i] = pbuf4[i] = 0xCD; \
-            call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
-            call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
-            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
-            { \
-                fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
-                ok = 0; \
-            } \
-        } \
-        if( mc_a.get_ref != mc_ref.get_ref ) \
-        { \
-            pixel *ref = dst2; \
-            intptr_t ref_stride = 32; \
-            int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
-            const x264_weight_t *weight = x264_weight_none; \
-            set_func_name( "get_ref_%dx%d", w_checked, h ); \
-            used_asm = 1; \
-            for( int i = 0; i < 1024; i++ ) \
-                pbuf3[i] = pbuf4[i] = 0xCD; \
-            call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
-            ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \
-            for( int i = 0; i < h; i++ ) \
-                if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
-                { \
-                    fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w_checked, h ); \
-                    ok = 0; \
-                    break; \
-                } \
-        }
-
-#define MC_TEST_CHROMA( w, h ) \
-        if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
-        { \
-            set_func_name( "mc_chroma_%dx%d", w, h ); \
-            used_asm = 1; \
-            for( int i = 0; i < 1024; i++ ) \
-                pbuf3[i] = pbuf4[i] = 0xCD; \
-            call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
-            call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
-            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
-            for( int j = 0; j < h; j++ ) \
-                for( int i = w; i < 8; i++ ) \
-                { \
-                    dst2[i+j*16+8] = dst1[i+j*16+8]; \
-                    dst2[i+j*16  ] = dst1[i+j*16  ]; \
-                } \
-            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
-            { \
-                fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
-                ok = 0; \
-            } \
-        }
-    ok = 1; used_asm = 0;
-    for( int dy = -8; dy < 8; dy++ )
-        for( int dx = -128; dx < 128; dx++ )
-        {
-            if( rand()&15 ) continue; // running all of them is too slow
-            MC_TEST_LUMA( 20, 18 );
-            MC_TEST_LUMA( 16, 16 );
-            MC_TEST_LUMA( 16, 8 );
-            MC_TEST_LUMA( 12, 10 );
-            MC_TEST_LUMA( 8, 16 );
-            MC_TEST_LUMA( 8, 8 );
-            MC_TEST_LUMA( 8, 4 );
-            MC_TEST_LUMA( 4, 8 );
-            MC_TEST_LUMA( 4, 4 );
-        }
-    report( "mc luma :" );
-
-    ok = 1; used_asm = 0;
-    for( int dy = -1; dy < 9; dy++ )
-        for( int dx = -128; dx < 128; dx++ )
-        {
-            if( rand()&15 ) continue;
-            MC_TEST_CHROMA( 8, 8 );
-            MC_TEST_CHROMA( 8, 4 );
-            MC_TEST_CHROMA( 4, 8 );
-            MC_TEST_CHROMA( 4, 4 );
-            MC_TEST_CHROMA( 4, 2 );
-            MC_TEST_CHROMA( 2, 4 );
-            MC_TEST_CHROMA( 2, 2 );
-        }
-    report( "mc chroma :" );
-#undef MC_TEST_LUMA
-#undef MC_TEST_CHROMA
-
-#define MC_TEST_AVG( name, weight ) \
-{ \
-    for( int i = 0; i < 12; i++ ) \
-    { \
-        memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
-        memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
-        if( mc_a.name[i] != mc_ref.name[i] ) \
-        { \
-            set_func_name( "%s_%s", #name, pixel_names[i] ); \
-            used_asm = 1; \
-            call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
-            call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
-            if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
-            } \
-            call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
-            call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
-        } \
-    } \
-}
-
-    ok = 1, used_asm = 0;
-    for( int w = -63; w <= 127 && ok; w++ )
-        MC_TEST_AVG( avg, w );
-    report( "mc wpredb :" );
-
-#define MC_TEST_WEIGHT( name, weight, aligned ) \
-    int align_off = (aligned ? 0 : rand()%16); \
-    for( int i = 1; i <= 5; i++ ) \
-    { \
-        ALIGNED_16( pixel buffC[640] ); \
-        ALIGNED_16( pixel buffA[640] ); \
-        int j = X264_MAX( i*4, 2 ); \
-        memset( buffC, 0, 640 * sizeof(pixel) ); \
-        memset( buffA, 0, 640 * sizeof(pixel) ); \
-        x264_t ha; \
-        ha.mc = mc_a; \
-        /* w12 is the same as w16 in some cases */ \
-        if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
-            continue; \
-        if( mc_a.name[i] != mc_ref.name[i] ) \
-        { \
-            set_func_name( "%s_w%d", #name, j ); \
-            used_asm = 1; \
-            call_c1( mc_c.weight[i],     buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
-            mc_a.weight_cache(&ha, &weight); \
-            call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
-            for( int k = 0; k < 16; k++ ) \
-                if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
-                { \
-                    ok = 0; \
-                    fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
-                    break; \
-                } \
-            /* omit unlikely high scales for benchmarking */ \
-            if( (s << (8-d)) < 512 ) \
-            { \
-                call_c2( mc_c.weight[i],     buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
-                call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
-            } \
-        } \
-    }
-
-    ok = 1; used_asm = 0;
-
-    int align_cnt = 0;
-    for( int s = 0; s <= 127 && ok; s++ )
-    {
-        for( int o = -128; o <= 127 && ok; o++ )
-        {
-            if( rand() & 2047 ) continue;
-            for( int d = 0; d <= 7 && ok; d++ )
-            {
-                if( s == 1<<d )
-                    continue;
-                x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
-                MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
-            }
-        }
-
-    }
-    report( "mc weight :" );
-
-    ok = 1; used_asm = 0;
-    for( int o = 0; o <= 127 && ok; o++ )
-    {
-        int s = 1, d = 0;
-        if( rand() & 15 ) continue;
-        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
-        MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
-    }
-    report( "mc offsetadd :" );
-    ok = 1; used_asm = 0;
-    for( int o = -128; o < 0 && ok; o++ )
-    {
-        int s = 1, d = 0;
-        if( rand() & 15 ) continue;
-        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
-        MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
-    }
-    report( "mc offsetsub :" );
-
-    ok = 1; used_asm = 0;
-    for( int height = 8; height <= 16; height += 8 )
-    {
-        if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma )
-        {
-            set_func_name( "store_interleave_chroma" );
-            used_asm = 1;
-            memset( pbuf3, 0, 64*height );
-            memset( pbuf4, 0, 64*height );
-            call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
-            call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
-            if( memcmp( pbuf3, pbuf4, 64*height ) )
-            {
-                ok = 0;
-                fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height );
-                break;
-            }
-        }
-        if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc )
-        {
-            set_func_name( "load_deinterleave_chroma_fenc" );
-            used_asm = 1;
-            call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, (intptr_t)64, height );
-            call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, (intptr_t)64, height );
-            if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
-            {
-                ok = 0;
-                fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height );
-                break;
-            }
-        }
-        if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec )
-        {
-            set_func_name( "load_deinterleave_chroma_fdec" );
-            used_asm = 1;
-            call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, (intptr_t)64, height );
-            call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, (intptr_t)64, height );
-            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
-            {
-                ok = 0;
-                fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height );
-                break;
-            }
-        }
-    }
-    report( "store_interleave :" );
-
-    struct plane_spec {
-        int w, h, src_stride;
-    } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} };
-    ok = 1; used_asm = 0;
-    if( mc_a.plane_copy != mc_ref.plane_copy )
-    {
-        set_func_name( "plane_copy" );
-        used_asm = 1;
-        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
-        {
-            int w = plane_specs[i].w;
-            int h = plane_specs[i].h;
-            intptr_t src_stride = plane_specs[i].src_stride;
-            intptr_t dst_stride = (w + 127) & ~63;
-            assert( dst_stride * h <= 0x1000 );
-            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
-            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
-            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
-            call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h );
-            call_a( mc_a.plane_copy, pbuf4, dst_stride, src1, src_stride, w, h );
-            for( int y = 0; y < h; y++ )
-                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
-                {
-                    ok = 0;
-                    fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
-                    break;
-                }
-        }
-    }
-
-    if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap )
-    {
-        set_func_name( "plane_copy_swap" );
-        used_asm = 1;
-        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
-        {
-            int w = (plane_specs[i].w + 1) >> 1;
-            int h = plane_specs[i].h;
-            intptr_t src_stride = plane_specs[i].src_stride;
-            intptr_t dst_stride = (2*w + 127) & ~63;
-            assert( dst_stride * h <= 0x1000 );
-            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
-            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
-            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
-            call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h );
-            call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h );
-            for( int y = 0; y < h; y++ )
-                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
-                {
-                    ok = 0;
-                    fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
-                    break;
-                }
-        }
-    }
-
-    if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave )
-    {
-        set_func_name( "plane_copy_interleave" );
-        used_asm = 1;
-        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
-        {
-            int w = (plane_specs[i].w + 1) >> 1;
-            int h = plane_specs[i].h;
-            intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
-            intptr_t dst_stride = (2*w + 127) & ~63;
-            assert( dst_stride * h <= 0x1000 );
-            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
-            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
-            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
-            call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
-            call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
-            for( int y = 0; y < h; y++ )
-                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
-                {
-                    ok = 0;
-                    fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
-                    break;
-                }
-        }
-    }
-
-    if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave )
-    {
-        set_func_name( "plane_copy_deinterleave" );
-        used_asm = 1;
-        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
-        {
-            int w = (plane_specs[i].w + 1) >> 1;
-            int h = plane_specs[i].h;
-            intptr_t dst_stride = w;
-            intptr_t src_stride = (2*w + 127) & ~63;
-            intptr_t offv = (dst_stride*h + 31) & ~15;
-            memset( pbuf3, 0, 0x1000 );
-            memset( pbuf4, 0, 0x1000 );
-            call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
-            call_a( mc_a.plane_copy_deinterleave, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h );
-            for( int y = 0; y < h; y++ )
-                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride, w ) ||
-                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
-                {
-                    ok = 0;
-                    fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
-                    break;
-                }
-        }
-    }
-
-    if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb )
-    {
-        set_func_name( "plane_copy_deinterleave_rgb" );
-        used_asm = 1;
-        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
-        {
-            int w = (plane_specs[i].w + 2) >> 2;
-            int h = plane_specs[i].h;
-            intptr_t src_stride = plane_specs[i].src_stride;
-            intptr_t dst_stride = ALIGN( w, 16 );
-            intptr_t offv = dst_stride*h + 16;
-
-            for( int pw = 3; pw <= 4; pw++ )
-            {
-                memset( pbuf3, 0, 0x1000 );
-                memset( pbuf4, 0, 0x1000 );
-                call_c( mc_c.plane_copy_deinterleave_rgb, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf3+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
-                call_a( mc_a.plane_copy_deinterleave_rgb, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf4+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
-                for( int y = 0; y < h; y++ )
-                    if( memcmp( pbuf3+y*dst_stride+0*offv, pbuf4+y*dst_stride+0*offv, w ) ||
-                        memcmp( pbuf3+y*dst_stride+1*offv, pbuf4+y*dst_stride+1*offv, w ) ||
-                        memcmp( pbuf3+y*dst_stride+2*offv, pbuf4+y*dst_stride+2*offv, w ) )
-                    {
-                        ok = 0;
-                        fprintf( stderr, "plane_copy_deinterleave_rgb FAILED: w=%d h=%d stride=%d pw=%d\n", w, h, (int)src_stride, pw );
-                        break;
-                    }
-            }
-        }
-    }
-    report( "plane_copy :" );
-
-    if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
-    {
-        set_func_name( "plane_copy_deinterleave_v210" );
-        ok = 1; used_asm = 1;
-        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
-        {
-            int w = (plane_specs[i].w + 1) >> 1;
-            int h = plane_specs[i].h;
-            intptr_t dst_stride = ALIGN( w, 16 );
-            intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
-            intptr_t offv = dst_stride*h + 32;
-            memset( pbuf3, 0, 0x1000 );
-            memset( pbuf4, 0, 0x1000 );
-            call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
-            call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
-            for( int y = 0; y < h; y++ )
-                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w*sizeof(uint16_t) ) ||
-                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
-                {
-                    ok = 0;
-                    fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
-                    break;
-                }
-        }
-        report( "v210 :" );
-    }
-
-    if( mc_a.hpel_filter != mc_ref.hpel_filter )
-    {
-        pixel *srchpel = pbuf1+8+2*64;
-        pixel *dstc[3] = { pbuf3+8, pbuf3+8+16*64, pbuf3+8+32*64 };
-        pixel *dsta[3] = { pbuf4+8, pbuf4+8+16*64, pbuf4+8+32*64 };
-        void *tmp = pbuf3+49*64;
-        set_func_name( "hpel_filter" );
-        ok = 1; used_asm = 1;
-        memset( pbuf3, 0, 4096 * sizeof(pixel) );
-        memset( pbuf4, 0, 4096 * sizeof(pixel) );
-        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp );
-        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp );
-        for( int i = 0; i < 3; i++ )
-            for( int j = 0; j < 10; j++ )
-                //FIXME ideally the first pixels would match too, but they aren't actually used
-                if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 * sizeof(pixel) ) )
-                {
-                    ok = 0;
-                    fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
-                    for( int k = 0; k < 48; k++ )
-                        printf( "%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " " );
-                    printf( "\n" );
-                    for( int k = 0; k < 48; k++ )
-                        printf( "%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " " );
-                    printf( "\n" );
-                    break;
-                }
-        report( "hpel filter :" );
-    }
-
-    if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
-    {
-        pixel *dstc[4] = { pbuf3, pbuf3+1024, pbuf3+2048, pbuf3+3072 };
-        pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 };
-        set_func_name( "lowres_init" );
-        ok = 1; used_asm = 1;
-        for( int w = 96; w <= 96+24; w += 8 )
-        {
-            intptr_t stride = (w*2+31)&~31;
-            intptr_t stride_lowres = (w+31)&~31;
-            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 );
-            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 );
-            for( int i = 0; i < 8; i++ )
-            {
-                for( int j = 0; j < 4; j++ )
-                    if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * sizeof(pixel) ) )
-                    {
-                        ok = 0;
-                        fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
-                        for( int k = 0; k < w; k++ )
-                            printf( "%d ", dstc[j][k+i*stride_lowres] );
-                        printf( "\n" );
-                        for( int k = 0; k < w; k++ )
-                            printf( "%d ", dsta[j][k+i*stride_lowres] );
-                        printf( "\n" );
-                        break;
-                    }
-            }
-        }
-        report( "lowres init :" );
-    }
-
-#define INTEGRAL_INIT( name, size, offset, cmp_len, ... )\
-    if( mc_a.name != mc_ref.name )\
-    {\
-        intptr_t stride = 96;\
-        set_func_name( #name );\
-        used_asm = 1;\
-        memcpy( buf3, buf1, size*2*stride );\
-        memcpy( buf4, buf1, size*2*stride );\
-        uint16_t *sum = (uint16_t*)buf3;\
-        call_c1( mc_c.name, sum+offset, __VA_ARGS__ );\
-        sum = (uint16_t*)buf4;\
-        call_a1( mc_a.name, sum+offset, __VA_ARGS__ );\
-        if( memcmp( buf3+2*offset, buf4+2*offset, cmp_len*2 )\
-            || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
-            ok = 0;\
-        call_c2( mc_c.name, sum+offset, __VA_ARGS__ );\
-        call_a2( mc_a.name, sum+offset, __VA_ARGS__ );\
-    }
-    ok = 1; used_asm = 0;
-    INTEGRAL_INIT( integral_init4h, 2, stride, stride-4, pbuf2, stride );
-    INTEGRAL_INIT( integral_init8h, 2, stride, stride-8, pbuf2, stride );
-    INTEGRAL_INIT( integral_init4v, 14, 0, stride-8, sum+9*stride, stride );
-    INTEGRAL_INIT( integral_init8v, 9, 0, stride-8, stride );
-    report( "integral init :" );
-
-    ok = 1; used_asm = 0;
-    if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
-    {
-        used_asm = 1;
-        x264_emms();
-        for( int i = 0; i < 10; i++ )
-        {
-            float fps_factor = (rand()&65535) / 65535.0f;
-            set_func_name( "mbtree_propagate_cost" );
-            int16_t *dsta = (int16_t*)buf3;
-            int16_t *dstc = dsta+400;
-            uint16_t *prop = (uint16_t*)buf1;
-            uint16_t *intra = (uint16_t*)buf4;
-            uint16_t *inter = intra+128;
-            uint16_t *qscale = inter+128;
-            uint16_t *rnd = (uint16_t*)buf2;
-            x264_emms();
-            for( int j = 0; j < 100; j++ )
-            {
-                intra[j]  = *rnd++ & 0x7fff;
-                intra[j] += !intra[j];
-                inter[j]  = *rnd++ & 0x7fff;
-                qscale[j] = *rnd++ & 0x7fff;
-            }
-            call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
-            call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
-            // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
-            x264_emms();
-            for( int j = 0; j < 100 && ok; j++ )
-            {
-                ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
-                if( !ok )
-                    fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
-            }
-        }
-    }
-
-    if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
-    {
-        used_asm = 1;
-        for( int i = 0; i < 8; i++ )
-        {
-            set_func_name( "mbtree_propagate_list" );
-            x264_t h;
-            int height = 4;
-            int width = 128;
-            int size = width*height;
-            h.mb.i_mb_stride = width;
-            h.mb.i_mb_width = width;
-            h.mb.i_mb_height = height;
-
-            uint16_t *ref_costsc = (uint16_t*)buf3;
-            uint16_t *ref_costsa = (uint16_t*)buf4;
-            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
-            int16_t *propagate_amount = (int16_t*)(mvs + width);
-            uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
-            h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
-            int bipred_weight = (rand()%63)+1;
-            int list = i&1;
-            for( int j = 0; j < size; j++ )
-                ref_costsc[j] = ref_costsa[j] = rand()&32767;
-            for( int j = 0; j < width; j++ )
-            {
-                static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
-                for( int k = 0; k < 2; k++ )
-                    mvs[j][k] = (rand()&127) - 64;
-                propagate_amount[j] = rand()&32767;
-                lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
-            }
-
-            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
-            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
-
-            for( int j = 0; j < size && ok; j++ )
-            {
-                ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
-                if( !ok )
-                    fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
-            }
-
-            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
-            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
-        }
-    }
-
-    if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
-    {
-        set_func_name( "mbtree_fix8_pack" );
-        used_asm = 1;
-        float *fix8_src = (float*)(buf3 + 0x800);
-        uint16_t *dstc = (uint16_t*)buf3;
-        uint16_t *dsta = (uint16_t*)buf4;
-        for( int i = 0; i < 5; i++ )
-        {
-            int count = 256 + i;
-
-            for( int j = 0; j < count; j++ )
-                fix8_src[j] = (int16_t)(rand()) / 256.0f;
-            dsta[count] = 0xAAAA;
-
-            call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count );
-            call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count );
-
-            if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA )
-            {
-                ok = 0;
-                fprintf( stderr, "mbtree_fix8_pack FAILED\n" );
-                break;
-            }
-        }
-    }
-
-    if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack )
-    {
-        set_func_name( "mbtree_fix8_unpack" );
-        used_asm = 1;
-        uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
-        float *dstc = (float*)buf3;
-        float *dsta = (float*)buf4;
-        for( int i = 0; i < 5; i++ )
-        {
-            int count = 256 + i;
-
-            for( int j = 0; j < count; j++ )
-                fix8_src[j] = rand();
-            M32( &dsta[count] ) = 0xAAAAAAAA;
-
-            call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count );
-            call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count );
-
-            if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA )
-            {
-                ok = 0;
-                fprintf( stderr, "mbtree_fix8_unpack FAILED\n" );
-                break;
-            }
-        }
-    }
-    report( "mbtree :" );
-
-    if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
-    {
-        set_func_name( "memcpy_aligned" );
-        ok = 1; used_asm = 1;
-        for( size_t size = 16; size < 256; size += 16 )
-        {
-            memset( buf4, 0xAA, size + 1 );
-            call_c( mc_c.memcpy_aligned, buf3, buf1, size );
-            call_a( mc_a.memcpy_aligned, buf4, buf1, size );
-            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
-            {
-                ok = 0;
-                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
-                break;
-            }
-        }
-        report( "memcpy aligned :" );
-    }
-
-    if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
-    {
-        set_func_name( "memzero_aligned" );
-        ok = 1; used_asm = 1;
-        for( size_t size = 128; size < 1024; size += 128 )
-        {
-            memset( buf4, 0xAA, size + 1 );
-            call_c( mc_c.memzero_aligned, buf3, size );
-            call_a( mc_a.memzero_aligned, buf4, size );
-            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
-            {
-                ok = 0;
-                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
-                break;
-            }
-        }
-        report( "memzero aligned :" );
-    }
-
-    return ret;
-}
-
-static int check_deblock( int cpu_ref, int cpu_new )
-{
-    x264_deblock_function_t db_c;
-    x264_deblock_function_t db_ref;
-    x264_deblock_function_t db_a;
-    int ret = 0, ok = 1, used_asm = 0;
-    int alphas[36], betas[36];
-    int8_t tcs[36][4];
-
-    x264_deblock_init( 0, &db_c, 0 );
-    x264_deblock_init( cpu_ref, &db_ref, 0 );
-    x264_deblock_init( cpu_new, &db_a, 0 );
-
-    /* not exactly the real values of a,b,tc but close enough */
-    for( int i = 35, a = 255, c = 250; i >= 0; i-- )
-    {
-        alphas[i] = a << (BIT_DEPTH-8);
-        betas[i] = (i+1)/2 << (BIT_DEPTH-8);
-        tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8);
-        tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8);
-        tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8);
-        a = a*9/10;
-        c = c*9/10;
-    }
-
-#define TEST_DEBLOCK( name, align, ... ) \
-    for( int i = 0; i < 36; i++ ) \
-    { \
-        intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
-        for( int j = 0; j < 1024; j++ ) \
-            /* two distributions of random to excersize different failure modes */ \
-            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
-        memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
-        if( db_a.name != db_ref.name ) \
-        { \
-            set_func_name( #name ); \
-            used_asm = 1; \
-            call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
-                break; \
-            } \
-            call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-        } \
-    }
-
-    TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
-    TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
-    TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
-    TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
-    TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] );
-    TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] );
-    TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
-    TEST_DEBLOCK( deblock_luma_intra[0], 0 );
-    TEST_DEBLOCK( deblock_luma_intra[1], 1 );
-    TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
-    TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
-    TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 );
-    TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 );
-    TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
-
-    if( db_a.deblock_strength != db_ref.deblock_strength )
-    {
-        for( int i = 0; i < 100; i++ )
-        {
-            ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
-            ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
-            ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
-            ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
-            memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
-            for( int j = 0; j < X264_SCAN8_SIZE; j++ )
-                nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
-            for( int j = 0; j < 2; j++ )
-                for( int k = 0; k < X264_SCAN8_LUMA_SIZE; k++ )
-                {
-                    ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
-                    for( int l = 0; l < 2; l++ )
-                        mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
-                }
-            set_func_name( "deblock_strength" );
-            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
-            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
-            if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
-            {
-                ok = 0;
-                fprintf( stderr, "deblock_strength: [FAILED]\n" );
-                for( int j = 0; j < 2; j++ )
-                {
-                    for( int k = 0; k < 2; k++ )
-                        for( int l = 0; l < 4; l++ )
-                        {
-                            for( int m = 0; m < 4; m++ )
-                                printf("%d ",bs[j][k][l][m]);
-                            printf("\n");
-                        }
-                    printf("\n");
-                }
-                break;
-            }
-        }
-    }
-
-    report( "deblock :" );
-
-    return ret;
-}
-
-static int check_quant( int cpu_ref, int cpu_new )
-{
-    x264_quant_function_t qf_c;
-    x264_quant_function_t qf_ref;
-    x264_quant_function_t qf_a;
-    ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
-    ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
-    ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
-    ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
-    int ret = 0, ok, used_asm;
-    int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
-    x264_t h_buf;
-    x264_t *h = &h_buf;
-    memset( h, 0, sizeof(*h) );
-    h->sps->i_chroma_format_idc = 1;
-    x264_param_default( &h->param );
-    h->chroma_qp_table = i_chroma_qp_table + 12;
-    h->param.analyse.b_transform_8x8 = 1;
-
-    for( int i_cqm = 0; i_cqm < 4; i_cqm++ )
-    {
-        if( i_cqm == 0 )
-        {
-            for( int i = 0; i < 6; i++ )
-                h->pps->scaling_list[i] = x264_cqm_flat16;
-            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_FLAT;
-        }
-        else if( i_cqm == 1 )
-        {
-            for( int i = 0; i < 6; i++ )
-                h->pps->scaling_list[i] = x264_cqm_jvt[i];
-            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_JVT;
-        }
-        else
-        {
-            int max_scale = BIT_DEPTH < 10 ? 255 : 228;
-            if( i_cqm == 2 )
-                for( int i = 0; i < 64; i++ )
-                    cqm_buf[i] = 10 + rand() % (max_scale - 9);
-            else
-                for( int i = 0; i < 64; i++ )
-                    cqm_buf[i] = 1;
-            for( int i = 0; i < 6; i++ )
-                h->pps->scaling_list[i] = cqm_buf;
-            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_CUSTOM;
-        }
-
-        h->param.rc.i_qp_min = 0;
-        h->param.rc.i_qp_max = QP_MAX_SPEC;
-        x264_cqm_init( h );
-        x264_quant_init( h, 0, &qf_c );
-        x264_quant_init( h, cpu_ref, &qf_ref );
-        x264_quant_init( h, cpu_new, &qf_a );
-
-#define INIT_QUANT8(j,max) \
-        { \
-            static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
-            for( int i = 0; i < max; i++ ) \
-            { \
-                unsigned int scale = (255*scale1d[(i>>3)&7]*scale1d[i&7])/16; \
-                dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand()%(2*scale+1))-scale : 0; \
-            } \
-        }
-
-#define INIT_QUANT4(j,max) \
-        { \
-            static const int scale1d[4] = {4,6,4,6}; \
-            for( int i = 0; i < max; i++ ) \
-            { \
-                unsigned int scale = 255*scale1d[(i>>2)&3]*scale1d[i&3]; \
-                dct1[i] = dct2[i] = (j>>(i>>4))&1 ? (rand()%(2*scale+1))-scale : 0; \
-            } \
-        }
-
-#define TEST_QUANT_DC( name, cqm ) \
-        if( qf_a.name != qf_ref.name ) \
-        { \
-            set_func_name( #name ); \
-            used_asms[0] = 1; \
-            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
-            { \
-                for( int j = 0; j < 2; j++ ) \
-                { \
-                    int result_c, result_a; \
-                    for( int i = 0; i < 16; i++ ) \
-                        dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
-                    result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \
-                    { \
-                        oks[0] = 0; \
-                        fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
-                        break; \
-                    } \
-                    call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                } \
-            } \
-        }
-
-#define TEST_QUANT( qname, block, type, w, maxj ) \
-        if( qf_a.qname != qf_ref.qname ) \
-        { \
-            set_func_name( #qname ); \
-            used_asms[0] = 1; \
-            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
-            { \
-                for( int j = 0; j < maxj; j++ ) \
-                { \
-                    INIT_QUANT##type(j, w*w) \
-                    int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
-                    int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
-                    if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
-                    { \
-                        oks[0] = 0; \
-                        fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
-                        break; \
-                    } \
-                    call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
-                    call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
-                } \
-            } \
-        }
-
-        TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 );
-        TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 );
-        TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 );
-        TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 );
-        TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 );
-        TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 );
-        TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
-        TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
-
-#define TEST_DEQUANT( qname, dqname, block, w ) \
-        if( qf_a.dqname != qf_ref.dqname ) \
-        { \
-            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
-            used_asms[1] = 1; \
-            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
-            { \
-                INIT_QUANT##w(1, w*w) \
-                qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
-                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
-                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
-                { \
-                    oks[1] = 0; \
-                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
-                    break; \
-                } \
-                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
-                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-            } \
-        }
-
-        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
-        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
-        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
-        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
-
-#define TEST_DEQUANT_DC( qname, dqname, block, w ) \
-        if( qf_a.dqname != qf_ref.dqname ) \
-        { \
-            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
-            used_asms[1] = 1; \
-            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
-            { \
-                for( int i = 0; i < 16; i++ ) \
-                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
-                qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
-                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
-                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
-                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
-                { \
-                    oks[1] = 0; \
-                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
-                } \
-                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
-                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-            } \
-        }
-
-        TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
-
-        if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc )
-        {
-            set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
-            used_asms[1] = 1;
-            for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- )
-            {
-                for( int i = 0; i < 8; i++ )
-                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
-                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
-                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
-                call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 );
-                call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 );
-                for( int i = 0; i < 8; i++ )
-                    if( dct3[i][0] != dct4[i][0] )
-                    {
-                        oks[1] = 0;
-                        fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
-                        break;
-                    }
-            }
-        }
-
-        if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly )
-        {
-            set_func_name( "idct_dequant_2x4_dconly_%s", i_cqm?"cqm":"flat" );
-            used_asms[1] = 1;
-            for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- )
-            {
-                for( int i = 0; i < 8; i++ )
-                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
-                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
-                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
-                memcpy( dct2, dct1, 8*sizeof(dctcoef) );
-                call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
-                call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
-                if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) )
-                {
-                    oks[1] = 0;
-                    fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
-                    break;
-                }
-                call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
-                call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
-            }
-        }
-
-#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \
-        if( qf_a.optname != qf_ref.optname ) \
-        { \
-            set_func_name( #optname ); \
-            used_asms[2] = 1; \
-            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
-            { \
-                int qpdc = qp + (size == 8 ? 3 : 0); \
-                int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \
-                if( dmf > 32*64 ) \
-                    continue; \
-                for( int i = 16;; i <<= 1 ) \
-                { \
-                    int res_c, res_asm; \
-                    int max = X264_MIN( i, PIXEL_MAX*16 ); \
-                    for( int j = 0; j < size; j++ ) \
-                        dct1[j] = rand()%(max*2+1) - max; \
-                    for( int j = 0; i <= size; j += 4 ) \
-                        qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
-                    memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
-                    res_c   = call_c1( qf_c.optname, dct1, dmf ); \
-                    res_asm = call_a1( qf_a.optname, dct2, dmf ); \
-                    if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \
-                    { \
-                        oks[2] = 0; \
-                        fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
-                    } \
-                    call_c2( qf_c.optname, dct1, dmf ); \
-                    call_a2( qf_a.optname, dct2, dmf ); \
-                    if( i >= PIXEL_MAX*16 ) \
-                        break; \
-                } \
-            } \
-        }
-
-        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 );
-        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 );
-
-        x264_cqm_delete( h );
-    }
-
-    ok = oks[0]; used_asm = used_asms[0];
-    report( "quant :" );
-
-    ok = oks[1]; used_asm = used_asms[1];
-    report( "dequant :" );
-
-    ok = oks[2]; used_asm = used_asms[2];
-    report( "optimize chroma dc :" );
-
-    ok = 1; used_asm = 0;
-    if( qf_a.denoise_dct != qf_ref.denoise_dct )
-    {
-        used_asm = 1;
-        for( int size = 16; size <= 64; size += 48 )
-        {
-            set_func_name( "denoise_dct" );
-            memcpy( dct1, buf1, size*sizeof(dctcoef) );
-            memcpy( dct2, buf1, size*sizeof(dctcoef) );
-            memcpy( buf3+256, buf3, 256 );
-            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
-            call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
-            if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
-                ok = 0;
-            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
-            call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
-        }
-    }
-    report( "denoise dct :" );
-
-#define TEST_DECIMATE( decname, w, ac, thresh ) \
-    if( qf_a.decname != qf_ref.decname ) \
-    { \
-        set_func_name( #decname ); \
-        used_asm = 1; \
-        for( int i = 0; i < 100; i++ ) \
-        { \
-            static const int distrib[16] = {1,1,1,1,1,1,1,1,1,1,1,1,2,3,4};\
-            static const int zerorate_lut[4] = {3,7,15,31};\
-            int zero_rate = zerorate_lut[i&3];\
-            for( int idx = 0; idx < w*w; idx++ ) \
-            { \
-                int sign = (rand()&1) ? -1 : 1; \
-                int abs_level = distrib[rand()&15]; \
-                if( abs_level == 4 ) abs_level = rand()&0x3fff; \
-                int zero = !(rand()&zero_rate); \
-                dct1[idx] = zero * abs_level * sign; \
-            } \
-            if( ac ) \
-                dct1[0] = 0; \
-            int result_c = call_c( qf_c.decname, dct1 ); \
-            int result_a = call_a( qf_a.decname, dct1 ); \
-            if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #decname ": [FAILED]\n" ); \
-                break; \
-            } \
-        } \
-    }
-
-    ok = 1; used_asm = 0;
-    TEST_DECIMATE( decimate_score64, 8, 0, 6 );
-    TEST_DECIMATE( decimate_score16, 4, 0, 6 );
-    TEST_DECIMATE( decimate_score15, 4, 1, 7 );
-    report( "decimate_score :" );
-
-#define TEST_LAST( last, lastname, size, ac ) \
-    if( qf_a.last != qf_ref.last ) \
-    { \
-        set_func_name( #lastname ); \
-        used_asm = 1; \
-        for( int i = 0; i < 100; i++ ) \
-        { \
-            int nnz = 0; \
-            int max = rand() & (size-1); \
-            memset( dct1, 0, size*sizeof(dctcoef) ); \
-            for( int idx = ac; idx < max; idx++ ) \
-                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
-            if( !nnz ) \
-                dct1[ac] = 1; \
-            int result_c = call_c( qf_c.last, dct1+ac ); \
-            int result_a = call_a( qf_a.last, dct1+ac ); \
-            if( result_c != result_a ) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #lastname ": [FAILED]\n" ); \
-                break; \
-            } \
-        } \
-    }
-
-    ok = 1; used_asm = 0;
-    TEST_LAST( coeff_last4              , coeff_last4,   4, 0 );
-    TEST_LAST( coeff_last8              , coeff_last8,   8, 0 );
-    TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 16, 1 );
-    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
-    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
-    report( "coeff_last :" );
-
-#define TEST_LEVELRUN( lastname, name, size, ac ) \
-    if( qf_a.lastname != qf_ref.lastname ) \
-    { \
-        set_func_name( #name ); \
-        used_asm = 1; \
-        for( int i = 0; i < 100; i++ ) \
-        { \
-            x264_run_level_t runlevel_c, runlevel_a; \
-            int nnz = 0; \
-            int max = rand() & (size-1); \
-            memset( dct1, 0, size*sizeof(dctcoef) ); \
-            memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
-            memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
-            for( int idx = ac; idx < max; idx++ ) \
-                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
-            if( !nnz ) \
-                dct1[ac] = 1; \
-            int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
-            int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
-            if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
-                runlevel_c.mask != runlevel_a.mask || \
-                memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c)) \
-            { \
-                ok = 0; \
-                fprintf( stderr, #name ": [FAILED]\n" ); \
-                break; \
-            } \
-        } \
-    }
-
-    ok = 1; used_asm = 0;
-    TEST_LEVELRUN( coeff_level_run4              , coeff_level_run4,   4, 0 );
-    TEST_LEVELRUN( coeff_level_run8              , coeff_level_run8,   8, 0 );
-    TEST_LEVELRUN( coeff_level_run[  DCT_LUMA_AC], coeff_level_run15, 16, 1 );
-    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
-    report( "coeff_level_run :" );
-
-    return ret;
-}
-
-static int check_intra( int cpu_ref, int cpu_new )
-{
-    int ret = 0, ok = 1, used_asm = 0;
-    ALIGNED_ARRAY_32( pixel, edge,[36] );
-    ALIGNED_ARRAY_32( pixel, edge2,[36] );
-    ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] );
-    struct
-    {
-        x264_predict_t      predict_16x16[4+3];
-        x264_predict_t      predict_8x8c[4+3];
-        x264_predict_t      predict_8x16c[4+3];
-        x264_predict8x8_t   predict_8x8[9+3];
-        x264_predict_t      predict_4x4[9+3];
-        x264_predict_8x8_filter_t predict_8x8_filter;
-    } ip_c, ip_ref, ip_a;
-
-    x264_predict_16x16_init( 0, ip_c.predict_16x16 );
-    x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
-    x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
-    x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
-    x264_predict_4x4_init( 0, ip_c.predict_4x4 );
-
-    x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
-    x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
-    x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
-    x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
-    x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
-
-    x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
-    x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
-    x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
-    x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
-    x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
-
-    memcpy( fdec, pbuf1, 32*20 * sizeof(pixel) );\
-
-    ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-
-#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
-    if( ip_a.name[dir] != ip_ref.name[dir] )\
-    {\
-        set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
-        used_asm = 1;\
-        memcpy( pbuf3, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
-        memcpy( pbuf4, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
-        for( int a = 0; a < (do_bench ? 64/sizeof(pixel) : 1); a += align )\
-        {\
-            call_c##bench( ip_c.name[dir], pbuf3+48+a, ##__VA_ARGS__ );\
-            call_a##bench( ip_a.name[dir], pbuf4+48+a, ##__VA_ARGS__ );\
-            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * sizeof(pixel) ) )\
-            {\
-                fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
-                ok = 0;\
-                if( ip_c.name == (void *)ip_c.predict_8x8 )\
-                {\
-                    for( int k = -1; k < 16; k++ )\
-                        printf( "%2x ", edge[16+k] );\
-                    printf( "\n" );\
-                }\
-                for( int j = 0; j < h; j++ )\
-                {\
-                    if( ip_c.name == (void *)ip_c.predict_8x8 )\
-                        printf( "%2x ", edge[14-j] );\
-                    for( int k = 0; k < w; k++ )\
-                        printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\
-                    printf( "\n" );\
-                }\
-                printf( "\n" );\
-                for( int j = 0; j < h; j++ )\
-                {\
-                    if( ip_c.name == (void *)ip_c.predict_8x8 )\
-                        printf( "   " );\
-                    for( int k = 0; k < w; k++ )\
-                        printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\
-                    printf( "\n" );\
-                }\
-                break;\
-            }\
-        }\
-    }
-
-    for( int i = 0; i < 12; i++ )
-        INTRA_TEST(   predict_4x4, i,  4,  4,  4, );
-    for( int i = 0; i < 7; i++ )
-        INTRA_TEST(  predict_8x8c, i,  8,  8, 16, );
-    for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_8x16c, i,  8, 16, 16, );
-    for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_16x16, i, 16, 16, 16, );
-    for( int i = 0; i < 12; i++ )
-        INTRA_TEST(   predict_8x8, i,  8,  8,  8, , edge );
-
-    set_func_name("intra_predict_8x8_filter");
-    if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
-    {
-        used_asm = 1;
-        for( int i = 0; i < 32; i++ )
-        {
-            if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) )
-                continue;
-            int neighbor = (i&24)>>1;
-            memset( edge,  0, 36*sizeof(pixel) );
-            memset( edge2, 0, 36*sizeof(pixel) );
-            call_c( ip_c.predict_8x8_filter, pbuf1+48, edge,  neighbor, i&7 );
-            call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 );
-            if( !(neighbor&MB_TOPLEFT) )
-                edge[15] = edge2[15] = 0;
-            if( memcmp( edge+7, edge2+7, (i&MB_TOPRIGHT ? 26 : i&MB_TOP ? 17 : 8) * sizeof(pixel) ) )
-            {
-                fprintf( stderr, "predict_8x8_filter :  [FAILED] %d %d\n", (i&24)>>1, i&7);
-                ok = 0;
-            }
-        }
-    }
-
-#define EXTREMAL_PLANE( w, h ) \
-    { \
-        int max[7]; \
-        for( int j = 0; j < 7; j++ ) \
-            max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
-        fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
-        for( int j = 0; j < w/2; j++ ) \
-            fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
-        for( int j = w/2; j < w-1; j++ ) \
-            fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
-        fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
-        for( int j = 0; j < h/2; j++ ) \
-            fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
-        for( int j = h/2; j < h-1; j++ ) \
-            fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
-        fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
-    }
-    /* Extremal test case for planar prediction. */
-    for( int test = 0; test < 100 && ok; test++ )
-        for( int i = 0; i < 128 && ok; i++ )
-        {
-            EXTREMAL_PLANE(  8,  8 );
-            INTRA_TEST(  predict_8x8c, I_PRED_CHROMA_P,  8,  8, 64, 1 );
-            EXTREMAL_PLANE(  8, 16 );
-            INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P,  8, 16, 64, 1 );
-            EXTREMAL_PLANE( 16, 16 );
-            INTRA_TEST( predict_16x16,  I_PRED_16x16_P, 16, 16, 64, 1 );
-        }
-    report( "intra pred :" );
-    return ret;
-}
-
-#define DECL_CABAC(cpu) \
-static void run_cabac_decision_##cpu( x264_t *h, uint8_t *dst )\
-{\
-    x264_cabac_t cb;\
-    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
-    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
-    for( int i = 0; i < 0x1000; i++ )\
-        x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
-}\
-static void run_cabac_bypass_##cpu( x264_t *h, uint8_t *dst )\
-{\
-    x264_cabac_t cb;\
-    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
-    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
-    for( int i = 0; i < 0x1000; i++ )\
-        x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
-}\
-static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
-{\
-    x264_cabac_t cb;\
-    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
-    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
-    for( int i = 0; i < 0x1000; i++ )\
-        x264_cabac_encode_terminal_##cpu( &cb );\
-}
-DECL_CABAC(c)
-#if HAVE_MMX
-DECL_CABAC(asm)
-#elif defined(ARCH_AARCH64)
-DECL_CABAC(asm)
-#else
-#define run_cabac_decision_asm run_cabac_decision_c
-#define run_cabac_bypass_asm run_cabac_bypass_c
-#define run_cabac_terminal_asm run_cabac_terminal_c
-#endif
-
-extern const uint8_t x264_count_cat_m1[14];
-void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
-void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
-void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
-
-static int check_cabac( int cpu_ref, int cpu_new )
-{
-    int ret = 0, ok = 1, used_asm = 0;
-    x264_t h;
-    h.sps->i_chroma_format_idc = 3;
-
-    x264_bitstream_function_t bs_ref;
-    x264_bitstream_function_t bs_a;
-    x264_bitstream_init( cpu_ref, &bs_ref );
-    x264_bitstream_init( cpu_new, &bs_a );
-    x264_quant_init( &h, cpu_new, &h.quantf );
-    h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
-
-#define CABAC_RESIDUAL(name, start, end, rd)\
-{\
-    if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
-    {\
-        used_asm = 1;\
-        set_func_name( #name );\
-        for( int i = 0; i < 2; i++ )\
-        {\
-            for( intptr_t ctx_block_cat = start; ctx_block_cat <= end; ctx_block_cat++ )\
-            {\
-                for( int j = 0; j < 256; j++ )\
-                {\
-                    ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
-                    uint8_t bitstream[2][1<<16];\
-                    static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
-                    int ac = ctx_ac[ctx_block_cat];\
-                    int nz = 0;\
-                    while( !nz )\
-                    {\
-                        for( int k = 0; k <= x264_count_cat_m1[ctx_block_cat]; k++ )\
-                        {\
-                            /* Very rough distribution that covers possible inputs */\
-                            int rnd = rand();\
-                            int coef = !(rnd&3);\
-                            coef += !(rnd&  15) * (rand()&0x0006);\
-                            coef += !(rnd&  63) * (rand()&0x0008);\
-                            coef += !(rnd& 255) * (rand()&0x00F0);\
-                            coef += !(rnd&1023) * (rand()&0x7F00);\
-                            nz |= dct[0][ac+k] = dct[1][ac+k] = coef * ((rand()&1) ? 1 : -1);\
-                        }\
-                    }\
-                    h.mb.b_interlaced = i;\
-                    x264_cabac_t cb[2];\
-                    x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
-                    x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
-                    x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
-                    x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
-                    cb[0].f8_bits_encoded = 0;\
-                    cb[1].f8_bits_encoded = 0;\
-                    if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
-                    call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
-                    call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
-                    ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
-                    if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
-                    if( !ok )\
-                    {\
-                        fprintf( stderr, #name " :  [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\
-                        if( rd && cb[0].f8_bits_encoded != cb[1].f8_bits_encoded )\
-                            fprintf( stderr, " (%d != %d)", cb[0].f8_bits_encoded, cb[1].f8_bits_encoded );\
-                        fprintf( stderr, "\n");\
-                        goto name##fail;\
-                    }\
-                    if( (j&15) == 0 )\
-                    {\
-                        call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
-                        call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
-                    }\
-                }\
-            }\
-        }\
-    }\
-}\
-name##fail:
-
-    CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 )
-    report( "cabac residual:" );
-
-    ok = 1; used_asm = 0;
-    CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 )
-    CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 )
-    report( "cabac residual rd:" );
-
-    if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
-        return ret;
-    ok = 1; used_asm = 0;
-    x264_cabac_init( &h );
-
-    set_func_name( "cabac_encode_decision" );
-    memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_decision_c, &h, buf3 );
-    call_a( run_cabac_decision_asm, &h, buf4 );
-    ok = !memcmp( buf3, buf4, 0x1000 );
-    report( "cabac decision:" );
-
-    set_func_name( "cabac_encode_bypass" );
-    memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_bypass_c, &h, buf3 );
-    call_a( run_cabac_bypass_asm, &h, buf4 );
-    ok = !memcmp( buf3, buf4, 0x1000 );
-    report( "cabac bypass:" );
-
-    set_func_name( "cabac_encode_terminal" );
-    memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_terminal_c, &h, buf3 );
-    call_a( run_cabac_terminal_asm, &h, buf4 );
-    ok = !memcmp( buf3, buf4, 0x1000 );
-    report( "cabac terminal:" );
-
-    return ret;
-}
-
-static int check_bitstream( int cpu_ref, int cpu_new )
-{
-    x264_bitstream_function_t bs_c;
-    x264_bitstream_function_t bs_ref;
-    x264_bitstream_function_t bs_a;
-
-    int ret = 0, ok = 1, used_asm = 0;
-
-    x264_bitstream_init( 0, &bs_c );
-    x264_bitstream_init( cpu_ref, &bs_ref );
-    x264_bitstream_init( cpu_new, &bs_a );
-    if( bs_a.nal_escape != bs_ref.nal_escape )
-    {
-        int size = 0x4000;
-        uint8_t *input = malloc(size+100);
-        uint8_t *output1 = malloc(size*2);
-        uint8_t *output2 = malloc(size*2);
-        used_asm = 1;
-        set_func_name( "nal_escape" );
-        for( int i = 0; i < 100; i++ )
-        {
-            /* Test corner-case sizes */
-            int test_size = i < 10 ? i+1 : rand() & 0x3fff;
-            /* Test 8 different probability distributions of zeros */
-            for( int j = 0; j < test_size+32; j++ )
-                input[j] = (rand()&((1 << ((i&7)+1)) - 1)) * rand();
-            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
-            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
-            int size_c = end_c-output1;
-            int size_a = end_a-output2;
-            if( size_c != size_a || memcmp( output1, output2, size_c ) )
-            {
-                fprintf( stderr, "nal_escape :  [FAILED] %d %d\n", size_c, size_a );
-                ok = 0;
-                break;
-            }
-        }
-        for( int j = 0; j < size+32; j++ )
-            input[j] = rand();
-        call_c2( bs_c.nal_escape, output1, input, input+size );
-        call_a2( bs_a.nal_escape, output2, input, input+size );
-        free(input);
-        free(output1);
-        free(output2);
-    }
-    report( "nal escape:" );
-
-    return ret;
-}
-
-static int check_all_funcs( int cpu_ref, int cpu_new )
-{
-    return check_pixel( cpu_ref, cpu_new )
-         + check_dct( cpu_ref, cpu_new )
-         + check_mc( cpu_ref, cpu_new )
-         + check_intra( cpu_ref, cpu_new )
-         + check_deblock( cpu_ref, cpu_new )
-         + check_quant( cpu_ref, cpu_new )
-         + check_cabac( cpu_ref, cpu_new )
-         + check_bitstream( cpu_ref, cpu_new );
-}
-
-static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
-{
-    *cpu_ref = *cpu_new;
-    *cpu_new |= flags;
-#if STACK_ALIGNMENT < 16
-    *cpu_new |= X264_CPU_STACK_MOD4;
-#endif
-    if( *cpu_new & X264_CPU_SSE2_IS_FAST )
-        *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
-    if( !quiet )
-        fprintf( stderr, "x264: %s\n", name );
-    return check_all_funcs( *cpu_ref, *cpu_new );
-}
-
-static int check_all_flags( void )
-{
-    int ret = 0;
-    int cpu0 = 0, cpu1 = 0;
-    uint32_t cpu_detect = x264_cpu_detect();
-#if HAVE_MMX
-    if( cpu_detect & X264_CPU_MMX2 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-#if ARCH_X86
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
-        cpu1 &= ~X264_CPU_CACHELINE_32;
-#endif
-        if( cpu_detect & X264_CPU_LZCNT )
-        {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" );
-            cpu1 &= ~X264_CPU_LZCNT;
-        }
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
-        cpu1 &= ~X264_CPU_SLOW_CTZ;
-    }
-    if( cpu_detect & X264_CPU_SSE )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
-    if( cpu_detect & X264_CPU_SSE2 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
-        cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
-        cpu1 &= ~X264_CPU_SLOW_CTZ;
-        if( cpu_detect & X264_CPU_LZCNT )
-        {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" );
-            cpu1 &= ~X264_CPU_LZCNT;
-        }
-    }
-    if( cpu_detect & X264_CPU_SSE3 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-    }
-    if( cpu_detect & X264_CPU_SSSE3 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
-        cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
-        cpu1 &= ~X264_CPU_SLOW_CTZ;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-        cpu1 &= ~X264_CPU_SLOW_ATOM;
-        if( cpu_detect & X264_CPU_LZCNT )
-        {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" );
-            cpu1 &= ~X264_CPU_LZCNT;
-        }
-    }
-    if( cpu_detect & X264_CPU_SSE4 )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
-    if( cpu_detect & X264_CPU_SSE42 )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" );
-    if( cpu_detect & X264_CPU_AVX )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
-    if( cpu_detect & X264_CPU_XOP )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
-    if( cpu_detect & X264_CPU_FMA4 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
-        cpu1 &= ~X264_CPU_FMA4;
-    }
-    if( cpu_detect & X264_CPU_FMA3 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
-        cpu1 &= ~X264_CPU_FMA3;
-    }
-    if( cpu_detect & X264_CPU_AVX2 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" );
-        if( cpu_detect & X264_CPU_LZCNT )
-        {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" );
-            cpu1 &= ~X264_CPU_LZCNT;
-        }
-    }
-    if( cpu_detect & X264_CPU_BMI1 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
-        cpu1 &= ~X264_CPU_BMI1;
-    }
-    if( cpu_detect & X264_CPU_BMI2 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
-        cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
-    }
-#elif ARCH_PPC
-    if( cpu_detect & X264_CPU_ALTIVEC )
-    {
-        fprintf( stderr, "x264: ALTIVEC against C\n" );
-        ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
-    }
-#elif ARCH_ARM
-    if( cpu_detect & X264_CPU_NEON )
-        x264_checkasm_call = x264_checkasm_call_neon;
-    if( cpu_detect & X264_CPU_ARMV6 )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
-    if( cpu_detect & X264_CPU_NEON )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
-    if( cpu_detect & X264_CPU_FAST_NEON_MRC )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
-#elif ARCH_AARCH64
-    if( cpu_detect & X264_CPU_ARMV8 )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" );
-    if( cpu_detect & X264_CPU_NEON )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
-#elif ARCH_MIPS
-    if( cpu_detect & X264_CPU_MSA )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MSA, "MSA" );
-#endif
-    return ret;
-}
-
-int main(int argc, char *argv[])
-{
-    int ret = 0;
-
-#ifdef _WIN32
-    /* Disable the Windows Error Reporting dialog */
-    SetErrorMode( SEM_NOGPFAULTERRORBOX );
-#endif
-
-    if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
-    {
-#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
-        fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
-        return 1;
-#endif
-        do_bench = 1;
-        if( argv[1][7] == '=' )
-        {
-            bench_pattern = argv[1]+8;
-            bench_pattern_len = strlen(bench_pattern);
-        }
-        argc--;
-        argv++;
-    }
-
-    int seed = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
-    fprintf( stderr, "x264: using random seed %u\n", seed );
-    srand( seed );
-
-    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
-    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
-    if( !buf1 || !pbuf1 )
-    {
-        fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
-        return -1;
-    }
-#define INIT_POINTER_OFFSETS\
-    buf2 = buf1 + 0xf00;\
-    buf3 = buf2 + 0xf00;\
-    buf4 = buf3 + 0x1000*sizeof(pixel);\
-    pbuf2 = pbuf1 + 0xf00;\
-    pbuf3 = (pixel*)buf3;\
-    pbuf4 = (pixel*)buf4;
-    INIT_POINTER_OFFSETS;
-    for( int i = 0; i < 0x1e00; i++ )
-    {
-        buf1[i] = rand() & 0xFF;
-        pbuf1[i] = rand() & PIXEL_MAX;
-    }
-    memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
-
-    /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
-    if( do_bench )
-        for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
-        {
-            INIT_POINTER_OFFSETS;
-            ret |= x264_stack_pagealign( check_all_flags, i*32 );
-            buf1 += 32;
-            pbuf1 += 32;
-            quiet = 1;
-            fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
-        }
-    else
-        ret = x264_stack_pagealign( check_all_flags, 0 );
-
-    if( ret )
-    {
-        fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
-        return -1;
-    }
-    fprintf( stderr, "x264: All tests passed Yeah :)\n" );
-    if( do_bench )
-        print_bench();
-    return 0;
-}
-
diff --git a/android/src/main/libenc/jni/libx264/tools/cltostr.sh b/android/src/main/libenc/jni/libx264/tools/cltostr.sh
deleted file mode 100755
index 23b6cc7..0000000
--- a/android/src/main/libenc/jni/libx264/tools/cltostr.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/sh
-
-# Convert standard input to a C char array, write to a file, then create an
-# MD5 sum of that file and append said MD5 sum as char array to the file.
-
-[ -n "$1" ] || exit 1
-
-# Filter out whitespace, empty lines, and comments.
-sanitize() {
-    sed 's/^[[:space:]]*//; /^$/d; /^\/\//d'
-}
-
-# Convert stdin to a \0-terminated char array.
-dump() {
-    echo "static const char $1[] = {"
-    od -v -A n -t x1 | sed 's/[[:space:]]*\([[:alnum:]]\{2\}\)/0x\1, /g'
-    echo '0x00 };'
-}
-
-# Print MD5 hash w/o newline character to not embed the character in the array.
-hash() {
-    # md5sum is not standard, so try different platform-specific alternatives.
-    { md5sum "$1" || md5 -q "$1" || digest -a md5 "$1"; } 2>/dev/null |
-        cut -b -32 | tr -d '\n\r'
-}
-
-trap 'rm -f "$1.temp"' EXIT
-
-sanitize | tee "$1.temp" |
-    dump 'x264_opencl_source' > "$1"
-
-hash "$1.temp" |
-    dump 'x264_opencl_source_hash' >> "$1"
diff --git a/android/src/main/libenc/jni/libx264/tools/countquant_x264.pl b/android/src/main/libenc/jni/libx264/tools/countquant_x264.pl
deleted file mode 100755
index e08f2ef..0000000
--- a/android/src/main/libenc/jni/libx264/tools/countquant_x264.pl
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/env perl
-# countquant_x264.pl: displays statistics from x264 multipass logfiles
-# by Loren Merritt, 2005-4-5
-
-@size{I,P,B} =
-@n{I,P,B} = (0)x3;
-
-sub proc_file {
-    my $fh = shift;
-    while(<$fh>) {
-        /type:(.) q:(\d+\.\d+) tex:(\d+) mv:(\d+) misc:(\d+)/ or next;
-	$type = uc $1;
-	$n{$type} ++;
-	$q[int($2+.5)] ++;
-	$avgq += $2;
-	$avgq{$type} += $2;
-        my $bytes = ($3+$4+$5)/8;
-	$size{$type} += $bytes;
-    }
-    $size = $size{I} + $size{P} + $size{B};
-    $n = $n{I} + $n{P} + $n{B};
-    $n or die "unrecognized input\n";
-}
-
-if(@ARGV) {
-    foreach(@ARGV) {
-        open $fh, "<", $_ or die "can't open '$_': $!";
-	proc_file($fh);
-    }
-} else {
-    proc_file(STDIN);
-}
-
-for(0..51) {
-    $q[$_] or next;
-    printf "q%2d: %6d  %4.1f%%\n", $_, $q[$_], 100*$q[$_]/$n;
-}
-print "\n";
-$digits = int(log($n+1)/log(10))+2;
-printf "All: %${digits}d        %s  avgQP:%5.2f  avgBytes:%5d\n",
-    $n, $n==$n{I}?" ":"", $avgq/$n, $size/$n;
-foreach(qw(I P B S)) {
-    $n{$_} or next;
-    printf "%s:   %${digits}d (%4.1f%%)  avgQP:%5.2f  avgBytes:%5d\n",
-        $_, $n{$_}, 100*$n{$_}/$n, $avgq{$_}/$n{$_}, $size{$_}/$n{$_};
-}
-print "\n";
-printf "total size: $size B = %.2f KiB = %.2f MiB\n",
-    $size/2**10, $size/2**20;
-print "bitrate: ", join("\n       = ",
-    map sprintf("%.2f kbps @ %s fps", $_*$size*8/1000/$n, $_),
-    23.976, 25, 29.97), "\n";
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/__init__.py b/android/src/main/libenc/jni/libx264/tools/digress/__init__.py
deleted file mode 100755
index 8623d48..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-Automated regression/unit testing suite.
-"""
-
-__version__ = '0.2'
-
-def digress(fixture):
-    """
-    Command-line helper for Digress.
-    """
-    from digress.cli import Dispatcher
-    Dispatcher(fixture).dispatch()
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/cli.py b/android/src/main/libenc/jni/libx264/tools/digress/cli.py
deleted file mode 100755
index 44158a4..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/cli.py
+++ /dev/null
@@ -1,149 +0,0 @@
-"""
-Digress's CLI interface.
-"""
-
-import inspect
-import sys
-from optparse import OptionParser
-
-import textwrap
-
-from types import MethodType
-
-from digress import __version__ as version
-
-def dispatchable(func):
-    """
-    Mark a method as dispatchable.
-    """
-    func.digress_dispatchable = True
-    return func
-
-class Dispatcher(object):
-    """
-    Dispatcher for CLI commands.
-    """
-    def __init__(self, fixture):
-        self.fixture = fixture
-        fixture.dispatcher = self
-
-    def _monkey_print_help(self, optparse, *args, **kwargs):
-        # monkey patches OptionParser._print_help
-        OptionParser.print_help(optparse, *args, **kwargs)
-
-        print >>sys.stderr, "\nAvailable commands:"
-
-        maxlen = max([ len(command_name) for command_name in self.commands ])
-
-        descwidth = 80 - maxlen - 4
-
-        for command_name, command_meth in self.commands.iteritems():
-            print >>sys.stderr, "  %s %s\n" % (
-                command_name.ljust(maxlen + 1),
-                ("\n" + (maxlen + 4) * " ").join(
-                    textwrap.wrap(" ".join(filter(
-                            None,
-                            command_meth.__doc__.strip().replace("\n", " ").split(" ")
-                        )),
-                        descwidth
-                    )
-                )
-            )
-
-    def _enable_flush(self):
-        self.fixture.flush_before = True
-
-    def _populate_parser(self):
-        self.commands = self._get_commands()
-
-        self.optparse = OptionParser(
-            usage = "usage: %prog [options] command [args]",
-            description = "Digress CLI frontend for %s." % self.fixture.__class__.__name__,
-            version = "Digress %s" % version
-        )
-
-        self.optparse.print_help = MethodType(self._monkey_print_help, self.optparse, OptionParser)
-
-        self.optparse.add_option(
-            "-f",
-            "--flush",
-            action="callback",
-            callback=lambda option, opt, value, parser: self._enable_flush(),
-            help="flush existing data for a revision before testing"
-        )
-
-        self.optparse.add_option(
-            "-c",
-            "--cases",
-            metavar="FOO,BAR",
-            action="callback",
-            dest="cases",
-            type=str,
-            callback=lambda option, opt, value, parser: self._select_cases(*value.split(",")),
-            help="test cases to run, run with command list to see full list"
-        )
-
-    def _select_cases(self, *cases):
-        self.fixture.cases = filter(lambda case: case.__name__ in cases, self.fixture.cases)
-
-    def _get_commands(self):
-        commands = {}
-
-        for name, member in inspect.getmembers(self.fixture):
-            if hasattr(member, "digress_dispatchable"):
-                commands[name] = member
-
-        return commands
-
-    def _run_command(self, name, *args):
-        if name not in self.commands:
-            print >>sys.stderr, "error: %s is not a valid command\n" % name
-            self.optparse.print_help()
-            return
-
-        command = self.commands[name]
-
-        argspec = inspect.getargspec(command)
-
-        max_arg_len = len(argspec.args) - 1
-        min_arg_len = max_arg_len - ((argspec.defaults is not None) and len(argspec.defaults) or 0)
-
-        if len(args) < min_arg_len:
-            print >>sys.stderr, "error: %s takes at least %d arguments\n" % (
-                name,
-                min_arg_len
-            )
-            print >>sys.stderr, "%s\n" % command.__doc__
-            self.optparse.print_help()
-            return
-
-        if len(args) > max_arg_len:
-            print >>sys.stderr, "error: %s takes at most %d arguments\n" % (
-                name,
-                max_arg_len
-            )
-            print >>sys.stderr, "%s\n" % command.__doc__
-            self.optparse.print_help()
-            return
-
-        command(*args)
-
-    def pre_dispatch(self):
-        pass
-
-    def dispatch(self):
-        self._populate_parser()
-
-        self.optparse.parse_args()
-        self.pre_dispatch()
-        args = self.optparse.parse_args()[1] # arguments may require reparsing after pre_dispatch; see test_x264.py
-
-        if len(args) == 0:
-            print >>sys.stderr, "error: no comamnd specified\n"
-            self.optparse.print_help()
-            return
-
-        command = args[0]
-        addenda = args[1:]
-
-        self._run_command(command, *addenda)
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/comparers.py b/android/src/main/libenc/jni/libx264/tools/digress/comparers.py
deleted file mode 100755
index 215ffee..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/comparers.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""
-Digress comparers.
-"""
-
-from digress.errors import ComparisonError
-
-import os
-from itertools import imap, izip
-
-def compare_direct(value_a, value_b):
-    if value_a != value_b:
-        raise ComparisonError("%s is not %s" % (value_a, value_b))
-
-def compare_pass(value_a, value_b):
-    """
-    Always true, as long as the test is passed.
-    """
-
-def compare_tolerance(tolerance):
-    def _compare_tolerance(value_a, value_b):
-        if abs(value_a - value_b) > tolerance:
-            raise ComparisonError("%s is not %s (tolerance: %s)" % (
-                value_a,
-                value_b,
-                tolerance
-            ))
-    return _compare_tolerance
-
-def compare_files(file_a, file_b):
-    size_a = os.path.getsize(file_a)
-    size_b = os.path.getsize(file_b)
-
-    print file_a, file_b
-
-    if size_a != size_b:
-        raise ComparisonError("%s is not the same size as %s" % (
-            file_a,
-            file_b
-        ))
-
-    BUFFER_SIZE = 8196
-
-    offset = 0
-
-    with open(file_a) as f_a:
-        with open(file_b) as f_b:
-            for chunk_a, chunk_b in izip(
-                imap(
-                    lambda i: f_a.read(BUFFER_SIZE),
-                    xrange(size_a // BUFFER_SIZE + 1)
-                ),
-                imap(
-                    lambda i: f_b.read(BUFFER_SIZE),
-                    xrange(size_b // BUFFER_SIZE + 1)
-                )
-            ):
-                chunk_size = len(chunk_a)
-
-                if chunk_a != chunk_b:
-                    for i in xrange(chunk_size):
-                        if chunk_a[i] != chunk_b[i]:
-                            raise ComparisonError("%s differs from %s at offset %d" % (
-                                file_a,
-                                file_b,
-                                offset + i
-                            ))
-
-                offset += chunk_size
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/constants.py b/android/src/main/libenc/jni/libx264/tools/digress/constants.py
deleted file mode 100755
index 1a18bab..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/constants.py
+++ /dev/null
@@ -1,14 +0,0 @@
-"""
-All of Digress's constants.
-"""
-
-TEST_PASS = 0
-TEST_FAIL = 1
-TEST_DISABLED = 2
-TEST_SKIPPED = 3
-
-CASE_PASS = 0
-CASE_FAIL = 1
-
-FIXTURE_PASS = 0
-FIXTURE_FAIL = 1
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/errors.py b/android/src/main/libenc/jni/libx264/tools/digress/errors.py
deleted file mode 100755
index 8862829..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/errors.py
+++ /dev/null
@@ -1,63 +0,0 @@
-"""
-Digress errors.
-"""
-
-class DigressError(Exception):
-    """
-    Digress error base class.
-    """
-
-class NoSuchTestError(DigressError):
-    """
-    Raised when no such test exists.
-    """
-
-class DisabledTestError(DigressError):
-    """
-    Test is disabled.
-    """
-
-class SkippedTestError(DigressError):
-    """
-    Test is marked as skipped.
-    """
-
-class DisabledCaseError(DigressError):
-    """
-    Case is marked as disabled.
-    """
-
-class SkippedCaseError(DigressError):
-    """
-    Case is marked as skipped.
-    """
-
-class FailedTestError(DigressError):
-    """
-    Test failed.
-    """
-
-class ComparisonError(DigressError):
-    """
-    Comparison failed.
-    """
-
-class IncomparableError(DigressError):
-    """
-    Values cannot be compared.
-    """
-
-class AlreadyRunError(DigressError):
-    """
-    Test/case has already been run.
-    """
-
-class SCMError(DigressError):
-    """
-    Error occurred in SCM.
-    """
-    def __init__(self, message):
-        self.message = message.replace("\n", " ")
-
-    def __str__(self):
-        return self.message
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/scm/__init__.py b/android/src/main/libenc/jni/libx264/tools/digress/scm/__init__.py
deleted file mode 100755
index 95ba161..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/scm/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""
-Source control backends for Digress.
-"""
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/scm/dummy.py b/android/src/main/libenc/jni/libx264/tools/digress/scm/dummy.py
deleted file mode 100755
index bf88a04..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/scm/dummy.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""
-Dummy SCM backend for Digress.
-"""
-
-from random import random
-
-def checkout(revision):
-    """
-    Checkout a revision.
-    """
-    pass
-
-def current_rev():
-    """
-    Get the current revision
-    """
-    return str(random())
-
-def revisions(rev_a, rev_b):
-    """
-    Get a list of revisions from one to another.
-    """
-    pass
-
-def stash():
-    """
-    Stash the repository.
-    """
-    pass
-
-def unstash():
-    """
-    Unstash the repository.
-    """
-    pass
-
-def bisect(command, revision):
-    """
-    Perform a bisection.
-    """
-    raise NotImplementedError("dummy SCM backend does not support bisection")
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/scm/git.py b/android/src/main/libenc/jni/libx264/tools/digress/scm/git.py
deleted file mode 100755
index 3e1ed4a..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/scm/git.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""
-Git SCM backend for Digress.
-"""
-
-from subprocess import Popen, PIPE, STDOUT
-import re
-
-from digress.errors import SCMError
-
-GIT_BRANCH_EXPR = re.compile("[*] (.*)")
-
-def checkout(revision):
-    """
-    Checkout a revision from git.
-    """
-    proc = Popen([
-        "git",
-        "checkout",
-        "-f",
-        revision
-    ], stdout=PIPE, stderr=STDOUT)
-
-    output = proc.communicate()[0].strip()
-    if proc.returncode != 0:
-        raise SCMError("checkout error: %s" % output)
-
-def rev_parse(ref):
-    proc = Popen([
-        "git",
-        "rev-parse",
-        ref
-    ], stdout=PIPE, stderr=STDOUT)
-
-    output = proc.communicate()[0].strip()
-    if proc.returncode != 0:
-        raise SCMError("rev-parse error: %s" % output)
-    return output
-
-def current_rev():
-    """
-    Get the current revision.
-    """
-    return rev_parse("HEAD")
-
-def current_branch():
-    """
-    Get the current branch.
-    """
-    proc = Popen([
-        "git",
-        "branch",
-        "--no-color"
-    ], stdout=PIPE, stderr=STDOUT)
-
-    output = proc.communicate()[0].strip()
-    if proc.returncode != 0:
-        raise SCMError("branch error: %s" % output)
-    branch_name = GIT_BRANCH_EXPR.findall(output)[0]
-    return branch_name != "(no branch)" and branch_name or None
-
-def revisions(rev_a, rev_b):
-    """
-    Get a list of revisions from one to another.
-    """
-    proc = Popen([
-        "git",
-        "log",
-        "--format=%H", ("%s...%s" % (rev_a, rev_b))
-    ], stdout=PIPE, stderr=STDOUT)
-
-    output = proc.communicate()[0].strip()
-    if proc.returncode != 0:
-        raise SCMError("log error: %s" % output)
-    return output.split("\n")
-
-def stash():
-    """
-    Stash the repository.
-    """
-    proc = Popen([
-        "git",
-        "stash",
-        "save",
-        "--keep-index"
-    ], stdout=PIPE, stderr=STDOUT)
-
-    output = proc.communicate()[0].strip()
-    if proc.returncode != 0:
-        raise SCMError("stash error: %s" % output)
-
-def unstash():
-    """
-    Unstash the repository.
-    """
-    proc = Popen(["git", "stash", "pop"], stdout=PIPE, stderr=STDOUT)
-    proc.communicate()
-
-def bisect(*args):
-    """
-    Perform a bisection.
-    """
-    proc = Popen((["git", "bisect"] + list(args)), stdout=PIPE, stderr=STDOUT)
-    output = proc.communicate()[0]
-    if proc.returncode != 0:
-        raise SCMError("bisect error: %s" % output)
-    return output
-
-def dirty():
-    """
-    Check if the working tree is dirty.
-    """
-    proc = Popen(["git", "status"], stdout=PIPE, stderr=STDOUT)
-    output = proc.communicate()[0].strip()
-    if proc.returncode != 0:
-        raise SCMError("status error: %s" % output)
-    if "modified:" in output:
-        return True
-    else:
-        return False
diff --git a/android/src/main/libenc/jni/libx264/tools/digress/testing.py b/android/src/main/libenc/jni/libx264/tools/digress/testing.py
deleted file mode 100755
index f231270..0000000
--- a/android/src/main/libenc/jni/libx264/tools/digress/testing.py
+++ /dev/null
@@ -1,597 +0,0 @@
-"""
-Digress testing core.
-"""
-
-from digress.errors import SkippedTestError, DisabledTestError, NoSuchTestError, \
-                           FailedTestError, AlreadyRunError, SCMError, \
-                           ComparisonError
-from digress.constants import *
-from digress.cli import dispatchable
-
-import inspect
-import operator
-import os
-import json
-
-import textwrap
-
-from shutil import rmtree
-
-from time import time
-from functools import wraps
-
-from itertools import izip_longest
-
-from hashlib import sha1
-
-class depends(object):
-    """
-    Dependency decorator for a test.
-    """
-    def __init__(self, *test_names):
-        self.test_names = test_names
-
-    def __call__(self, func):
-        func.digress_depends = self.test_names
-        return func
-
-class _skipped(object):
-    """
-    Internal skipped decorator.
-    """
-    def __init__(self, reason=""):
-        self._reason = reason
-
-    def __call__(self, func):
-        @wraps(func)
-        def _closure(*args):
-            raise SkippedTestError(self._reason)
-        return _closure
-
-class disabled(object):
-    """
-    Disable a test, with reason.
-    """
-    def __init__(self, reason=""):
-        self._reason = reason
-
-    def __call__(self, func):
-        @wraps(func)
-        def _closure(*args):
-            raise DisabledTestError(self._reason)
-        return _closure
-
-class comparer(object):
-    """
-    Set the comparer for a test.
-    """
-    def __init__(self, comparer_):
-        self._comparer = comparer_
-
-    def __call__(self, func):
-        func.digress_comparer = self._comparer
-        return func
-
-class Fixture(object):
-    cases = []
-    scm = None
-
-    flush_before = False
-
-    def _skip_case(self, case, depend):
-        for name, meth in inspect.getmembers(case):
-            if name[:5] == "test_":
-                setattr(
-                    case,
-                    name,
-                    _skipped("failed dependency: case %s" % depend)(meth)
-                )
-
-    def _run_case(self, case, results):
-        if case.__name__ in results:
-            raise AlreadyRunError
-
-        for depend in case.depends:
-            if depend.__name__ in results and results[depend.__name__]["status"] != CASE_PASS:
-                self._skip_case(case, depend.__name__)
-
-            try:
-                result = self._run_case(depend, results)
-            except AlreadyRunError:
-                continue
-
-            if result["status"] != CASE_PASS:
-                self._skip_case(case, depend.__name__)
-
-        result = case().run()
-        results[case.__name__] = result
-        return result
-
-    @dispatchable
-    def flush(self, revision=None):
-        """
-        Flush any cached results. Takes a revision for an optional argument.
-        """
-        if not revision:
-            print "Flushing all cached results...",
-
-            try:
-                rmtree(".digress_%s" % self.__class__.__name__)
-            except Exception, e:
-                print "failed: %s" % e
-            else:
-                print "done."
-        else:
-            try:
-                rev = self.scm.rev_parse(revision)
-            except SCMError, e:
-                print e
-            else:
-                print "Flushing cached results for %s..." % rev,
-
-                try:
-                    rmtree(os.path.join(".digress_%s" % self.__class__.__name__, rev))
-                except Exception, e:
-                    print "failed: %s" % e
-                else:
-                    print "done."
-
-    @dispatchable
-    def run(self, revision=None):
-        """
-        Run the fixture for a specified revision.
-
-        Takes a revision for an argument.
-        """
-        oldrev = None
-        oldbranch = None
-        dirty = False
-
-        try:
-            dirty = self.scm.dirty()
-
-            # if the tree is clean, then we don't need to make an exception
-            if not dirty and revision is None: revision = "HEAD"
-
-            if revision:
-                oldrev = self.scm.current_rev()
-                oldbranch = self.scm.current_branch()
-
-                if dirty:
-                    self.scm.stash()
-                self.scm.checkout(revision)
-
-                rev = self.scm.current_rev()
-
-                self.datastore = os.path.join(".digress_%s" % self.__class__.__name__, rev)
-
-                if os.path.isdir(self.datastore):
-                    if self.flush_before:
-                        self.flush(rev)
-                else:
-                    os.makedirs(self.datastore)
-            else:
-                rev = "(dirty working tree)"
-                self.datastore = None
-
-            print "Running fixture %s on revision %s...\n" % (self.__class__.__name__, rev)
-
-            results = {}
-
-            for case in self.cases:
-                try:
-                    self._run_case(case, results)
-                except AlreadyRunError:
-                    continue
-
-            total_time = reduce(operator.add, filter(
-                None,
-                [
-                    result["time"] for result in results.values()
-                ]
-            ), 0)
-
-            overall_status = (
-                CASE_FAIL in [ result["status"] for result in results.values() ]
-            ) and FIXTURE_FAIL or FIXTURE_PASS
-
-            print "Fixture %s in %.4f.\n" % (
-                (overall_status == FIXTURE_PASS) and "passed" or "failed",
-                total_time
-            )
-
-            return { "cases" : results, "time" : total_time, "status" : overall_status, "revision" : rev }
-
-        finally:
-            if oldrev:
-                self.scm.checkout(oldrev)
-                if oldbranch:
-                    self.scm.checkout(oldbranch)
-                if dirty:
-                    self.scm.unstash()
-
-    @dispatchable
-    def bisect(self, good_rev, bad_rev=None):
-        """
-        Perform a bisection between two revisions.
-
-        First argument is the good revision, second is the bad revision, which
-        defaults to the current revision.
-        """
-        if not bad_rev: bad_rev = self.scm.current_rev()
-
-        dirty = False
-
-        # get a set of results for the good revision
-        good_result = self.run(good_rev)
-
-        good_rev = good_result["revision"]
-
-        try:
-            dirty = self.scm.dirty()
-
-            if dirty:
-                self.scm.stash()
-
-            self.scm.bisect("start")
-
-            self.scm.bisect("bad", bad_rev)
-            self.scm.bisect("good", good_rev)
-
-            bisecting = True
-            isbad = False
-
-            while bisecting:
-                results = self.run(self.scm.current_rev())
-                revision = results["revision"]
-
-                # perform comparisons
-                # FIXME: this just uses a lot of self.compare
-                for case_name, case_result in good_result["cases"].iteritems():
-                    case = filter(lambda case: case.__name__ == case_name, self.cases)[0]
-
-                    for test_name, test_result in case_result["tests"].iteritems():
-                        test = filter(
-                            lambda pair: pair[0] == "test_%s" % test_name,
-                            inspect.getmembers(case)
-                        )[0][1]
-
-                        other_result = results["cases"][case_name]["tests"][test_name]
-
-                        if other_result["status"] == TEST_FAIL and case_result["status"] != TEST_FAIL:
-                            print "Revision %s failed %s.%s." % (revision, case_name, test_name)
-                            isbad = True
-                            break
-
-                        elif hasattr(test, "digress_comparer"):
-                            try:
-                                test.digress_comparer(test_result["value"], other_result["value"])
-                            except ComparisonError, e:
-                                print "%s differs: %s" % (test_name, e)
-                                isbad = True
-                                break
-
-                if isbad:
-                    output = self.scm.bisect("bad", revision)
-                    print "Marking revision %s as bad." % revision
-                else:
-                    output = self.scm.bisect("good", revision)
-                    print "Marking revision %s as good." % revision
-
-                if output.split("\n")[0].endswith("is the first bad commit"):
-                    print "\nBisection complete.\n"
-                    print output
-                    bisecting = False
-
-                print ""
-        except SCMError, e:
-            print e
-        finally:
-            self.scm.bisect("reset")
-
-            if dirty:
-                self.scm.unstash()
-
-    @dispatchable
-    def multicompare(self, rev_a=None, rev_b=None, mode="waterfall"):
-        """
-        Generate a comparison of tests.
-
-        Takes three optional arguments, from which revision, to which revision,
-        and the method of display (defaults to vertical "waterfall", also
-        accepts "river" for horizontal display)
-        """
-        if not rev_a: rev_a = self.scm.current_rev()
-        if not rev_b: rev_b = self.scm.current_rev()
-
-        revisions = self.scm.revisions(rev_a, rev_b)
-
-        results = []
-
-        for revision in revisions:
-            results.append(self.run(revision))
-
-        test_names = reduce(operator.add, [
-            [
-                (case_name, test_name)
-                for
-                    test_name, test_result
-                in
-                    case_result["tests"].iteritems()
-            ]
-            for
-                case_name, case_result
-            in
-                results[0]["cases"].iteritems()
-        ], [])
-
-        MAXLEN = 20
-
-        colfmt = "| %s "
-
-        table = []
-
-        if mode not in ("waterfall", "river"):
-            mode = "waterfall"
-
-            print "Unknown multicompare mode specified, defaulting to %s." % mode
-
-        if mode == "waterfall":
-            header = [ "Test" ]
-
-            for result in results:
-                header.append(result["revision"])
-
-            table.append(header)
-
-            for test_name in test_names:
-                row_data = [ ".".join(test_name) ]
-
-                for result in results:
-                    test_result = result["cases"][test_name[0]]["tests"][test_name[1]]
-
-                    if test_result["status"] != TEST_PASS:
-                        value = "did not pass: %s" % (test_result["value"])
-                    else:
-                        value = "%s (%.4f)" % (test_result["value"], test_result["time"])
-
-                    row_data.append(value)
-
-                table.append(row_data)
-
-        elif mode == "river":
-            header = [ "Revision" ]
-
-            for test_name in test_names:
-                header.append(".".join(test_name))
-
-            table.append(header)
-
-            for result in results:
-                row_data = [ result["revision"] ]
-
-                for case_name, case_result in result["cases"].iteritems():
-                    for test_name, test_result in case_result["tests"].iteritems():
-
-                        if test_result["status"] != TEST_PASS:
-                            value = "did not pass: %s" % (test_result["value"])
-                        else:
-                            value = "%s (%.4f)" % (test_result["value"], test_result["time"])
-
-                        row_data.append(value)
-
-                table.append(row_data)
-
-        breaker = "=" * (len(colfmt % "".center(MAXLEN)) * len(table[0]) + 1)
-
-        print breaker
-
-        for row in table:
-            for row_stuff in izip_longest(*[
-                textwrap.wrap(col, MAXLEN, break_on_hyphens=False) for col in row
-            ], fillvalue=""):
-                row_output = ""
-
-                for col in row_stuff:
-                    row_output += colfmt % col.ljust(MAXLEN)
-
-                row_output += "|"
-
-                print row_output
-            print breaker
-
-    @dispatchable
-    def compare(self, rev_a, rev_b=None):
-        """
-        Compare two revisions directly.
-
-        Takes two arguments, second is optional and implies current revision.
-        """
-        results_a = self.run(rev_a)
-        results_b = self.run(rev_b)
-
-        for case_name, case_result in results_a["cases"].iteritems():
-            case = filter(lambda case: case.__name__ == case_name, self.cases)[0]
-
-            header = "Comparison of case %s" % case_name
-            print header
-            print "=" * len(header)
-
-            for test_name, test_result in case_result["tests"].iteritems():
-                test = filter(
-                    lambda pair: pair[0] == "test_%s" % test_name,
-                    inspect.getmembers(case)
-                )[0][1]
-
-                other_result = results_b["cases"][case_name]["tests"][test_name]
-
-                if test_result["status"] != TEST_PASS or other_result["status"] != TEST_PASS:
-                    print "%s cannot be compared as one of the revisions have not passed it." % test_name
-
-                elif hasattr(test, "digress_comparer"):
-                    try:
-                        test.digress_comparer(test_result["value"], other_result["value"])
-                    except ComparisonError, e:
-                        print "%s differs: %s" % (test_name, e)
-                    else:
-                        print "%s does not differ." % test_name
-                else:
-                    print "%s has no comparer and therefore cannot be compared." % test_name
-
-            print ""
-
-    @dispatchable
-    def list(self):
-        """
-        List all available test cases, excluding dependencies.
-        """
-        print "\nAvailable Test Cases"
-        print "===================="
-        for case in self.cases:
-            print case.__name__
-
-    def register_case(self, case):
-        case.fixture = self
-        self.cases.append(case)
-
-class Case(object):
-    depends = []
-    fixture = None
-
-    def _get_test_by_name(self, test_name):
-        if not hasattr(self, "test_%s" % test_name):
-            raise NoSuchTestError(test_name)
-        return getattr(self, "test_%s" % test_name)
-
-    def _run_test(self, test, results):
-        test_name = test.__name__[5:]
-
-        if test_name in results:
-            raise AlreadyRunError
-
-        if hasattr(test, "digress_depends"):
-            for depend in test.digress_depends:
-                if depend in results and results[depend]["status"] != TEST_PASS:
-                    test = _skipped("failed dependency: %s" % depend)(test)
-
-                dependtest = self._get_test_by_name(depend)
-
-                try:
-                    result = self._run_test(dependtest, results)
-                except AlreadyRunError:
-                    continue
-
-                if result["status"] != TEST_PASS:
-                    test = _skipped("failed dependency: %s" % depend)(test)
-
-        start_time = time()
-        run_time = None
-
-        print "Running test %s..." % test_name,
-
-        try:
-            if not self.datastore:
-                # XXX: this smells funny
-                raise IOError
-
-            with open(os.path.join(
-                self.datastore,
-                "%s.json" % sha1(test_name).hexdigest()
-            ), "r") as f:
-                result = json.load(f)
-
-            value = str(result["value"])
-
-            if result["status"] == TEST_DISABLED:
-                status = "disabled"
-            elif result["status"] == TEST_SKIPPED:
-                status = "skipped"
-            elif result["status"] == TEST_FAIL:
-                status = "failed"
-            elif result["status"] == TEST_PASS:
-                status = "passed"
-                value = "%s (in %.4f)" % (
-                    result["value"] or "(no result)",
-                    result["time"]
-                )
-            else:
-                status = "???"
-
-            print "%s (cached): %s" % (status, value)
-        except IOError:
-            try:
-                value = test()
-            except DisabledTestError, e:
-                print "disabled: %s" % e
-                status = TEST_DISABLED
-                value = str(e)
-            except SkippedTestError, e:
-                print "skipped: %s" % e
-                status = TEST_SKIPPED
-                value = str(e)
-            except FailedTestError, e:
-                print "failed: %s" % e
-                status = TEST_FAIL
-                value = str(e)
-            except Exception, e:
-                print "failed with exception: %s" % e
-                status = TEST_FAIL
-                value = str(e)
-            else:
-                run_time = time() - start_time
-                print "passed: %s (in %.4f)" % (
-                    value or "(no result)",
-                    run_time
-                )
-                status = TEST_PASS
-
-            result = { "status" : status, "value" : value, "time" : run_time }
-
-            if self.datastore:
-                with open(os.path.join(
-                    self.datastore,
-                    "%s.json" % sha1(test_name).hexdigest()
-                ), "w") as f:
-                    json.dump(result, f)
-
-        results[test_name] = result
-        return result
-
-    def run(self):
-        print "Running case %s..." % self.__class__.__name__
-
-        if self.fixture.datastore:
-            self.datastore = os.path.join(
-                self.fixture.datastore,
-                sha1(self.__class__.__name__).hexdigest()
-            )
-            if not os.path.isdir(self.datastore):
-                os.makedirs(self.datastore)
-        else:
-            self.datastore = None
-
-        results = {}
-
-        for name, meth in inspect.getmembers(self):
-            if name[:5] == "test_":
-                try:
-                    self._run_test(meth, results)
-                except AlreadyRunError:
-                    continue
-
-        total_time = reduce(operator.add, filter(
-            None, [
-                result["time"] for result in results.values()
-            ]
-        ), 0)
-
-        overall_status = (
-            TEST_FAIL in [ result["status"] for result in results.values() ]
-        ) and CASE_FAIL or CASE_PASS
-
-        print "Case %s in %.4f.\n" % (
-            (overall_status == FIXTURE_PASS) and "passed" or "failed",
-            total_time
-        )
-
-        return { "tests" : results, "time" : total_time, "status" : overall_status }
diff --git a/android/src/main/libenc/jni/libx264/tools/gas-preprocessor.pl b/android/src/main/libenc/jni/libx264/tools/gas-preprocessor.pl
deleted file mode 100755
index cb5e3c5..0000000
--- a/android/src/main/libenc/jni/libx264/tools/gas-preprocessor.pl
+++ /dev/null
@@ -1,1033 +0,0 @@
-#!/usr/bin/env perl
-# by David Conrad
-# This code is licensed under GPLv2 or later; go to gnu.org to read it
-#  (not that it much matters for an asm preprocessor)
-# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
-use strict;
-
-# Apple's gas is ancient and doesn't support modern preprocessing features like
-# .rept and has ugly macro syntax, among other things. Thus, this script
-# implements the subset of the gas preprocessor used by x264 and ffmpeg
-# that isn't supported by Apple's gas.
-
-my %canonical_arch = ("aarch64" => "aarch64", "arm64" => "aarch64",
-                      "arm"     => "arm",
-                      "powerpc" => "powerpc", "ppc"   => "powerpc");
-
-my %comments = ("aarch64" => '//',
-                "arm"     => '@',
-                "powerpc" => '#');
-
-my @gcc_cmd;
-my @preprocess_c_cmd;
-
-my $comm;
-my $arch;
-my $as_type = "apple-gas";
-
-my $fix_unreq = $^O eq "darwin";
-my $force_thumb = 0;
-
-my $arm_cond_codes = "eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al|hs|lo";
-
-my $usage_str = "
-$0\n
-Gas-preprocessor.pl converts assembler files using modern GNU as syntax for
-Apple's ancient gas version or clang's incompatible integrated assembler. The
-conversion is regularly tested for Libav, x264 and vlc. Other projects might
-use different features which are not correctly handled.
-
-Options for this program needs to be separated with ' -- ' from the assembler
-command. Following options are currently supported:
-
-    -help         - this usage text
-    -arch         - target architecture
-    -as-type      - one value out of {{,apple-}{gas,clang},armasm}
-    -fix-unreq
-    -no-fix-unreq
-    -force-thumb  - assemble as thumb regardless of the input source
-                    (note, this is incomplete and only works for sources
-                    it explicitly was tested with)
-";
-
-sub usage() {
-    print $usage_str;
-}
-
-while (@ARGV) {
-    my $opt = shift;
-
-    if ($opt =~ /^-(no-)?fix-unreq$/) {
-        $fix_unreq = $1 ne "no-";
-    } elsif ($opt eq "-force-thumb") {
-        $force_thumb = 1;
-    } elsif ($opt eq "-arch") {
-        $arch = shift;
-        die "unknown arch: '$arch'\n" if not exists $comments{$arch};
-    } elsif ($opt eq "-as-type") {
-        $as_type = shift;
-        die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang)|armasm)$/;
-    } elsif ($opt eq "-help") {
-        usage();
-        exit 0;
-    } elsif ($opt eq "--" ) {
-        @gcc_cmd = @ARGV;
-    } elsif ($opt =~ /^-/) {
-        die "option '$opt' is not known. See '$0 -help' for usage information\n";
-    } else {
-        push @gcc_cmd, $opt, @ARGV;
-    }
-    last if (@gcc_cmd);
-}
-
-if (grep /\.c$/, @gcc_cmd) {
-    # C file (inline asm?) - compile
-    @preprocess_c_cmd = (@gcc_cmd, "-S");
-} elsif (grep /\.[sS]$/, @gcc_cmd) {
-    # asm file, just do C preprocessor
-    @preprocess_c_cmd = (@gcc_cmd, "-E");
-} elsif (grep /-(v|h|-version|dumpversion)/, @gcc_cmd) {
-    # pass -v/--version along, used during probing. Matching '-v' might have
-    # uninteded results but it doesn't matter much if gas-preprocessor or
-    # the compiler fails.
-    exec(@gcc_cmd);
-} else {
-    die "Unrecognized input filetype";
-}
-if ($as_type eq "armasm") {
-
-    $preprocess_c_cmd[0] = "cpp";
-    push(@preprocess_c_cmd, "-U__ELF__");
-    push(@preprocess_c_cmd, "-U__MACH__");
-
-    @preprocess_c_cmd = grep ! /^-nologo$/, @preprocess_c_cmd;
-    # Remove -ignore XX parameter pairs from preprocess_c_cmd
-    my $index = 1;
-    while ($index < $#preprocess_c_cmd) {
-        if ($preprocess_c_cmd[$index] eq "-ignore" and $index + 1 < $#preprocess_c_cmd) {
-            splice(@preprocess_c_cmd, $index, 2);
-            next;
-        }
-        $index++;
-    }
-    if (grep /^-MM$/, @preprocess_c_cmd) {
-        system(@preprocess_c_cmd) == 0 or die "Error running preprocessor";
-        exit 0;
-    }
-}
-
-# if compiling, avoid creating an output file named '-.o'
-if ((grep /^-c$/, @gcc_cmd) && !(grep /^-o/, @gcc_cmd)) {
-    foreach my $i (@gcc_cmd) {
-        if ($i =~ /\.[csS]$/) {
-            my $outputfile = $i;
-            $outputfile =~ s/\.[csS]$/.o/;
-            push(@gcc_cmd, "-o");
-            push(@gcc_cmd, $outputfile);
-            last;
-        }
-    }
-}
-# replace only the '-o' argument with '-', avoids rewriting the make dependency
-# target specified with -MT to '-'
-my $index = 1;
-while ($index < $#preprocess_c_cmd) {
-    if ($preprocess_c_cmd[$index] eq "-o") {
-        $index++;
-        $preprocess_c_cmd[$index] = "-";
-    }
-    $index++;
-}
-
-my $tempfile;
-if ($as_type ne "armasm") {
-    @gcc_cmd = map { /\.[csS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd;
-} else {
-    @preprocess_c_cmd = grep ! /^-c$/, @preprocess_c_cmd;
-    @preprocess_c_cmd = grep ! /^-m/, @preprocess_c_cmd;
-
-    @preprocess_c_cmd = grep ! /^-G/, @preprocess_c_cmd;
-    @preprocess_c_cmd = grep ! /^-W/, @preprocess_c_cmd;
-    @preprocess_c_cmd = grep ! /^-Z/, @preprocess_c_cmd;
-    @preprocess_c_cmd = grep ! /^-fp/, @preprocess_c_cmd;
-    @preprocess_c_cmd = grep ! /^-EHsc$/, @preprocess_c_cmd;
-    @preprocess_c_cmd = grep ! /^-O/, @preprocess_c_cmd;
-
-    @gcc_cmd = grep ! /^-G/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-W/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-Z/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-fp/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-EHsc$/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-O/, @gcc_cmd;
-
-    my @outfiles = grep /\.(o|obj)$/, @gcc_cmd;
-    $tempfile = $outfiles[0].".asm";
-
-    # Remove most parameters from gcc_cmd, which actually is the armasm command,
-    # which doesn't support any of the common compiler/preprocessor options.
-    @gcc_cmd = grep ! /^-D/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-U/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-m/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-M/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-c$/, @gcc_cmd;
-    @gcc_cmd = grep ! /^-I/, @gcc_cmd;
-    @gcc_cmd = map { /\.S$/ ? $tempfile : $_ } @gcc_cmd;
-}
-
-# detect architecture from gcc binary name
-if (!$arch) {
-    if ($gcc_cmd[0] =~ /(arm64|aarch64|arm|powerpc|ppc)/) {
-        $arch = $1;
-    } else {
-        # look for -arch flag
-        foreach my $i (1 .. $#gcc_cmd-1) {
-            if ($gcc_cmd[$i] eq "-arch" and
-                $gcc_cmd[$i+1] =~ /(arm64|aarch64|arm|powerpc|ppc)/) {
-                $arch = $1;
-            }
-        }
-    }
-}
-
-# assume we're not cross-compiling if no -arch or the binary doesn't have the arch name
-$arch = qx/arch/ if (!$arch);
-
-die "Unknown target architecture '$arch'" if not exists $canonical_arch{$arch};
-
-$arch = $canonical_arch{$arch};
-$comm = $comments{$arch};
-my $inputcomm = $comm;
-$comm = ";" if $as_type =~ /armasm/;
-
-my %ppc_spr = (ctr    => 9,
-               vrsave => 256);
-
-open(INPUT, "-|", @preprocess_c_cmd) || die "Error running preprocessor";
-
-if ($ENV{GASPP_DEBUG}) {
-    open(ASMFILE, ">&STDOUT");
-} else {
-    if ($as_type ne "armasm") {
-        open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler";
-    } else {
-        open(ASMFILE, ">", $tempfile);
-    }
-}
-
-my $current_macro = '';
-my $macro_level = 0;
-my $rept_level = 0;
-my %macro_lines;
-my %macro_args;
-my %macro_args_default;
-my $macro_count = 0;
-my $altmacro = 0;
-my $in_irp = 0;
-
-my $num_repts;
-my @rept_lines;
-
-my @irp_args;
-my $irp_param;
-
-my @ifstack;
-
-my %symbols;
-
-my @sections;
-
-my %literal_labels;     # for ldr <reg>, =<expr>
-my $literal_num = 0;
-my $literal_expr = ".word";
-$literal_expr = ".quad" if $arch eq "aarch64";
-
-my $thumb = 0;
-
-my %thumb_labels;
-my %call_targets;
-my %mov32_targets;
-
-my %neon_alias_reg;
-my %neon_alias_type;
-
-my $temp_label_next = 0;
-my %last_temp_labels;
-my %next_temp_labels;
-
-my %labels_seen;
-
-my %aarch64_req_alias;
-
-if ($force_thumb) {
-    parse_line(".thumb\n");
-}
-
-# pass 1: parse .macro
-# note that the handling of arguments is probably overly permissive vs. gas
-# but it should be the same for valid cases
-while (<INPUT>) {
-    # remove lines starting with '#', preprocessing is done, '#' at start of
-    # the line indicates a comment for all supported archs (aarch64, arm, ppc
-    # and x86). Also strips line number comments but since they are off anyway
-    # it is no loss.
-    s/^#.*$//;
-    # remove all comments (to avoid interfering with evaluating directives)
-    s/(?<!\\)$inputcomm.*//x;
-    # Strip out windows linefeeds
-    s/\r$//;
-
-    foreach my $subline (split(";", $_)) {
-        # Add newlines at the end of lines that don't already have one
-        chomp $subline;
-        $subline .= "\n";
-        parse_line($subline);
-    }
-}
-
-sub eval_expr {
-    my $expr = $_[0];
-    while ($expr =~ /([A-Za-z._][A-Za-z0-9._]*)/g) {
-        my $sym = $1;
-        $expr =~ s/$sym/($symbols{$sym})/ if defined $symbols{$sym};
-    }
-    eval $expr;
-}
-
-sub handle_if {
-    my $line = $_[0];
-    # handle .if directives; apple's assembler doesn't support important non-basic ones
-    # evaluating them is also needed to handle recursive macros
-    if ($line =~ /\.if(n?)([a-z]*)\s+(.*)/) {
-        my $result = $1 eq "n";
-        my $type   = $2;
-        my $expr   = $3;
-
-        if ($type eq "b") {
-            $expr =~ s/\s//g;
-            $result ^= $expr eq "";
-        } elsif ($type eq "c") {
-            if ($expr =~ /(.*)\s*,\s*(.*)/) {
-                $result ^= $1 eq $2;
-            } else {
-                die "argument to .ifc not recognized";
-            }
-        } elsif ($type eq "") {
-            $result ^= eval_expr($expr) != 0;
-        } elsif ($type eq "eq") {
-            $result = eval_expr($expr) == 0;
-        } elsif ($type eq "lt") {
-            $result = eval_expr($expr) < 0;
-        } else {
-            chomp($line);
-            die "unhandled .if varient. \"$line\"";
-        }
-        push (@ifstack, $result);
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-sub parse_if_line {
-    my $line = $_[0];
-
-    # evaluate .if blocks
-    if (scalar(@ifstack)) {
-        # Don't evaluate any new if statements if we're within
-        # a repetition or macro - they will be evaluated once
-        # the repetition is unrolled or the macro is expanded.
-        if (scalar(@rept_lines) == 0 and $macro_level == 0) {
-            if ($line =~ /\.endif/) {
-                pop(@ifstack);
-                return 1;
-            } elsif ($line =~ /\.elseif\s+(.*)/) {
-                if ($ifstack[-1] == 0) {
-                    $ifstack[-1] = !!eval_expr($1);
-                } elsif ($ifstack[-1] > 0) {
-                    $ifstack[-1] = -$ifstack[-1];
-                }
-                return 1;
-            } elsif ($line =~ /\.else/) {
-                $ifstack[-1] = !$ifstack[-1];
-                return 1;
-            } elsif (handle_if($line)) {
-                return 1;
-            }
-        }
-
-        # discard lines in false .if blocks
-        foreach my $i (0 .. $#ifstack) {
-            if ($ifstack[$i] <= 0) {
-                return 1;
-            }
-        }
-    }
-    return 0;
-}
-
-sub parse_line {
-    my $line = $_[0];
-
-    return if (parse_if_line($line));
-
-    if (scalar(@rept_lines) == 0) {
-        if (/\.macro/) {
-            $macro_level++;
-            if ($macro_level > 1 && !$current_macro) {
-                die "nested macros but we don't have master macro";
-            }
-        } elsif (/\.endm/) {
-            $macro_level--;
-            if ($macro_level < 0) {
-                die "unmatched .endm";
-            } elsif ($macro_level == 0) {
-                $current_macro = '';
-                return;
-            }
-        }
-    }
-
-    if ($macro_level == 0) {
-        if ($line =~ /\.(rept|irp)/) {
-            $rept_level++;
-        } elsif ($line =~ /.endr/) {
-            $rept_level--;
-        }
-    }
-
-    if ($macro_level > 1) {
-        push(@{$macro_lines{$current_macro}}, $line);
-    } elsif (scalar(@rept_lines) and $rept_level >= 1) {
-        push(@rept_lines, $line);
-    } elsif ($macro_level == 0) {
-        expand_macros($line);
-    } else {
-        if ($line =~ /\.macro\s+([\d\w\.]+)\s*,?\s*(.*)/) {
-            $current_macro = $1;
-
-            # commas in the argument list are optional, so only use whitespace as the separator
-            my $arglist = $2;
-            $arglist =~ s/,/ /g;
-
-            my @args = split(/\s+/, $arglist);
-            foreach my $i (0 .. $#args) {
-                my @argpair = split(/=/, $args[$i]);
-                $macro_args{$current_macro}[$i] = $argpair[0];
-                $argpair[0] =~ s/:vararg$//;
-                $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1];
-            }
-            # ensure %macro_lines has the macro name added as a key
-            $macro_lines{$current_macro} = [];
-
-        } elsif ($current_macro) {
-            push(@{$macro_lines{$current_macro}}, $line);
-        } else {
-            die "macro level without a macro name";
-        }
-    }
-}
-
-sub handle_set {
-    my $line = $_[0];
-    if ($line =~ /\.set\s+(.*),\s*(.*)/) {
-        $symbols{$1} = eval_expr($2);
-        return 1;
-    }
-    return 0;
-}
-
-sub expand_macros {
-    my $line = $_[0];
-
-    # handle .if directives; apple's assembler doesn't support important non-basic ones
-    # evaluating them is also needed to handle recursive macros
-    if (handle_if($line)) {
-        return;
-    }
-
-    if (/\.purgem\s+([\d\w\.]+)/) {
-        delete $macro_lines{$1};
-        delete $macro_args{$1};
-        delete $macro_args_default{$1};
-        return;
-    }
-
-    if ($line =~ /\.altmacro/) {
-        $altmacro = 1;
-        return;
-    }
-
-    if ($line =~ /\.noaltmacro/) {
-        $altmacro = 0;
-        return;
-    }
-
-    $line =~ s/\%([^,]*)/eval_expr($1)/eg if $altmacro;
-
-    # Strip out the .set lines from the armasm output
-    return if (handle_set($line) and $as_type eq "armasm");
-
-    if ($line =~ /\.rept\s+(.*)/) {
-        $num_repts = $1;
-        @rept_lines = ("\n");
-
-        # handle the possibility of repeating another directive on the same line
-        # .endr on the same line is not valid, I don't know if a non-directive is
-        if ($num_repts =~ s/(\.\w+.*)//) {
-            push(@rept_lines, "$1\n");
-        }
-        $num_repts = eval_expr($num_repts);
-    } elsif ($line =~ /\.irp\s+([\d\w\.]+)\s*(.*)/) {
-        $in_irp = 1;
-        $num_repts = 1;
-        @rept_lines = ("\n");
-        $irp_param = $1;
-
-        # only use whitespace as the separator
-        my $irp_arglist = $2;
-        $irp_arglist =~ s/,/ /g;
-        $irp_arglist =~ s/^\s+//;
-        @irp_args = split(/\s+/, $irp_arglist);
-    } elsif ($line =~ /\.irpc\s+([\d\w\.]+)\s*(.*)/) {
-        $in_irp = 1;
-        $num_repts = 1;
-        @rept_lines = ("\n");
-        $irp_param = $1;
-
-        my $irp_arglist = $2;
-        $irp_arglist =~ s/,/ /g;
-        $irp_arglist =~ s/^\s+//;
-        @irp_args = split(//, $irp_arglist);
-    } elsif ($line =~ /\.endr/) {
-        my @prev_rept_lines = @rept_lines;
-        my $prev_in_irp = $in_irp;
-        my @prev_irp_args = @irp_args;
-        my $prev_irp_param = $irp_param;
-        my $prev_num_repts = $num_repts;
-        @rept_lines = ();
-        $in_irp = 0;
-        @irp_args = '';
-
-        if ($prev_in_irp != 0) {
-            foreach my $i (@prev_irp_args) {
-                foreach my $origline (@prev_rept_lines) {
-                    my $line = $origline;
-                    $line =~ s/\\$prev_irp_param/$i/g;
-                    $line =~ s/\\\(\)//g;     # remove \()
-                    parse_line($line);
-                }
-            }
-        } else {
-            for (1 .. $prev_num_repts) {
-                foreach my $origline (@prev_rept_lines) {
-                    my $line = $origline;
-                    parse_line($line);
-                }
-            }
-        }
-    } elsif ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
-        handle_serialized_line($1);
-        my $macro = $2;
-
-        # commas are optional here too, but are syntactically important because
-        # parameters can be blank
-        my @arglist = split(/,/, $3);
-        my @args;
-        my @args_seperator;
-
-        my $comma_sep_required = 0;
-        foreach (@arglist) {
-            # allow arithmetic/shift operators in macro arguments
-            $_ =~ s/\s*(\+|-|\*|\/|<<|>>|<|>)\s*/$1/g;
-
-            my @whitespace_split = split(/\s+/, $_);
-            if (!@whitespace_split) {
-                push(@args, '');
-                push(@args_seperator, '');
-            } else {
-                foreach (@whitespace_split) {
-                        #print ("arglist = \"$_\"\n");
-                    if (length($_)) {
-                        push(@args, $_);
-                        my $sep = $comma_sep_required ? "," : " ";
-                        push(@args_seperator, $sep);
-                        #print ("sep = \"$sep\", arg = \"$_\"\n");
-                        $comma_sep_required = 0;
-                    }
-                }
-            }
-
-            $comma_sep_required = 1;
-        }
-
-        my %replacements;
-        if ($macro_args_default{$macro}){
-            %replacements = %{$macro_args_default{$macro}};
-        }
-
-        # construct hashtable of text to replace
-        foreach my $i (0 .. $#args) {
-            my $argname = $macro_args{$macro}[$i];
-            my @macro_args = @{ $macro_args{$macro} };
-            if ($args[$i] =~ m/=/) {
-                # arg=val references the argument name
-                # XXX: I'm not sure what the expected behaviour if a lot of
-                # these are mixed with unnamed args
-                my @named_arg = split(/=/, $args[$i]);
-                $replacements{$named_arg[0]} = $named_arg[1];
-            } elsif ($i > $#{$macro_args{$macro}}) {
-                # more args given than the macro has named args
-                # XXX: is vararg allowed on arguments before the last?
-                $argname = $macro_args{$macro}[-1];
-                if ($argname =~ s/:vararg$//) {
-                    #print "macro = $macro, args[$i] = $args[$i], args_seperator=@args_seperator, argname = $argname, arglist[$i] = $arglist[$i], arglist = @arglist, args=@args, macro_args=@macro_args\n";
-                    #$replacements{$argname} .= ", $args[$i]";
-                    $replacements{$argname} .= "$args_seperator[$i] $args[$i]";
-                } else {
-                    die "Too many arguments to macro $macro";
-                }
-            } else {
-                $argname =~ s/:vararg$//;
-                $replacements{$argname} = $args[$i];
-            }
-        }
-
-        my $count = $macro_count++;
-
-        # apply replacements as regex
-        foreach (@{$macro_lines{$macro}}) {
-            my $macro_line = $_;
-            # do replacements by longest first, this avoids wrong replacement
-            # when argument names are subsets of each other
-            foreach (reverse sort {length $a <=> length $b} keys %replacements) {
-                $macro_line =~ s/\\$_/$replacements{$_}/g;
-            }
-            if ($altmacro) {
-                foreach (reverse sort {length $a <=> length $b} keys %replacements) {
-                    $macro_line =~ s/\b$_\b/$replacements{$_}/g;
-                }
-            }
-            $macro_line =~ s/\\\@/$count/g;
-            $macro_line =~ s/\\\(\)//g;     # remove \()
-            parse_line($macro_line);
-        }
-    } else {
-        handle_serialized_line($line);
-    }
-}
-
-sub is_arm_register {
-    my $name = $_[0];
-    if ($name eq "lr" or
-        $name eq "ip" or
-        $name =~ /^[rav]\d+$/) {
-        return 1;
-    }
-    return 0;
-}
-
-sub handle_local_label {
-    my $line = $_[0];
-    my $num  = $_[1];
-    my $dir  = $_[2];
-    my $target = "$num$dir";
-    if ($dir eq "b") {
-        $line =~ s/$target/$last_temp_labels{$num}/g;
-    } else {
-        my $name = "temp_label_$temp_label_next";
-        $temp_label_next++;
-        push(@{$next_temp_labels{$num}}, $name);
-        $line =~ s/$target/$name/g;
-    }
-    return $line;
-}
-
-sub handle_serialized_line {
-    my $line = $_[0];
-
-    # handle .previous (only with regard to .section not .subsection)
-    if ($line =~ /\.(section|text|const_data)/) {
-        push(@sections, $line);
-    } elsif ($line =~ /\.previous/) {
-        if (!$sections[-2]) {
-            die ".previous without a previous section";
-        }
-        $line = $sections[-2];
-        push(@sections, $line);
-    }
-
-    $thumb = 1 if $line =~ /\.code\s+16|\.thumb/;
-    $thumb = 0 if $line =~ /\.code\s+32|\.arm/;
-
-    # handle ldr <reg>, =<expr>
-    if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/ and $as_type ne "armasm") {
-        my $label = $literal_labels{$3};
-        if (!$label) {
-            $label = "Literal_$literal_num";
-            $literal_num++;
-            $literal_labels{$3} = $label;
-        }
-        $line = "$1 ldr$2, $label\n";
-    } elsif ($line =~ /\.ltorg/ and $as_type ne "armasm") {
-        $line .= ".align 2\n";
-        foreach my $literal (keys %literal_labels) {
-            $line .= "$literal_labels{$literal}:\n $literal_expr $literal\n";
-        }
-        %literal_labels = ();
-    }
-
-    # handle GNU as pc-relative relocations for adrp/add
-    if ($line =~ /(.*)\s*adrp([\w\s\d]+)\s*,\s*#?:pg_hi21:([^\s]+)/) {
-        $line = "$1 adrp$2, ${3}\@PAGE\n";
-    } elsif ($line =~ /(.*)\s*add([\w\s\d]+)\s*,([\w\s\d]+)\s*,\s*#?:lo12:([^\s]+)/) {
-        $line = "$1 add$2, $3, ${4}\@PAGEOFF\n";
-    }
-
-    # thumb add with large immediate needs explicit add.w
-    if ($thumb and $line =~ /add\s+.*#([^@]+)/) {
-        $line =~ s/add/add.w/ if eval_expr($1) > 255;
-    }
-
-    # mach-o local symbol names start with L (no dot)
-    $line =~ s/(?<!\w)\.(L\w+)/$1/g;
-
-    # recycle the '.func' directive for '.thumb_func'
-    if ($thumb and $as_type =~ /^apple-/) {
-        $line =~ s/\.func/.thumb_func/x;
-    }
-
-    if ($thumb and $line =~ /^\s*(\w+)\s*:/) {
-        $thumb_labels{$1}++;
-    }
-
-    if ($as_type =~ /^apple-/ and
-        $line =~ /^\s*((\w+\s*:\s*)?bl?x?(..)?(?:\.w)?|\.global)\s+(\w+)/) {
-        my $cond = $3;
-        my $label = $4;
-        # Don't interpret e.g. bic as b<cc> with ic as conditional code
-        if ($cond =~ /|$arm_cond_codes/) {
-            if (exists $thumb_labels{$label}) {
-                print ASMFILE ".thumb_func $label\n";
-            } else {
-                $call_targets{$label}++;
-            }
-        }
-    }
-
-    # @l -> lo16()  @ha -> ha16()
-    $line =~ s/,\s+([^,]+)\@l\b/, lo16($1)/g;
-    $line =~ s/,\s+([^,]+)\@ha\b/, ha16($1)/g;
-
-    # move to/from SPR
-    if ($line =~ /(\s+)(m[ft])([a-z]+)\s+(\w+)/ and exists $ppc_spr{$3}) {
-        if ($2 eq 'mt') {
-            $line = "$1${2}spr $ppc_spr{$3}, $4\n";
-        } else {
-            $line = "$1${2}spr $4, $ppc_spr{$3}\n";
-        }
-    }
-
-    if ($line =~ /\.unreq\s+(.*)/) {
-        if (defined $neon_alias_reg{$1}) {
-            delete $neon_alias_reg{$1};
-            delete $neon_alias_type{$1};
-            return;
-        } elsif (defined $aarch64_req_alias{$1}) {
-            delete $aarch64_req_alias{$1};
-            return;
-        }
-    }
-    # old gas versions store upper and lower case names on .req,
-    # but they remove only one on .unreq
-    if ($fix_unreq) {
-        if ($line =~ /\.unreq\s+(.*)/) {
-            $line = ".unreq " . lc($1) . "\n";
-            $line .= ".unreq " . uc($1) . "\n";
-        }
-    }
-
-    if ($line =~ /(\w+)\s+\.(dn|qn)\s+(\w+)(?:\.(\w+))?(\[\d+\])?/) {
-        $neon_alias_reg{$1} = "$3$5";
-        $neon_alias_type{$1} = $4;
-        return;
-    }
-    if (scalar keys %neon_alias_reg > 0 && $line =~ /^\s+v\w+/) {
-        # This line seems to possibly have a neon instruction
-        foreach (keys %neon_alias_reg) {
-            my $alias = $_;
-            # Require the register alias to match as an invididual word, not as a substring
-            # of a larger word-token.
-            if ($line =~ /\b$alias\b/) {
-                $line =~ s/\b$alias\b/$neon_alias_reg{$alias}/g;
-                # Add the type suffix. If multiple aliases match on the same line,
-                # only do this replacement the first time (a vfoo.bar string won't match v\w+).
-                $line =~ s/^(\s+)(v\w+)(\s+)/$1$2.$neon_alias_type{$alias}$3/;
-            }
-        }
-    }
-
-    if ($arch eq "aarch64" or $as_type eq "armasm") {
-        # clang's integrated aarch64 assembler in Xcode 5 does not support .req/.unreq
-        if ($line =~ /\b(\w+)\s+\.req\s+(\w+)\b/) {
-            $aarch64_req_alias{$1} = $2;
-            return;
-        }
-        foreach (keys %aarch64_req_alias) {
-            my $alias = $_;
-            # recursively resolve aliases
-            my $resolved = $aarch64_req_alias{$alias};
-            while (defined $aarch64_req_alias{$resolved}) {
-                $resolved = $aarch64_req_alias{$resolved};
-            }
-            $line =~ s/\b$alias\b/$resolved/g;
-        }
-    }
-    if ($arch eq "aarch64") {
-        # fix missing aarch64 instructions in Xcode 5.1 (beta3)
-        # mov with vector arguments is not supported, use alias orr instead
-        if ($line =~ /^\s*mov\s+(v\d[\.{}\[\]\w]+),\s*(v\d[\.{}\[\]\w]+)\b\s*$/) {
-            $line = "        orr $1, $2, $2\n";
-        }
-        # movi 16, 32 bit shifted variant, shift is optional
-        if ($line =~ /^\s*movi\s+(v[0-3]?\d\.(?:2|4|8)[hsHS])\s*,\s*(#\w+)\b\s*$/) {
-            $line = "        movi $1, $2, lsl #0\n";
-        }
-        # Xcode 5 misses the alias uxtl. Replace it with the more general ushll.
-        # Clang 3.4 misses the alias sxtl too. Replace it with the more general sshll.
-        if ($line =~ /^\s*(s|u)xtl(2)?\s+(v[0-3]?\d\.[248][hsdHSD])\s*,\s*(v[0-3]?\d\.(?:2|4|8|16)[bhsBHS])\b\s*$/) {
-            $line = "        $1shll$2 $3, $4, #0\n";
-        }
-        # clang 3.4 does not automatically use shifted immediates in add/sub
-        if ($as_type eq "clang" and
-            $line =~ /^(\s*(?:add|sub)s?) ([^#l]+)#([\d\+\-\*\/ <>]+)\s*$/) {
-            my $imm = eval $3;
-            if ($imm > 4095 and not ($imm & 4095)) {
-                $line = "$1 $2#" . ($imm >> 12) . ", lsl #12\n";
-            }
-        }
-        if ($ENV{GASPP_FIX_XCODE5}) {
-            if ($line =~ /^\s*bsl\b/) {
-                $line =~ s/\b(bsl)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/;
-                $line =~ s/\b(v[0-3]?\d)\.$3\b/$1/g;
-            }
-            if ($line =~ /^\s*saddl2?\b/) {
-                $line =~ s/\b(saddl2?)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/;
-                $line =~ s/\b(v[0-3]?\d)\.\w+\b/$1/g;
-            }
-            if ($line =~ /^\s*dup\b.*\]$/) {
-                $line =~ s/\bdup(\s+v[0-3]?\d)\.(\w+)\b/dup.$2$1/g;
-                $line =~ s/\b(v[0-3]?\d)\.[bhsdBHSD](\[\d\])$/$1$2/g;
-            }
-        }
-    }
-
-    if ($as_type eq "armasm") {
-        # Also replace variables set by .set
-        foreach (keys %symbols) {
-            my $sym = $_;
-            $line =~ s/\b$sym\b/$symbols{$sym}/g;
-        }
-
-        # Handle function declarations and keep track of the declared labels
-        if ($line =~ s/^\s*\.func\s+(\w+)/$1 PROC/) {
-            $labels_seen{$1} = 1;
-        }
-
-        if ($line =~ s/^\s*(\d+)://) {
-            # Convert local labels into unique labels. armasm (at least in
-            # RVCT) has something similar, but still different enough.
-            # By converting to unique labels we avoid any possible
-            # incompatibilities.
-
-            my $num = $1;
-            foreach (@{$next_temp_labels{$num}}) {
-                $line = "$_\n" . $line;
-            }
-            @next_temp_labels{$num} = ();
-            my $name = "temp_label_$temp_label_next";
-            $temp_label_next++;
-            # The matching regexp above removes the label from the start of
-            # the line (which might contain an instruction as well), readd
-            # it on a separate line above it.
-            $line = "$name:\n" . $line;
-            $last_temp_labels{$num} = $name;
-        }
-
-        if ($line =~ s/^(\w+):/$1/) {
-            # Skip labels that have already been declared with a PROC,
-            # labels must not be declared multiple times.
-            return if (defined $labels_seen{$1});
-            $labels_seen{$1} = 1;
-        } elsif ($line !~ /(\w+) PROC/) {
-            # If not a label, make sure the line starts with whitespace,
-            # otherwise ms armasm interprets it incorrectly.
-            $line =~ s/^[\.\w]/\t$&/;
-        }
-
-
-        # Check branch instructions
-        if ($line =~ /(?:^|\n)\s*(\w+\s*:\s*)?(bl?x?(..)?(\.w)?)\s+(\w+)/) {
-            my $instr = $2;
-            my $cond = $3;
-            my $width = $4;
-            my $target = $5;
-            # Don't interpret e.g. bic as b<cc> with ic as conditional code
-            if ($cond !~ /|$arm_cond_codes/) {
-                # Not actually a branch
-            } elsif ($target =~ /(\d+)([bf])/) {
-                # The target is a local label
-                $line = handle_local_label($line, $1, $2);
-                $line =~ s/\b$instr\b/$&.w/ if $width eq "";
-            } elsif (!is_arm_register($target)) {
-                $call_targets{$target}++;
-            }
-        } elsif ($line =~ /^\s*.h?word.*\b\d+[bf]\b/) {
-            while ($line =~ /\b(\d+)([bf])\b/g) {
-                $line = handle_local_label($line, $1, $2);
-            }
-        }
-
-        # ALIGN in armasm syntax is the actual number of bytes
-        if ($line =~ /\.align\s+(\d+)/) {
-            my $align = 1 << $1;
-            $line =~ s/\.align\s(\d+)/ALIGN $align/;
-        }
-        # Convert gas style [r0, :128] into armasm [r0@128] alignment specification
-        $line =~ s/\[([^\[]+),\s*:(\d+)\]/[$1\@$2]/g;
-
-        # armasm treats logical values {TRUE} and {FALSE} separately from
-        # numeric values - logical operators and values can't be intermixed
-        # with numerical values. Evaluate !<number> and (a <> b) into numbers,
-        # let the assembler evaluate the rest of the expressions. This current
-        # only works for cases when ! and <> are used with actual constant numbers,
-        # we don't evaluate subexpressions here.
-
-        # Evaluate !<number>
-        while ($line =~ /!\s*(\d+)/g) {
-            my $val = ($1 != 0) ? 0 : 1;
-            $line =~ s/!(\d+)/$val/;
-        }
-        # Evaluate (a > b)
-        while ($line =~ /\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/) {
-            my $val;
-            if ($2 eq "<") {
-                $val = ($1 < $3) ? 1 : 0;
-            } else {
-                $val = ($1 > $3) ? 1 : 0;
-            }
-            $line =~ s/\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/$val/;
-        }
-
-        # Change a movw... #:lower16: into a mov32 pseudoinstruction
-        $line =~ s/^(\s*)movw(\s+\w+\s*,\s*)\#:lower16:(.*)$/$1mov32$2$3/;
-        # and remove the following, matching movt completely
-        $line =~ s/^\s*movt\s+\w+\s*,\s*\#:upper16:.*$//;
-
-        if ($line =~ /^\s*mov32\s+\w+,\s*([a-zA-Z]\w*)/) {
-            $mov32_targets{$1}++;
-        }
-
-        # Misc bugs/deficiencies:
-        # armasm seems unable to parse e.g. "vmov s0, s1" without a type
-        # qualifier, thus add .f32.
-        $line =~ s/^(\s+(?:vmov|vadd))(\s+s)/$1.f32$2/;
-        # armasm is unable to parse &0x - add spacing
-        $line =~ s/&0x/& 0x/g;
-    }
-
-    if ($force_thumb) {
-        # Convert register post indexing to a separate add instruction.
-        # This converts e.g. "ldr r0, [r1], r2" into "ldr r0, [r1]",
-        # "add r1, r1, r2".
-        $line =~ s/(ldr|str)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g;
-
-        # Convert "mov pc, lr" into "bx lr", since the former only works
-        # for switching from arm to thumb (and only in armv7), but not
-        # from thumb to arm.
-        s/mov\s*pc\s*,\s*lr/bx lr/g;
-
-        # Convert stmdb/ldmia with only one register into a plain str/ldr with post-increment/decrement
-        $line =~ s/stmdb\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g;
-        $line =~ s/ldmia\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g;
-
-        $line =~ s/\.arm/.thumb/x;
-    }
-
-    # comment out unsupported directives
-    $line =~ s/\.type/$comm$&/x        if $as_type =~ /^(apple-|armasm)/;
-    $line =~ s/\.func/$comm$&/x        if $as_type =~ /^(apple-|clang)/;
-    $line =~ s/\.endfunc/$comm$&/x     if $as_type =~ /^(apple-|clang)/;
-    $line =~ s/\.endfunc/ENDP/x        if $as_type =~ /armasm/;
-    $line =~ s/\.ltorg/$comm$&/x       if $as_type =~ /^(apple-|clang)/;
-    $line =~ s/\.ltorg/LTORG/x         if $as_type eq "armasm";
-    $line =~ s/\.size/$comm$&/x        if $as_type =~ /^(apple-|armasm)/;
-    $line =~ s/\.fpu/$comm$&/x         if $as_type =~ /^(apple-|armasm)/;
-    $line =~ s/\.arch/$comm$&/x        if $as_type =~ /^(apple-|clang|armasm)/;
-    $line =~ s/\.object_arch/$comm$&/x if $as_type =~ /^(apple-|armasm)/;
-    $line =~ s/.section\s+.note.GNU-stack.*/$comm$&/x if $as_type =~ /^(apple-|armasm)/;
-
-    $line =~ s/\.syntax/$comm$&/x      if $as_type =~ /armasm/;
-
-    $line =~ s/\.hword/.short/x;
-
-    if ($as_type =~ /^apple-/) {
-        # the syntax for these is a little different
-        $line =~ s/\.global/.globl/x;
-        # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
-        $line =~ s/(.*)\.rodata/.const_data/x;
-        $line =~ s/\.int/.long/x;
-        $line =~ s/\.float/.single/x;
-    }
-    if ($as_type eq "armasm") {
-        $line =~ s/\.global/EXPORT/x;
-        $line =~ s/\.int/dcd/x;
-        $line =~ s/\.long/dcd/x;
-        $line =~ s/\.float/dcfs/x;
-        $line =~ s/\.word/dcd/x;
-        $line =~ s/\.short/dcw/x;
-        $line =~ s/\.byte/dcb/x;
-        $line =~ s/\.thumb/THUMB/x;
-        $line =~ s/\.arm/ARM/x;
-        # The alignment in AREA is the power of two, just as .align in gas
-        $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=2, CODEALIGN/;
-        $line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, ALIGN=5/;
-
-        $line =~ s/fmxr/vmsr/;
-        $line =~ s/fmrx/vmrs/;
-        $line =~ s/fadds/vadd.f32/;
-    }
-
-    # catch unknown section names that aren't mach-o style (with a comma)
-    if ($as_type =~ /apple-/ and $line =~ /.section ([^,]*)$/) {
-        die ".section $1 unsupported; figure out the mach-o section name and add it";
-    }
-
-    print ASMFILE $line;
-}
-
-if ($as_type ne "armasm") {
-    print ASMFILE ".text\n";
-    print ASMFILE ".align 2\n";
-    foreach my $literal (keys %literal_labels) {
-        print ASMFILE "$literal_labels{$literal}:\n $literal_expr $literal\n";
-    }
-
-    map print(ASMFILE ".thumb_func $_\n"),
-        grep exists $thumb_labels{$_}, keys %call_targets;
-} else {
-    map print(ASMFILE "\tIMPORT $_\n"),
-        grep ! exists $labels_seen{$_}, (keys %call_targets, keys %mov32_targets);
-
-    print ASMFILE "\tEND\n";
-}
-
-close(INPUT) or exit 1;
-close(ASMFILE) or exit 1;
-if ($as_type eq "armasm" and ! defined $ENV{GASPP_DEBUG}) {
-    system(@gcc_cmd) == 0 or die "Error running assembler";
-}
-
-END {
-    unlink($tempfile) if defined $tempfile;
-}
-#exit 1
diff --git a/android/src/main/libenc/jni/libx264/tools/msvsdepend.sh b/android/src/main/libenc/jni/libx264/tools/msvsdepend.sh
deleted file mode 100755
index 568f611..0000000
--- a/android/src/main/libenc/jni/libx264/tools/msvsdepend.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/sh
-
-# Output a Makefile rule describing the dependencies of a given source file.
-# Expected arguments are $(CC) $(CFLAGS) $(SRC) $(OBJ)
-
-set -f
-
-[ -n "$1" ] && [ -n "$3" ] && [ -n "$4" ] || exit 1
-
-# Add flags to only perform syntax checking and output a list of included files
-# Discard all output other than included files
-# Convert '\' directory separators to '/'
-# Remove system includes (hack: check for "/Program Files" string in path)
-# Add the source file itself as a dependency
-deps="$($1 $2 -nologo -showIncludes -W0 -Zs "$3" 2>&1 |
-        grep '^Note: including file:' |
-        sed 's/^Note: including file:[[:space:]]*\(.*\)$/\1/; s/\\/\//g' |
-        sed '/\/[Pp]rogram [Ff]iles/d')
-$3"
-
-# Convert Windows paths to Unix paths if possible
-if command -v cygpath >/dev/null 2>&1 ; then
-    IFS='
-'
-    deps="$(cygpath -u -- $deps)"
-fi
-
-# Escape characters as required to create valid Makefile file names
-escape() {
-    sed 's/ /\\ /g; s/#/\\#/g; s/\$/\$\$/g'
-}
-
-# Remove prefixes that are equal to the working directory
-# Sort and remove duplicate entries
-# Escape and collapse the dependencies into one line
-deps="$(printf '%s' "$deps" |
-        sed "s/^$(pwd | sed 's/\//\\\//g')\///; s/^\.\///" |
-        sort | uniq |
-        escape | tr -s '\n\r' ' ' | sed 's/^ *\(.*\) $/\1/')"
-
-# Escape the target file name as well
-target="$(printf '%s' "$4" | escape)"
-
-printf '%s: %s\n' "$target" "$deps"
diff --git a/android/src/main/libenc/jni/libx264/tools/q_matrix_jvt.cfg b/android/src/main/libenc/jni/libx264/tools/q_matrix_jvt.cfg
deleted file mode 100755
index 973cdfc..0000000
--- a/android/src/main/libenc/jni/libx264/tools/q_matrix_jvt.cfg
+++ /dev/null
@@ -1,68 +0,0 @@
-# This an example configuration file for initializing the quantization matrix.
-# Altogether 6 matrices for 4x4 blocks and 2 matrix for 8x8 blocks.
-# The values range from 1 to 255.
-# If first value of matrix is equal to 0, default values ("JVT") will be used
-# for that matrix.
-# If a matrix is completely omitted, it will be filled with 16s.
-#
-# Note: JM expects CHROMAU and CHROMAV to be specified separately, whereas
-# x264 forces them to use the same matrix. If U and V are specified to have
-# different matrices, only the first is used.
-####################################################################################
-
-INTRA4X4_LUMA =
- 6,13,20,28,
-13,20,28,32,
-20,28,32,37,
-28,32,37,42
-
-INTRA4X4_CHROMAU =
- 6,13,20,28,
-13,20,28,32,
-20,28,32,37,
-28,32,37,42
-
-INTRA4X4_CHROMAV =
- 6,13,20,28,
-13,20,28,32,
-20,28,32,37,
-28,32,37,42
-
-INTER4X4_LUMA =
-10,14,20,24,
-14,20,24,27,
-20,24,27,30,
-24,27,30,34
-
-INTER4X4_CHROMAU =
-10,14,20,24,
-14,20,24,27,
-20,24,27,30,
-24,27,30,34
-
-INTER4X4_CHROMAV =
-10,14,20,24,
-14,20,24,27,
-20,24,27,30,
-24,27,30,34
-
-INTRA8X8_LUMA =
- 6,10,13,16,18,23,25,27,
-10,11,16,18,23,25,27,29,
-13,16,18,23,25,27,29,31,
-16,18,23,25,27,29,31,33,
-18,23,25,27,29,31,33,36,
-23,25,27,29,31,33,36,38,
-25,27,29,31,33,36,38,40,
-27,29,31,33,36,38,40,42
-
-INTER8X8_LUMA =
- 9,13,15,17,19,21,22,24,
-13,13,17,19,21,22,24,25,
-15,17,19,21,22,24,25,27,
-17,19,21,22,24,25,27,28,
-19,21,22,24,25,27,28,30,
-21,22,24,25,27,28,30,32,
-22,24,25,27,28,30,32,33,
-24,25,27,28,30,32,33,35
-
diff --git a/android/src/main/libenc/jni/libx264/tools/test_x264.py b/android/src/main/libenc/jni/libx264/tools/test_x264.py
deleted file mode 100755
index a09e5eb..0000000
--- a/android/src/main/libenc/jni/libx264/tools/test_x264.py
+++ /dev/null
@@ -1,485 +0,0 @@
-#!/usr/bin/env python
-
-import operator
-
-from optparse import OptionGroup
-
-import sys
-
-from time import time
-
-from digress.cli import Dispatcher as _Dispatcher
-from digress.errors import ComparisonError, FailedTestError, DisabledTestError
-from digress.testing import depends, comparer, Fixture, Case
-from digress.comparers import compare_pass
-from digress.scm import git as x264git
-
-from subprocess import Popen, PIPE, STDOUT
-
-import os
-import re
-import shlex
-import inspect
-
-from random import randrange, seed
-from math import ceil
-
-from itertools import imap, izip
-
-os.chdir(os.path.join(os.path.dirname(__file__), ".."))
-
-# options
-
-OPTIONS = [
-    [ "--tune %s" % t for t in ("film", "zerolatency") ],
-    ("", "--intra-refresh"),
-    ("", "--no-cabac"),
-    ("", "--interlaced"),
-    ("", "--slice-max-size 1000"),
-    ("", "--frame-packing 5"),
-    [ "--preset %s" % p for p in ("ultrafast",
-                                  "superfast",
-                                  "veryfast",
-                                  "faster",
-                                  "fast",
-                                  "medium",
-                                  "slow",
-                                  "slower",
-                                  "veryslow",
-                                  "placebo") ]
-]
-
-# end options
-
-def compare_yuv_output(width, height):
-    def _compare_yuv_output(file_a, file_b):
-        size_a = os.path.getsize(file_a)
-        size_b = os.path.getsize(file_b)
-
-        if size_a != size_b:
-            raise ComparisonError("%s is not the same size as %s" % (
-                file_a,
-                file_b
-            ))
-
-        BUFFER_SIZE = 8196
-
-        offset = 0
-
-        with open(file_a) as f_a:
-            with open(file_b) as f_b:
-                for chunk_a, chunk_b in izip(
-                    imap(
-                        lambda i: f_a.read(BUFFER_SIZE),
-                        xrange(size_a // BUFFER_SIZE + 1)
-                    ),
-                    imap(
-                        lambda i: f_b.read(BUFFER_SIZE),
-                        xrange(size_b // BUFFER_SIZE + 1)
-                    )
-                ):
-                    chunk_size = len(chunk_a)
-
-                    if chunk_a != chunk_b:
-                        for i in xrange(chunk_size):
-                            if chunk_a[i] != chunk_b[i]:
-                                # calculate the macroblock, plane and frame from the offset
-                                offs = offset + i
-
-                                y_plane_area = width * height
-                                u_plane_area = y_plane_area + y_plane_area * 0.25
-                                v_plane_area = u_plane_area + y_plane_area * 0.25
-
-                                pixel = offs % v_plane_area
-                                frame = offs // v_plane_area
-
-                                if pixel < y_plane_area:
-                                    plane = "Y"
-
-                                    pixel_x = pixel % width
-                                    pixel_y = pixel // width
-
-                                    macroblock = (ceil(pixel_x / 16.0), ceil(pixel_y / 16.0))
-                                elif pixel < u_plane_area:
-                                    plane = "U"
-
-                                    pixel -= y_plane_area
-
-                                    pixel_x = pixel % width
-                                    pixel_y = pixel // width
-
-                                    macroblock = (ceil(pixel_x / 8.0), ceil(pixel_y / 8.0))
-                                else:
-                                    plane = "V"
-
-                                    pixel -= u_plane_area
-
-                                    pixel_x = pixel % width
-                                    pixel_y = pixel // width
-
-                                    macroblock = (ceil(pixel_x / 8.0), ceil(pixel_y / 8.0))
-
-                                macroblock = tuple([ int(x) for x in macroblock ])
-
-                                raise ComparisonError("%s differs from %s at frame %d, " \
-                                                      "macroblock %s on the %s plane (offset %d)" % (
-                                    file_a,
-                                    file_b,
-                                    frame,
-                                    macroblock,
-                                    plane,
-                                    offs)
-                                )
-
-                    offset += chunk_size
-
-    return _compare_yuv_output
-
-def program_exists(program):
-    def is_exe(fpath):
-        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
-
-    fpath, fname = os.path.split(program)
-
-    if fpath:
-        if is_exe(program):
-            return program
-    else:
-        for path in os.environ["PATH"].split(os.pathsep):
-            exe_file = os.path.join(path, program)
-            if is_exe(exe_file):
-                return exe_file
-
-    return None
-
-class x264(Fixture):
-    scm = x264git
-
-class Compile(Case):
-    @comparer(compare_pass)
-    def test_configure(self):
-        Popen([
-            "make",
-            "distclean"
-        ], stdout=PIPE, stderr=STDOUT).communicate()
-
-        configure_proc = Popen([
-            "./configure"
-        ] + self.fixture.dispatcher.configure, stdout=PIPE, stderr=STDOUT)
-
-        output = configure_proc.communicate()[0]
-        if configure_proc.returncode != 0:
-            raise FailedTestError("configure failed: %s" % output.replace("\n", " "))
-
-    @depends("configure")
-    @comparer(compare_pass)
-    def test_make(self):
-        make_proc = Popen([
-            "make",
-            "-j5"
-        ], stdout=PIPE, stderr=STDOUT)
-
-        output = make_proc.communicate()[0]
-        if make_proc.returncode != 0:
-            raise FailedTestError("make failed: %s" % output.replace("\n", " "))
-
-_dimension_pattern = re.compile(r"\w+ [[]info[]]: (\d+)x(\d+)[pi] \d+:\d+ @ \d+/\d+ fps [(][vc]fr[)]")
-
-def _YUVOutputComparisonFactory():
-    class YUVOutputComparison(Case):
-        _dimension_pattern = _dimension_pattern
-
-        depends = [ Compile ]
-        options = []
-
-        def __init__(self):
-            for name, meth in inspect.getmembers(self):
-                if name[:5] == "test_" and name[5:] not in self.fixture.dispatcher.yuv_tests:
-                    delattr(self.__class__, name)
-
-        def _run_x264(self):
-            x264_proc = Popen([
-                "./x264",
-                "-o",
-                "%s.264" % self.fixture.dispatcher.video,
-                "--dump-yuv",
-                "x264-output.yuv"
-            ] + self.options + [
-                self.fixture.dispatcher.video
-            ], stdout=PIPE, stderr=STDOUT)
-
-            output = x264_proc.communicate()[0]
-            if x264_proc.returncode != 0:
-                raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " "))
-
-            matches = _dimension_pattern.match(output)
-
-            return (int(matches.group(1)), int(matches.group(2)))
-
-        @comparer(compare_pass)
-        def test_jm(self):
-            if not program_exists("ldecod"): raise DisabledTestError("jm unavailable")
-
-            try:
-                runres = self._run_x264()
-
-                jm_proc = Popen([
-                    "ldecod",
-                    "-i",
-                    "%s.264" % self.fixture.dispatcher.video,
-                    "-o",
-                    "jm-output.yuv"
-                ], stdout=PIPE, stderr=STDOUT)
-
-                output = jm_proc.communicate()[0]
-                if jm_proc.returncode != 0:
-                    raise FailedTestError("jm did not complete properly: %s" % output.replace("\n", " "))
-
-                try:
-                    compare_yuv_output(*runres)("x264-output.yuv", "jm-output.yuv")
-                except ComparisonError, e:
-                    raise FailedTestError(e)
-            finally:
-                try: os.remove("x264-output.yuv")
-                except: pass
-
-                try: os.remove("%s.264" % self.fixture.dispatcher.video)
-                except: pass
-
-                try: os.remove("jm-output.yuv")
-                except: pass
-
-                try: os.remove("log.dec")
-                except: pass
-
-                try: os.remove("dataDec.txt")
-                except: pass
-
-        @comparer(compare_pass)
-        def test_ffmpeg(self):
-            if not program_exists("ffmpeg"): raise DisabledTestError("ffmpeg unavailable")
-            try:
-                runres = self._run_x264()
-
-                ffmpeg_proc = Popen([
-                    "ffmpeg",
-                    "-vsync 0",
-                    "-i",
-                    "%s.264" % self.fixture.dispatcher.video,
-                    "ffmpeg-output.yuv"
-                ], stdout=PIPE, stderr=STDOUT)
-
-                output = ffmpeg_proc.communicate()[0]
-                if ffmpeg_proc.returncode != 0:
-                    raise FailedTestError("ffmpeg did not complete properly: %s" % output.replace("\n", " "))
-
-                try:
-                    compare_yuv_output(*runres)("x264-output.yuv", "ffmpeg-output.yuv")
-                except ComparisonError, e:
-                    raise FailedTestError(e)
-            finally:
-                try: os.remove("x264-output.yuv")
-                except: pass
-
-                try: os.remove("%s.264" % self.fixture.dispatcher.video)
-                except: pass
-
-                try: os.remove("ffmpeg-output.yuv")
-                except: pass
-
-    return YUVOutputComparison
-
-class Regression(Case):
-    depends = [ Compile ]
-
-    _psnr_pattern = re.compile(r"x264 [[]info[]]: PSNR Mean Y:\d+[.]\d+ U:\d+[.]\d+ V:\d+[.]\d+ Avg:\d+[.]\d+ Global:(\d+[.]\d+) kb/s:\d+[.]\d+")
-    _ssim_pattern = re.compile(r"x264 [[]info[]]: SSIM Mean Y:(\d+[.]\d+) [(]\d+[.]\d+db[)]")
-
-    def __init__(self):
-        if self.fixture.dispatcher.x264:
-            self.__class__.__name__ += " %s" % " ".join(self.fixture.dispatcher.x264)
-
-    def test_psnr(self):
-        try:
-            x264_proc = Popen([
-                "./x264",
-                "-o",
-                "%s.264" % self.fixture.dispatcher.video,
-                "--psnr"
-            ] + self.fixture.dispatcher.x264 + [
-                self.fixture.dispatcher.video
-            ], stdout=PIPE, stderr=STDOUT)
-
-            output = x264_proc.communicate()[0]
-
-            if x264_proc.returncode != 0:
-                raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " "))
-
-            for line in output.split("\n"):
-                if line.startswith("x264 [info]: PSNR Mean"):
-                    return float(self._psnr_pattern.match(line).group(1))
-
-            raise FailedTestError("no PSNR output caught from x264")
-        finally:
-            try: os.remove("%s.264" % self.fixture.dispatcher.video)
-            except: pass
-
-    def test_ssim(self):
-        try:
-            x264_proc = Popen([
-                "./x264",
-                "-o",
-                "%s.264" % self.fixture.dispatcher.video,
-                "--ssim"
-            ] + self.fixture.dispatcher.x264 + [
-                self.fixture.dispatcher.video
-            ], stdout=PIPE, stderr=STDOUT)
-
-            output = x264_proc.communicate()[0]
-
-            if x264_proc.returncode != 0:
-                raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " "))
-
-            for line in output.split("\n"):
-                if line.startswith("x264 [info]: SSIM Mean"):
-                    return float(self._ssim_pattern.match(line).group(1))
-
-            raise FailedTestError("no PSNR output caught from x264")
-        finally:
-            try: os.remove("%s.264" % self.fixture.dispatcher.video)
-            except: pass
-
-def _generate_random_commandline():
-    commandline = []
-
-    for suboptions in OPTIONS:
-        commandline.append(suboptions[randrange(0, len(suboptions))])
-
-    return filter(None, reduce(operator.add, [ shlex.split(opt) for opt in commandline ]))
-
-_generated = []
-
-fixture = x264()
-fixture.register_case(Compile)
-
-fixture.register_case(Regression)
-
-class Dispatcher(_Dispatcher):
-    video = "akiyo_qcif.y4m"
-    products = 50
-    configure = []
-    x264 = []
-    yuv_tests = [ "jm" ]
-
-    def _populate_parser(self):
-        super(Dispatcher, self)._populate_parser()
-
-        # don't do a whole lot with this
-        tcase = _YUVOutputComparisonFactory()
-
-        yuv_tests = [ name[5:] for name, meth in filter(lambda pair: pair[0][:5] == "test_", inspect.getmembers(tcase)) ]
-
-        group = OptionGroup(self.optparse, "x264 testing-specific options")
-
-        group.add_option(
-            "-v",
-            "--video",
-            metavar="FILENAME",
-            action="callback",
-            dest="video",
-            type=str,
-            callback=lambda option, opt, value, parser: setattr(self, "video", value),
-            help="yuv video to perform testing on (default: %s)" % self.video
-        )
-
-        group.add_option(
-            "-s",
-            "--seed",
-            metavar="SEED",
-            action="callback",
-            dest="seed",
-            type=int,
-            callback=lambda option, opt, value, parser: setattr(self, "seed", value),
-            help="seed for the random number generator (default: unix timestamp)"
-        )
-
-        group.add_option(
-            "-p",
-            "--product-tests",
-            metavar="NUM",
-            action="callback",
-            dest="video",
-            type=int,
-            callback=lambda option, opt, value, parser: setattr(self, "products", value),
-            help="number of cartesian products to generate for yuv comparison testing (default: %d)" % self.products
-        )
-
-        group.add_option(
-            "--configure-with",
-            metavar="FLAGS",
-            action="callback",
-            dest="configure",
-            type=str,
-            callback=lambda option, opt, value, parser: setattr(self, "configure", shlex.split(value)),
-            help="options to run ./configure with"
-        )
-
-        group.add_option(
-            "--yuv-tests",
-            action="callback",
-            dest="yuv_tests",
-            type=str,
-            callback=lambda option, opt, value, parser: setattr(self, "yuv_tests", [
-                val.strip() for val in value.split(",")
-            ]),
-            help="select tests to run with yuv comparisons (default: %s, available: %s)" % (
-                ", ".join(self.yuv_tests),
-                ", ".join(yuv_tests)
-            )
-        )
-
-        group.add_option(
-            "--x264-with",
-            metavar="FLAGS",
-            action="callback",
-            dest="x264",
-            type=str,
-            callback=lambda option, opt, value, parser: setattr(self, "x264", shlex.split(value)),
-            help="additional options to run ./x264 with"
-        )
-
-        self.optparse.add_option_group(group)
-
-    def pre_dispatch(self):
-        if not hasattr(self, "seed"):
-            self.seed = int(time())
-
-        print "Using seed: %d" % self.seed
-        seed(self.seed)
-
-        for i in xrange(self.products):
-            YUVOutputComparison = _YUVOutputComparisonFactory()
-
-            commandline = _generate_random_commandline()
-
-            counter = 0
-
-            while commandline in _generated:
-                counter += 1
-                commandline = _generate_random_commandline()
-
-                if counter > 100:
-                    print >>sys.stderr, "Maximum command-line regeneration exceeded. "  \
-                                        "Try a different seed or specify fewer products to generate."
-                    sys.exit(1)
-
-            commandline += self.x264
-
-            _generated.append(commandline)
-
-            YUVOutputComparison.options = commandline
-            YUVOutputComparison.__name__ = ("%s %s" % (YUVOutputComparison.__name__, " ".join(commandline)))
-
-            fixture.register_case(YUVOutputComparison)
-
-Dispatcher(fixture).dispatch()
diff --git a/android/src/main/libenc/jni/libx264/version.sh b/android/src/main/libenc/jni/libx264/version.sh
deleted file mode 100755
index 178fc95..0000000
--- a/android/src/main/libenc/jni/libx264/version.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/sh
-
-cd "$(dirname "$0")" >/dev/null && [ -f x264.h ] || exit 1
-
-api="$(grep '#define X264_BUILD' < x264.h | sed 's/^.* \([1-9][0-9]*\).*$/\1/')"
-ver="x"
-version=""
-
-if [ -d .git ] && command -v git >/dev/null 2>&1 ; then
-    localver="$(($(git rev-list HEAD | wc -l)))"
-    if [ "$localver" -gt 1 ] ; then
-        ver_diff="$(($(git rev-list origin/master..HEAD | wc -l)))"
-        ver="$((localver-ver_diff))"
-        echo "#define X264_REV $ver"
-        echo "#define X264_REV_DIFF $ver_diff"
-        if [ "$ver_diff" -ne 0 ] ; then
-            ver="$ver+$ver_diff"
-        fi
-        if git status | grep -q "modified:" ; then
-            ver="${ver}M"
-        fi
-        ver="$ver $(git rev-list -n 1 HEAD | cut -c 1-7)"
-        version=" r$ver"
-    fi
-fi
-
-echo "#define X264_VERSION \"$version\""
-echo "#define X264_POINTVER \"0.$api.$ver\""
diff --git a/android/src/main/libenc/jni/libx264/x264.c b/android/src/main/libenc/jni/libx264/x264.c
deleted file mode 100755
index 6834996..0000000
--- a/android/src/main/libenc/jni/libx264/x264.c
+++ /dev/null
@@ -1,2035 +0,0 @@
-/*****************************************************************************
- * x264: top-level x264cli functions
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Steven Walters <kemuri9@gmail.com>
- *          Fiona Glaser <fiona@x264.com>
- *          Kieran Kunhya <kieran@kunhya.com>
- *          Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifdef _WIN32
-/* The following two defines must be located before the inclusion of any system header files. */
-#define WINVER       0x0500
-#define _WIN32_WINNT 0x0500
-#include <windows.h>
-#include <io.h>       /* _setmode() */
-#include <fcntl.h>    /* _O_BINARY */
-#endif
-
-#include <signal.h>
-#include <getopt.h>
-#include "common/common.h"
-#include "x264cli.h"
-#include "input/input.h"
-#include "output/output.h"
-#include "filters/filters.h"
-
-#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ )
-
-#if HAVE_LAVF
-#undef DECLARE_ALIGNED
-#include <libavformat/avformat.h>
-#include <libavutil/pixfmt.h>
-#include <libavutil/pixdesc.h>
-#endif
-
-#if HAVE_SWSCALE
-#undef DECLARE_ALIGNED
-#include <libswscale/swscale.h>
-#endif
-
-#if HAVE_FFMS
-#include <ffms.h>
-#endif
-
-#ifdef _WIN32
-#define CONSOLE_TITLE_SIZE 200
-static wchar_t org_console_title[CONSOLE_TITLE_SIZE] = L"";
-
-void x264_cli_set_console_title( const char *title )
-{
-    wchar_t title_utf16[CONSOLE_TITLE_SIZE];
-    if( utf8_to_utf16( title, title_utf16 ) )
-        SetConsoleTitleW( title_utf16 );
-}
-
-static int utf16_to_ansi( const wchar_t *utf16, char *ansi, int size )
-{
-    int invalid;
-    return WideCharToMultiByte( CP_ACP, WC_NO_BEST_FIT_CHARS, utf16, -1, ansi, size, NULL, &invalid ) && !invalid;
-}
-
-/* Some external libraries doesn't support Unicode in filenames,
- * as a workaround we can try to get an ANSI filename instead. */
-int x264_ansi_filename( const char *filename, char *ansi_filename, int size, int create_file )
-{
-    wchar_t filename_utf16[MAX_PATH];
-    if( utf8_to_utf16( filename, filename_utf16 ) )
-    {
-        if( create_file )
-        {
-            /* Create the file using the Unicode filename if it doesn't already exist. */
-            FILE *fh = _wfopen( filename_utf16, L"ab" );
-            if( fh )
-                fclose( fh );
-        }
-
-        /* Check if the filename already is valid ANSI. */
-        if( utf16_to_ansi( filename_utf16, ansi_filename, size ) )
-            return 1;
-
-        /* Check for a legacy 8.3 short filename. */
-        int short_length = GetShortPathNameW( filename_utf16, filename_utf16, MAX_PATH );
-        if( short_length > 0 && short_length < MAX_PATH )
-            if( utf16_to_ansi( filename_utf16, ansi_filename, size ) )
-                return 1;
-    }
-    return 0;
-}
-
-/* Retrieve command line arguments as UTF-8. */
-static int get_argv_utf8( int *argc_ptr, char ***argv_ptr )
-{
-    int ret = 0;
-    wchar_t **argv_utf16 = CommandLineToArgvW( GetCommandLineW(), argc_ptr );
-    if( argv_utf16 )
-    {
-        int argc = *argc_ptr;
-        int offset = (argc+1) * sizeof(char*);
-        int size = offset;
-
-        for( int i = 0; i < argc; i++ )
-            size += WideCharToMultiByte( CP_UTF8, 0, argv_utf16[i], -1, NULL, 0, NULL, NULL );
-
-        char **argv = *argv_ptr = malloc( size );
-        if( argv )
-        {
-            for( int i = 0; i < argc; i++ )
-            {
-                argv[i] = (char*)argv + offset;
-                offset += WideCharToMultiByte( CP_UTF8, 0, argv_utf16[i], -1, argv[i], size-offset, NULL, NULL );
-            }
-            argv[argc] = NULL;
-            ret = 1;
-        }
-        LocalFree( argv_utf16 );
-    }
-    return ret;
-}
-#endif
-
-/* Ctrl-C handler */
-static volatile int b_ctrl_c = 0;
-static void sigint_handler( int a )
-{
-    b_ctrl_c = 1;
-}
-
-typedef struct {
-    int b_progress;
-    int i_seek;
-    hnd_t hin;
-    hnd_t hout;
-    FILE *qpfile;
-    FILE *tcfile_out;
-    double timebase_convert_multiplier;
-    int i_pulldown;
-} cli_opt_t;
-
-/* file i/o operation structs */
-cli_input_t cli_input;
-static cli_output_t cli_output;
-
-/* video filter operation struct */
-static cli_vid_filter_t filter;
-
-static const char * const demuxer_names[] =
-{
-    "auto",
-    "raw",
-    "y4m",
-#if HAVE_AVS
-    "avs",
-#endif
-#if HAVE_LAVF
-    "lavf",
-#endif
-#if HAVE_FFMS
-    "ffms",
-#endif
-    0
-};
-
-static const char * const muxer_names[] =
-{
-    "auto",
-    "raw",
-    "mkv",
-    "flv",
-#if HAVE_GPAC || HAVE_LSMASH
-    "mp4",
-#endif
-    0
-};
-
-static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
-static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-static const char * const output_csp_names[] =
-{
-#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I420
-    "i420",
-#endif
-#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I422
-    "i422",
-#endif
-#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I444
-    "i444", "rgb",
-#endif
-    0
-};
-static const char * const chroma_format_names[] =
-{
-    [0] = "all",
-    [X264_CSP_I420] = "i420",
-    [X264_CSP_I422] = "i422",
-    [X264_CSP_I444] = "i444"
-};
-
-static const char * const range_names[] = { "auto", "tv", "pc", 0 };
-
-typedef struct
-{
-    int mod;
-    uint8_t pattern[24];
-    float fps_factor;
-} cli_pulldown_t;
-
-enum pulldown_type_e
-{
-    X264_PULLDOWN_22 = 1,
-    X264_PULLDOWN_32,
-    X264_PULLDOWN_64,
-    X264_PULLDOWN_DOUBLE,
-    X264_PULLDOWN_TRIPLE,
-    X264_PULLDOWN_EURO
-};
-
-#define TB  PIC_STRUCT_TOP_BOTTOM
-#define BT  PIC_STRUCT_BOTTOM_TOP
-#define TBT PIC_STRUCT_TOP_BOTTOM_TOP
-#define BTB PIC_STRUCT_BOTTOM_TOP_BOTTOM
-
-static const cli_pulldown_t pulldown_values[] =
-{
-    [X264_PULLDOWN_22]     = {1,  {TB},                                   1.0},
-    [X264_PULLDOWN_32]     = {4,  {TBT, BT, BTB, TB},                     1.25},
-    [X264_PULLDOWN_64]     = {2,  {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0},
-    [X264_PULLDOWN_DOUBLE] = {1,  {PIC_STRUCT_DOUBLE},                    2.0},
-    [X264_PULLDOWN_TRIPLE] = {1,  {PIC_STRUCT_TRIPLE},                    3.0},
-    [X264_PULLDOWN_EURO]   = {24, {TBT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BT,
-                                   BTB, TB, TB, TB, TB, TB, TB, TB, TB, TB, TB, TB}, 25.0/24.0}
-};
-
-#undef TB
-#undef BT
-#undef TBT
-#undef BTB
-
-// indexed by pic_struct enum
-static const float pulldown_frame_duration[10] = { 0.0, 1, 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 3 };
-
-static void help( x264_param_t *defaults, int longhelp );
-static int  parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt );
-static int  encode( x264_param_t *param, cli_opt_t *opt );
-
-/* logging and printing for within the cli system */
-static int cli_log_level;
-void x264_cli_log( const char *name, int i_level, const char *fmt, ... )
-{
-    if( i_level > cli_log_level )
-        return;
-    char *s_level;
-    switch( i_level )
-    {
-        case X264_LOG_ERROR:
-            s_level = "error";
-            break;
-        case X264_LOG_WARNING:
-            s_level = "warning";
-            break;
-        case X264_LOG_INFO:
-            s_level = "info";
-            break;
-        case X264_LOG_DEBUG:
-            s_level = "debug";
-            break;
-        default:
-            s_level = "unknown";
-            break;
-    }
-    fprintf( stderr, "%s [%s]: ", name, s_level );
-    va_list arg;
-    va_start( arg, fmt );
-    x264_vfprintf( stderr, fmt, arg );
-    va_end( arg );
-}
-
-void x264_cli_printf( int i_level, const char *fmt, ... )
-{
-    if( i_level > cli_log_level )
-        return;
-    va_list arg;
-    va_start( arg, fmt );
-    x264_vfprintf( stderr, fmt, arg );
-    va_end( arg );
-}
-
-static void print_version_info( void )
-{
-#ifdef X264_POINTVER
-    printf( "x264 "X264_POINTVER"\n" );
-#else
-    printf( "x264 0.%d.X\n", X264_BUILD );
-#endif
-#if HAVE_SWSCALE
-    printf( "(libswscale %d.%d.%d)\n", LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO );
-#endif
-#if HAVE_LAVF
-    printf( "(libavformat %d.%d.%d)\n", LIBAVFORMAT_VERSION_MAJOR, LIBAVFORMAT_VERSION_MINOR, LIBAVFORMAT_VERSION_MICRO );
-#endif
-#if HAVE_FFMS
-    printf( "(ffmpegsource %d.%d.%d.%d)\n", FFMS_VERSION >> 24, (FFMS_VERSION & 0xff0000) >> 16, (FFMS_VERSION & 0xff00) >> 8, FFMS_VERSION & 0xff );
-#endif
-    printf( "built on " __DATE__ ", " );
-#ifdef __INTEL_COMPILER
-    printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE );
-#elif defined(__GNUC__)
-    printf( "gcc: " __VERSION__ "\n" );
-#elif defined(_MSC_FULL_VER)
-    printf( "msvc: %.2f (%u)\n", _MSC_VER / 100.f, _MSC_FULL_VER );
-#else
-    printf( "using an unknown compiler\n" );
-#endif
-    printf( "x264 configuration: --bit-depth=%d --chroma-format=%s\n", X264_BIT_DEPTH, chroma_format_names[X264_CHROMA_FORMAT] );
-    printf( "libx264 configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, chroma_format_names[x264_chroma_format] );
-    printf( "x264 license: " );
-#if HAVE_GPL
-    printf( "GPL version 2 or later\n" );
-#else
-    printf( "Non-GPL commercial\n" );
-#endif
-#if HAVE_SWSCALE
-    const char *license = swscale_license();
-    printf( "libswscale%s%s license: %s\n", HAVE_LAVF ? "/libavformat" : "", HAVE_FFMS ? "/ffmpegsource" : "" , license );
-    if( !strcmp( license, "nonfree and unredistributable" ) ||
-       (!HAVE_GPL && (!strcmp( license, "GPL version 2 or later" )
-                  ||  !strcmp( license, "GPL version 3 or later" ))))
-        printf( "WARNING: This binary is unredistributable!\n" );
-#endif
-}
-
-int main( int argc, char **argv )
-{
-    x264_param_t param;
-    cli_opt_t opt = {0};
-    int ret = 0;
-
-    FAIL_IF_ERROR( x264_threading_init(), "unable to initialize threading\n" );
-
-#ifdef _WIN32
-    FAIL_IF_ERROR( !get_argv_utf8( &argc, &argv ), "unable to convert command line to UTF-8\n" );
-
-    GetConsoleTitleW( org_console_title, CONSOLE_TITLE_SIZE );
-    _setmode( _fileno( stdin ),  _O_BINARY );
-    _setmode( _fileno( stdout ), _O_BINARY );
-    _setmode( _fileno( stderr ), _O_BINARY );
-#endif
-
-    /* Parse command line */
-    if( parse( argc, argv, &param, &opt ) < 0 )
-        ret = -1;
-
-#ifdef _WIN32
-    /* Restore title; it can be changed by input modules */
-    SetConsoleTitleW( org_console_title );
-#endif
-
-    /* Control-C handler */
-    signal( SIGINT, sigint_handler );
-
-    if( !ret )
-        ret = encode( &param, &opt );
-
-    /* clean up handles */
-    if( filter.free )
-        filter.free( opt.hin );
-    else if( opt.hin )
-        cli_input.close_file( opt.hin );
-    if( opt.hout )
-        cli_output.close_file( opt.hout, 0, 0 );
-    if( opt.tcfile_out )
-        fclose( opt.tcfile_out );
-    if( opt.qpfile )
-        fclose( opt.qpfile );
-
-#ifdef _WIN32
-    SetConsoleTitleW( org_console_title );
-    free( argv );
-#endif
-
-    return ret;
-}
-
-static char const *strtable_lookup( const char * const table[], int idx )
-{
-    int i = 0; while( table[i] ) i++;
-    return ( ( idx >= 0 && idx < i ) ? table[ idx ] : "???" );
-}
-
-static char *stringify_names( char *buf, const char * const names[] )
-{
-    int i = 0;
-    char *p = buf;
-    for( p[0] = 0; names[i]; i++ )
-    {
-        p += sprintf( p, "%s", names[i] );
-        if( names[i+1] )
-            p += sprintf( p, ", " );
-    }
-    return buf;
-}
-
-static void print_csp_names( int longhelp )
-{
-    if( longhelp < 2 )
-        return;
-#   define INDENT "                                "
-    printf( "                              - valid csps for `raw' demuxer:\n" );
-    printf( INDENT );
-    for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
-    {
-        if( x264_cli_csps[i].name )
-        {
-            printf( "%s", x264_cli_csps[i].name );
-            if( i+1 < X264_CSP_CLI_MAX )
-                printf( ", " );
-        }
-    }
-#if HAVE_LAVF
-    printf( "\n" );
-    printf( "                              - valid csps for `lavf' demuxer:\n" );
-    printf( INDENT );
-    size_t line_len = strlen( INDENT );
-    for( enum AVPixelFormat i = AV_PIX_FMT_NONE+1; i < AV_PIX_FMT_NB; i++ )
-    {
-        const char *pfname = av_get_pix_fmt_name( i );
-        if( pfname )
-        {
-            size_t name_len = strlen( pfname );
-            if( line_len + name_len > (80 - strlen( ", " )) )
-            {
-                printf( "\n" INDENT );
-                line_len = strlen( INDENT );
-            }
-            printf( "%s", pfname );
-            line_len += name_len;
-            if( i+1 < AV_PIX_FMT_NB )
-            {
-                printf( ", " );
-                line_len += 2;
-            }
-        }
-    }
-#endif
-    printf( "\n" );
-}
-
-static void help( x264_param_t *defaults, int longhelp )
-{
-    char buf[50];
-#define H0 printf
-#define H1 if(longhelp>=1) printf
-#define H2 if(longhelp==2) printf
-    H0( "x264 core:%d%s\n"
-        "Syntax: x264 [options] -o outfile infile\n"
-        "\n"
-        "Infile can be raw (in which case resolution is required),\n"
-        "  or YUV4MPEG (*.y4m),\n"
-        "  or Avisynth if compiled with support (%s).\n"
-        "  or libav* formats if compiled with lavf support (%s) or ffms support (%s).\n"
-        "Outfile type is selected by filename:\n"
-        " .264 -> Raw bytestream\n"
-        " .mkv -> Matroska\n"
-        " .flv -> Flash Video\n"
-        " .mp4 -> MP4 if compiled with GPAC or L-SMASH support (%s)\n"
-        "Output bit depth: %d (configured at compile time)\n"
-        "\n"
-        "Options:\n"
-        "\n"
-        "  -h, --help                  List basic options\n"
-        "      --longhelp              List more options\n"
-        "      --fullhelp              List all options\n"
-        "\n",
-        X264_BUILD, X264_VERSION,
-#if HAVE_AVS
-        "yes",
-#else
-        "no",
-#endif
-#if HAVE_LAVF
-        "yes",
-#else
-        "no",
-#endif
-#if HAVE_FFMS
-        "yes",
-#else
-        "no",
-#endif
-#if HAVE_GPAC
-        "gpac",
-#elif HAVE_LSMASH
-        "lsmash",
-#else
-        "no",
-#endif
-        x264_bit_depth
-      );
-    H0( "Example usage:\n" );
-    H0( "\n" );
-    H0( "      Constant quality mode:\n" );
-    H0( "            x264 --crf 24 -o <output> <input>\n" );
-    H0( "\n" );
-    H0( "      Two-pass with a bitrate of 1000kbps:\n" );
-    H0( "            x264 --pass 1 --bitrate 1000 -o <output> <input>\n" );
-    H0( "            x264 --pass 2 --bitrate 1000 -o <output> <input>\n" );
-    H0( "\n" );
-    H0( "      Lossless:\n" );
-    H0( "            x264 --qp 0 -o <output> <input>\n" );
-    H0( "\n" );
-    H0( "      Maximum PSNR at the cost of speed and visual quality:\n" );
-    H0( "            x264 --preset placebo --tune psnr -o <output> <input>\n" );
-    H0( "\n" );
-    H0( "      Constant bitrate at 1000kbps with a 2 second-buffer:\n");
-    H0( "            x264 --vbv-bufsize 2000 --bitrate 1000 -o <output> <input>\n" );
-    H0( "\n" );
-    H0( "Presets:\n" );
-    H0( "\n" );
-    H0( "      --profile <string>      Force the limits of an H.264 profile\n"
-        "                                  Overrides all settings.\n" );
-    H2(
-#if X264_CHROMA_FORMAT <= X264_CSP_I420
-#if X264_BIT_DEPTH==8
-        "                                  - baseline:\n"
-        "                                    --no-8x8dct --bframes 0 --no-cabac\n"
-        "                                    --cqm flat --weightp 0\n"
-        "                                    No interlaced.\n"
-        "                                    No lossless.\n"
-        "                                  - main:\n"
-        "                                    --no-8x8dct --cqm flat\n"
-        "                                    No lossless.\n"
-        "                                  - high:\n"
-        "                                    No lossless.\n"
-#endif
-        "                                  - high10:\n"
-        "                                    No lossless.\n"
-        "                                    Support for bit depth 8-10.\n"
-#endif
-#if X264_CHROMA_FORMAT <= X264_CSP_I422
-        "                                  - high422:\n"
-        "                                    No lossless.\n"
-        "                                    Support for bit depth 8-10.\n"
-        "                                    Support for 4:2:0/4:2:2 chroma subsampling.\n"
-#endif
-        "                                  - high444:\n"
-        "                                    Support for bit depth 8-10.\n"
-        "                                    Support for 4:2:0/4:2:2/4:4:4 chroma subsampling.\n" );
-        else H0(
-        "                                  - "
-#if X264_CHROMA_FORMAT <= X264_CSP_I420
-#if X264_BIT_DEPTH==8
-        "baseline,main,high,"
-#endif
-        "high10,"
-#endif
-#if X264_CHROMA_FORMAT <= X264_CSP_I422
-        "high422,"
-#endif
-        "high444\n"
-               );
-    H0( "      --preset <string>       Use a preset to select encoding settings [medium]\n"
-        "                                  Overridden by user settings.\n" );
-    H2( "                                  - ultrafast:\n"
-        "                                    --no-8x8dct --aq-mode 0 --b-adapt 0\n"
-        "                                    --bframes 0 --no-cabac --no-deblock\n"
-        "                                    --no-mbtree --me dia --no-mixed-refs\n"
-        "                                    --partitions none --rc-lookahead 0 --ref 1\n"
-        "                                    --scenecut 0 --subme 0 --trellis 0\n"
-        "                                    --no-weightb --weightp 0\n"
-        "                                  - superfast:\n"
-        "                                    --no-mbtree --me dia --no-mixed-refs\n"
-        "                                    --partitions i8x8,i4x4 --rc-lookahead 0\n"
-        "                                    --ref 1 --subme 1 --trellis 0 --weightp 1\n"
-        "                                  - veryfast:\n"
-        "                                    --no-mixed-refs --rc-lookahead 10\n"
-        "                                    --ref 1 --subme 2 --trellis 0 --weightp 1\n"
-        "                                  - faster:\n"
-        "                                    --no-mixed-refs --rc-lookahead 20\n"
-        "                                    --ref 2 --subme 4 --weightp 1\n"
-        "                                  - fast:\n"
-        "                                    --rc-lookahead 30 --ref 2 --subme 6\n"
-        "                                    --weightp 1\n"
-        "                                  - medium:\n"
-        "                                    Default settings apply.\n"
-        "                                  - slow:\n"
-        "                                    --b-adapt 2 --direct auto --me umh\n"
-        "                                    --rc-lookahead 50 --ref 5 --subme 8\n"
-        "                                  - slower:\n"
-        "                                    --b-adapt 2 --direct auto --me umh\n"
-        "                                    --partitions all --rc-lookahead 60\n"
-        "                                    --ref 8 --subme 9 --trellis 2\n"
-        "                                  - veryslow:\n"
-        "                                    --b-adapt 2 --bframes 8 --direct auto\n"
-        "                                    --me umh --merange 24 --partitions all\n"
-        "                                    --ref 16 --subme 10 --trellis 2\n"
-        "                                    --rc-lookahead 60\n"
-        "                                  - placebo:\n"
-        "                                    --bframes 16 --b-adapt 2 --direct auto\n"
-        "                                    --slow-firstpass --no-fast-pskip\n"
-        "                                    --me tesa --merange 24 --partitions all\n"
-        "                                    --rc-lookahead 60 --ref 16 --subme 11\n"
-        "                                    --trellis 2\n" );
-    else H0( "                                  - ultrafast,superfast,veryfast,faster,fast\n"
-             "                                  - medium,slow,slower,veryslow,placebo\n" );
-    H0( "      --tune <string>         Tune the settings for a particular type of source\n"
-        "                              or situation\n"
-        "                                  Overridden by user settings.\n"
-        "                                  Multiple tunings are separated by commas.\n"
-        "                                  Only one psy tuning can be used at a time.\n" );
-    H2( "                                  - film (psy tuning):\n"
-        "                                    --deblock -1:-1 --psy-rd <unset>:0.15\n"
-        "                                  - animation (psy tuning):\n"
-        "                                    --bframes {+2} --deblock 1:1\n"
-        "                                    --psy-rd 0.4:<unset> --aq-strength 0.6\n"
-        "                                    --ref {Double if >1 else 1}\n"
-        "                                  - grain (psy tuning):\n"
-        "                                    --aq-strength 0.5 --no-dct-decimate\n"
-        "                                    --deadzone-inter 6 --deadzone-intra 6\n"
-        "                                    --deblock -2:-2 --ipratio 1.1 \n"
-        "                                    --pbratio 1.1 --psy-rd <unset>:0.25\n"
-        "                                    --qcomp 0.8\n"
-        "                                  - stillimage (psy tuning):\n"
-        "                                    --aq-strength 1.2 --deblock -3:-3\n"
-        "                                    --psy-rd 2.0:0.7\n"
-        "                                  - psnr (psy tuning):\n"
-        "                                    --aq-mode 0 --no-psy\n"
-        "                                  - ssim (psy tuning):\n"
-        "                                    --aq-mode 2 --no-psy\n"
-        "                                  - fastdecode:\n"
-        "                                    --no-cabac --no-deblock --no-weightb\n"
-        "                                    --weightp 0\n"
-        "                                  - zerolatency:\n"
-        "                                    --bframes 0 --force-cfr --no-mbtree\n"
-        "                                    --sync-lookahead 0 --sliced-threads\n"
-        "                                    --rc-lookahead 0\n" );
-    else H0( "                                  - psy tunings: film,animation,grain,\n"
-             "                                                 stillimage,psnr,ssim\n"
-             "                                  - other tunings: fastdecode,zerolatency\n" );
-    H2( "      --slow-firstpass        Don't force these faster settings with --pass 1:\n"
-        "                                  --no-8x8dct --me dia --partitions none\n"
-        "                                  --ref 1 --subme {2 if >2 else unchanged}\n"
-        "                                  --trellis 0 --fast-pskip\n" );
-    else H1( "      --slow-firstpass        Don't force faster settings with --pass 1\n" );
-    H0( "\n" );
-    H0( "Frame-type options:\n" );
-    H0( "\n" );
-    H0( "  -I, --keyint <integer or \"infinite\"> Maximum GOP size [%d]\n", defaults->i_keyint_max );
-    H2( "  -i, --min-keyint <integer>  Minimum GOP size [auto]\n" );
-    H2( "      --no-scenecut           Disable adaptive I-frame decision\n" );
-    H2( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
-    H2( "      --intra-refresh         Use Periodic Intra Refresh instead of IDR frames\n" );
-    H1( "  -b, --bframes <integer>     Number of B-frames between I and P [%d]\n", defaults->i_bframe );
-    H1( "      --b-adapt <integer>     Adaptive B-frame decision method [%d]\n"
-        "                                  Higher values may lower threading efficiency.\n"
-        "                                  - 0: Disabled\n"
-        "                                  - 1: Fast\n"
-        "                                  - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive );
-    H2( "      --b-bias <integer>      Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
-    H1( "      --b-pyramid <string>    Keep some B-frames as references [%s]\n"
-        "                                  - none: Disabled\n"
-        "                                  - strict: Strictly hierarchical pyramid\n"
-        "                                  - normal: Non-strict (not Blu-ray compatible)\n",
-        strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) );
-    H1( "      --open-gop              Use recovery points to close GOPs\n"
-        "                              Only available with b-frames\n" );
-    H1( "      --no-cabac              Disable CABAC\n" );
-    H1( "  -r, --ref <integer>         Number of reference frames [%d]\n", defaults->i_frame_reference );
-    H1( "      --no-deblock            Disable loop filter\n" );
-    H1( "  -f, --deblock <alpha:beta>  Loop filter parameters [%d:%d]\n",
-                                       defaults->i_deblocking_filter_alphac0, defaults->i_deblocking_filter_beta );
-    H2( "      --slices <integer>      Number of slices per frame; forces rectangular\n"
-        "                              slices and is overridden by other slicing options\n" );
-    else H1( "      --slices <integer>      Number of slices per frame\n" );
-    H2( "      --slices-max <integer>  Absolute maximum slices per frame; overrides\n"
-        "                              slice-max-size/slice-max-mbs when necessary\n" );
-    H2( "      --slice-max-size <integer> Limit the size of each slice in bytes\n");
-    H2( "      --slice-max-mbs <integer> Limit the size of each slice in macroblocks (max)\n");
-    H2( "      --slice-min-mbs <integer> Limit the size of each slice in macroblocks (min)\n");
-    H0( "      --tff                   Enable interlaced mode (top field first)\n" );
-    H0( "      --bff                   Enable interlaced mode (bottom field first)\n" );
-    H2( "      --constrained-intra     Enable constrained intra prediction.\n" );
-    H0( "      --pulldown <string>     Use soft pulldown to change frame rate\n"
-        "                                  - none, 22, 32, 64, double, triple, euro (requires cfr input)\n" );
-    H2( "      --fake-interlaced       Flag stream as interlaced but encode progressive.\n"
-        "                              Makes it possible to encode 25p and 30p Blu-Ray\n"
-        "                              streams. Ignored in interlaced mode.\n" );
-    H2( "      --frame-packing <integer> For stereoscopic videos define frame arrangement\n"
-        "                                  - 0: checkerboard - pixels are alternatively from L and R\n"
-        "                                  - 1: column alternation - L and R are interlaced by column\n"
-        "                                  - 2: row alternation - L and R are interlaced by row\n"
-        "                                  - 3: side by side - L is on the left, R on the right\n"
-        "                                  - 4: top bottom - L is on top, R on bottom\n"
-        "                                  - 5: frame alternation - one view per frame\n"
-        "                                  - 6: mono - 2D frame without any frame packing\n"
-        "                                  - 7: tile format - L is on top-left, R split across\n" );
-    H0( "\n" );
-    H0( "Ratecontrol:\n" );
-    H0( "\n" );
-    H1( "  -q, --qp <integer>          Force constant QP (0-%d, 0=lossless)\n", QP_MAX );
-    H0( "  -B, --bitrate <integer>     Set bitrate (kbit/s)\n" );
-    H0( "      --crf <float>           Quality-based VBR (%d-51) [%.1f]\n", 51 - QP_MAX_SPEC, defaults->rc.f_rf_constant );
-    H1( "      --rc-lookahead <integer> Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead );
-    H0( "      --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
-    H0( "      --vbv-bufsize <integer> Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
-    H2( "      --vbv-init <float>      Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init );
-    H2( "      --crf-max <float>       With CRF+VBV, limit RF to this value\n"
-        "                                  May cause VBV underflows!\n" );
-    H2( "      --qpmin <integer>       Set min QP [%d]\n", defaults->rc.i_qp_min );
-    H2( "      --qpmax <integer>       Set max QP [%d]\n", defaults->rc.i_qp_max );
-    H2( "      --qpstep <integer>      Set max QP step [%d]\n", defaults->rc.i_qp_step );
-    H2( "      --ratetol <float>       Tolerance of ABR ratecontrol and VBV [%.1f]\n", defaults->rc.f_rate_tolerance );
-    H2( "      --ipratio <float>       QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
-    H2( "      --pbratio <float>       QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
-    H2( "      --chroma-qp-offset <integer>  QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
-    H2( "      --aq-mode <integer>     AQ method [%d]\n"
-        "                                  - 0: Disabled\n"
-        "                                  - 1: Variance AQ (complexity mask)\n"
-        "                                  - 2: Auto-variance AQ\n"
-        "                                  - 3: Auto-variance AQ with bias to dark scenes\n", defaults->rc.i_aq_mode );
-    H1( "      --aq-strength <float>   Reduces blocking and blurring in flat and\n"
-        "                              textured areas. [%.1f]\n", defaults->rc.f_aq_strength );
-    H1( "\n" );
-    H0( "  -p, --pass <integer>        Enable multipass ratecontrol\n"
-        "                                  - 1: First pass, creates stats file\n"
-        "                                  - 2: Last pass, does not overwrite stats file\n" );
-    H2( "                                  - 3: Nth pass, overwrites stats file\n" );
-    H1( "      --stats <string>        Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
-    H2( "      --no-mbtree             Disable mb-tree ratecontrol.\n");
-    H2( "      --qcomp <float>         QP curve compression [%.2f]\n", defaults->rc.f_qcompress );
-    H2( "      --cplxblur <float>      Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
-    H2( "      --qblur <float>         Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
-    H2( "      --zones <zone0>/<zone1>/...  Tweak the bitrate of regions of the video\n" );
-    H2( "                              Each zone is of the form\n"
-        "                                  <start frame>,<end frame>,<option>\n"
-        "                                  where <option> is either\n"
-        "                                      q=<integer> (force QP)\n"
-        "                                  or  b=<float> (bitrate multiplier)\n" );
-    H2( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
-        "                              Format of each line: framenumber frametype QP\n"
-        "                              QP is optional (none lets x264 choose). Frametypes: I,i,K,P,B,b.\n"
-        "                                  K=<I or i> depending on open-gop setting\n"
-        "                              QPs are restricted by qpmin/qpmax.\n" );
-    H1( "\n" );
-    H1( "Analysis:\n" );
-    H1( "\n" );
-    H1( "  -A, --partitions <string>   Partitions to consider [\"p8x8,b8x8,i8x8,i4x4\"]\n"
-        "                                  - p8x8, p4x4, b8x8, i8x8, i4x4\n"
-        "                                  - none, all\n"
-        "                                  (p4x4 requires p8x8. i8x8 requires --8x8dct.)\n" );
-    H1( "      --direct <string>       Direct MV prediction mode [\"%s\"]\n"
-        "                                  - none, spatial, temporal, auto\n",
-                                       strtable_lookup( x264_direct_pred_names, defaults->analyse.i_direct_mv_pred ) );
-    H2( "      --no-weightb            Disable weighted prediction for B-frames\n" );
-    H1( "      --weightp <integer>     Weighted prediction for P-frames [%d]\n"
-        "                                  - 0: Disabled\n"
-        "                                  - 1: Weighted refs\n"
-        "                                  - 2: Weighted refs + Duplicates\n", defaults->analyse.i_weighted_pred );
-    H1( "      --me <string>           Integer pixel motion estimation method [\"%s\"]\n",
-                                       strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
-    H2( "                                  - dia: diamond search, radius 1 (fast)\n"
-        "                                  - hex: hexagonal search, radius 2\n"
-        "                                  - umh: uneven multi-hexagon search\n"
-        "                                  - esa: exhaustive search\n"
-        "                                  - tesa: hadamard exhaustive search (slow)\n" );
-    else H1( "                                  - dia, hex, umh\n" );
-    H2( "      --merange <integer>     Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
-    H2( "      --mvrange <integer>     Maximum motion vector length [-1 (auto)]\n" );
-    H2( "      --mvrange-thread <int>  Minimum buffer between threads [-1 (auto)]\n" );
-    H1( "  -m, --subme <integer>       Subpixel motion estimation and mode decision [%d]\n", defaults->analyse.i_subpel_refine );
-    H2( "                                  - 0: fullpel only (not recommended)\n"
-        "                                  - 1: SAD mode decision, one qpel iteration\n"
-        "                                  - 2: SATD mode decision\n"
-        "                                  - 3-5: Progressively more qpel\n"
-        "                                  - 6: RD mode decision for I/P-frames\n"
-        "                                  - 7: RD mode decision for all frames\n"
-        "                                  - 8: RD refinement for I/P-frames\n"
-        "                                  - 9: RD refinement for all frames\n"
-        "                                  - 10: QP-RD - requires trellis=2, aq-mode>0\n"
-        "                                  - 11: Full RD: disable all early terminations\n" );
-    else H1( "                                  decision quality: 1=fast, 11=best\n" );
-    H1( "      --psy-rd <float:float>  Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
-        "                                  #1: RD (requires subme>=6)\n"
-        "                                  #2: Trellis (requires trellis, experimental)\n",
-                                       defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
-    H2( "      --no-psy                Disable all visual optimizations that worsen\n"
-        "                              both PSNR and SSIM.\n" );
-    H2( "      --no-mixed-refs         Don't decide references on a per partition basis\n" );
-    H2( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
-    H1( "      --no-8x8dct             Disable adaptive spatial transform size\n" );
-    H1( "  -t, --trellis <integer>     Trellis RD quantization. [%d]\n"
-        "                                  - 0: disabled\n"
-        "                                  - 1: enabled only on the final encode of a MB\n"
-        "                                  - 2: enabled on all mode decisions\n", defaults->analyse.i_trellis );
-    H2( "      --no-fast-pskip         Disables early SKIP detection on P-frames\n" );
-    H2( "      --no-dct-decimate       Disables coefficient thresholding on P-frames\n" );
-    H1( "      --nr <integer>          Noise reduction [%d]\n", defaults->analyse.i_noise_reduction );
-    H2( "\n" );
-    H2( "      --deadzone-inter <int>  Set the size of the inter luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[0] );
-    H2( "      --deadzone-intra <int>  Set the size of the intra luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[1] );
-    H2( "                                  Deadzones should be in the range 0 - 32.\n" );
-    H2( "      --cqm <string>          Preset quant matrices [\"flat\"]\n"
-        "                                  - jvt, flat\n" );
-    H1( "      --cqmfile <string>      Read custom quant matrices from a JM-compatible file\n" );
-    H2( "                                  Overrides any other --cqm* options.\n" );
-    H2( "      --cqm4 <list>           Set all 4x4 quant matrices\n"
-        "                                  Takes a comma-separated list of 16 integers.\n" );
-    H2( "      --cqm8 <list>           Set all 8x8 quant matrices\n"
-        "                                  Takes a comma-separated list of 64 integers.\n" );
-    H2( "      --cqm4i, --cqm4p, --cqm8i, --cqm8p <list>\n"
-        "                              Set both luma and chroma quant matrices\n" );
-    H2( "      --cqm4iy, --cqm4ic, --cqm4py, --cqm4pc <list>\n"
-        "                              Set individual quant matrices\n" );
-    H2( "\n" );
-    H2( "Video Usability Info (Annex E):\n" );
-    H2( "The VUI settings are not used by the encoder but are merely suggestions to\n" );
-    H2( "the playback equipment. See doc/vui.txt for details. Use at your own risk.\n" );
-    H2( "\n" );
-    H2( "      --overscan <string>     Specify crop overscan setting [\"%s\"]\n"
-        "                                  - undef, show, crop\n",
-                                       strtable_lookup( x264_overscan_names, defaults->vui.i_overscan ) );
-    H2( "      --videoformat <string>  Specify video format [\"%s\"]\n"
-        "                                  - component, pal, ntsc, secam, mac, undef\n",
-                                       strtable_lookup( x264_vidformat_names, defaults->vui.i_vidformat ) );
-    H2( "      --range <string>        Specify color range [\"%s\"]\n"
-        "                                  - %s\n", range_names[0], stringify_names( buf, range_names ) );
-    H2( "      --colorprim <string>    Specify color primaries [\"%s\"]\n"
-        "                                  - undef, bt709, bt470m, bt470bg, smpte170m,\n"
-        "                                    smpte240m, film, bt2020, smpte428,\n"
-        "                                    smpte431, smpte432\n",
-                                       strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) );
-    H2( "      --transfer <string>     Specify transfer characteristics [\"%s\"]\n"
-        "                                  - undef, bt709, bt470m, bt470bg, smpte170m,\n"
-        "                                    smpte240m, linear, log100, log316,\n"
-        "                                    iec61966-2-4, bt1361e, iec61966-2-1,\n"
-        "                                    bt2020-10, bt2020-12, smpte2084, smpte428\n",
-                                       strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) );
-    H2( "      --colormatrix <string>  Specify color matrix setting [\"%s\"]\n"
-        "                                  - undef, bt709, fcc, bt470bg, smpte170m,\n"
-        "                                    smpte240m, GBR, YCgCo, bt2020nc, bt2020c,\n"
-        "                                    smpte2085\n",
-                                       strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) );
-    H2( "      --chromaloc <integer>   Specify chroma sample location (0 to 5) [%d]\n",
-                                       defaults->vui.i_chroma_loc );
-
-    H2( "      --nal-hrd <string>      Signal HRD information (requires vbv-bufsize)\n"
-        "                                  - none, vbr, cbr (cbr not allowed in .mp4)\n" );
-    H2( "      --filler                Force hard-CBR and generate filler (implied by\n"
-        "                              --nal-hrd cbr)\n" );
-    H2( "      --pic-struct            Force pic_struct in Picture Timing SEI\n" );
-    H2( "      --crop-rect <string>    Add 'left,top,right,bottom' to the bitstream-level\n"
-        "                              cropping rectangle\n" );
-
-    H0( "\n" );
-    H0( "Input/Output:\n" );
-    H0( "\n" );
-    H0( "  -o, --output <string>       Specify output file\n" );
-    H1( "      --muxer <string>        Specify output container format [\"%s\"]\n"
-        "                                  - %s\n", muxer_names[0], stringify_names( buf, muxer_names ) );
-    H1( "      --demuxer <string>      Specify input container format [\"%s\"]\n"
-        "                                  - %s\n", demuxer_names[0], stringify_names( buf, demuxer_names ) );
-    H1( "      --input-fmt <string>    Specify input file format (requires lavf support)\n" );
-    H1( "      --input-csp <string>    Specify input colorspace format for raw input\n" );
-    print_csp_names( longhelp );
-    H1( "      --output-csp <string>   Specify output colorspace [\"%s\"]\n"
-        "                                  - %s\n", output_csp_names[0], stringify_names( buf, output_csp_names ) );
-    H1( "      --input-depth <integer> Specify input bit depth for raw input\n" );
-    H1( "      --input-range <string>  Specify input color range [\"%s\"]\n"
-        "                                  - %s\n", range_names[0], stringify_names( buf, range_names ) );
-    H1( "      --input-res <intxint>   Specify input resolution (width x height)\n" );
-    H1( "      --index <string>        Filename for input index file\n" );
-    H0( "      --sar width:height      Specify Sample Aspect Ratio\n" );
-    H0( "      --fps <float|rational>  Specify framerate\n" );
-    H0( "      --seek <integer>        First frame to encode\n" );
-    H0( "      --frames <integer>      Maximum number of frames to encode\n" );
-    H0( "      --level <string>        Specify level (as defined by Annex A)\n" );
-    H1( "      --bluray-compat         Enable compatibility hacks for Blu-ray support\n" );
-    H1( "      --avcintra-class <integer> Use compatibility hacks for AVC-Intra class\n"
-        "                                  - 50, 100, 200\n" );
-    H1( "      --stitchable            Don't optimize headers based on video content\n"
-        "                              Ensures ability to recombine a segmented encode\n" );
-    H1( "\n" );
-    H1( "  -v, --verbose               Print stats for each frame\n" );
-    H1( "      --no-progress           Don't show the progress indicator while encoding\n" );
-    H0( "      --quiet                 Quiet Mode\n" );
-    H1( "      --log-level <string>    Specify the maximum level of logging [\"%s\"]\n"
-        "                                  - %s\n", strtable_lookup( log_level_names, cli_log_level - X264_LOG_NONE ),
-                                       stringify_names( buf, log_level_names ) );
-    H1( "      --psnr                  Enable PSNR computation\n" );
-    H1( "      --ssim                  Enable SSIM computation\n" );
-    H1( "      --threads <integer>     Force a specific number of threads\n" );
-    H2( "      --lookahead-threads <integer> Force a specific number of lookahead threads\n" );
-    H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
-    H2( "      --thread-input          Run Avisynth in its own thread\n" );
-    H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
-    H2( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
-    H2( "      --cpu-independent       Ensure exact reproducibility across different cpus,\n"
-        "                                  as opposed to letting them select different algorithms\n" );
-    H2( "      --asm <integer>         Override CPU detection\n" );
-    H2( "      --no-asm                Disable all CPU optimizations\n" );
-    H2( "      --opencl                Enable use of OpenCL\n" );
-    H2( "      --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" );
-    H2( "      --opencl-device <integer> Specify OpenCL device ordinal\n" );
-    H2( "      --dump-yuv <string>     Save reconstructed frames\n" );
-    H2( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
-    H2( "      --aud                   Use access unit delimiters\n" );
-    H2( "      --force-cfr             Force constant framerate timestamp generation\n" );
-    H2( "      --tcfile-in <string>    Force timestamp generation with timecode file\n" );
-    H2( "      --tcfile-out <string>   Output timecode v2 file from input timestamps\n" );
-    H2( "      --timebase <int/int>    Specify timebase numerator and denominator\n"
-        "                 <integer>    Specify timebase numerator for input timecode file\n"
-        "                              or specify timebase denominator for other input\n" );
-    H2( "      --dts-compress          Eliminate initial delay with container DTS hack\n" );
-    H0( "\n" );
-    H0( "Filtering:\n" );
-    H0( "\n" );
-    H0( "      --vf, --video-filter <filter0>/<filter1>/... Apply video filtering to the input file\n" );
-    H0( "\n" );
-    H0( "      Filter options may be specified in <filter>:<option>=<value> format.\n" );
-    H0( "\n" );
-    H0( "      Available filters:\n" );
-    x264_register_vid_filters();
-    x264_vid_filter_help( longhelp );
-    H0( "\n" );
-}
-
-typedef enum
-{
-    OPT_FRAMES = 256,
-    OPT_SEEK,
-    OPT_QPFILE,
-    OPT_THREAD_INPUT,
-    OPT_QUIET,
-    OPT_NOPROGRESS,
-    OPT_LONGHELP,
-    OPT_PROFILE,
-    OPT_PRESET,
-    OPT_TUNE,
-    OPT_SLOWFIRSTPASS,
-    OPT_FULLHELP,
-    OPT_FPS,
-    OPT_MUXER,
-    OPT_DEMUXER,
-    OPT_INDEX,
-    OPT_INTERLACED,
-    OPT_TCFILE_IN,
-    OPT_TCFILE_OUT,
-    OPT_TIMEBASE,
-    OPT_PULLDOWN,
-    OPT_LOG_LEVEL,
-    OPT_VIDEO_FILTER,
-    OPT_INPUT_FMT,
-    OPT_INPUT_RES,
-    OPT_INPUT_CSP,
-    OPT_INPUT_DEPTH,
-    OPT_DTS_COMPRESSION,
-    OPT_OUTPUT_CSP,
-    OPT_INPUT_RANGE,
-    OPT_RANGE
-} OptionsOPT;
-
-static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw";
-static struct option long_options[] =
-{
-    { "help",              no_argument, NULL, 'h' },
-    { "longhelp",          no_argument, NULL, OPT_LONGHELP },
-    { "fullhelp",          no_argument, NULL, OPT_FULLHELP },
-    { "version",           no_argument, NULL, 'V' },
-    { "profile",     required_argument, NULL, OPT_PROFILE },
-    { "preset",      required_argument, NULL, OPT_PRESET },
-    { "tune",        required_argument, NULL, OPT_TUNE },
-    { "slow-firstpass",    no_argument, NULL, OPT_SLOWFIRSTPASS },
-    { "bitrate",     required_argument, NULL, 'B' },
-    { "bframes",     required_argument, NULL, 'b' },
-    { "b-adapt",     required_argument, NULL, 0 },
-    { "no-b-adapt",        no_argument, NULL, 0 },
-    { "b-bias",      required_argument, NULL, 0 },
-    { "b-pyramid",   required_argument, NULL, 0 },
-    { "open-gop",          no_argument, NULL, 0 },
-    { "bluray-compat",     no_argument, NULL, 0 },
-    { "avcintra-class", required_argument, NULL, 0 },
-    { "min-keyint",  required_argument, NULL, 'i' },
-    { "keyint",      required_argument, NULL, 'I' },
-    { "intra-refresh",     no_argument, NULL, 0 },
-    { "scenecut",    required_argument, NULL, 0 },
-    { "no-scenecut",       no_argument, NULL, 0 },
-    { "nf",                no_argument, NULL, 0 },
-    { "no-deblock",        no_argument, NULL, 0 },
-    { "filter",      required_argument, NULL, 0 },
-    { "deblock",     required_argument, NULL, 'f' },
-    { "interlaced",        no_argument, NULL, OPT_INTERLACED },
-    { "tff",               no_argument, NULL, OPT_INTERLACED },
-    { "bff",               no_argument, NULL, OPT_INTERLACED },
-    { "no-interlaced",     no_argument, NULL, OPT_INTERLACED },
-    { "constrained-intra", no_argument, NULL, 0 },
-    { "cabac",             no_argument, NULL, 0 },
-    { "no-cabac",          no_argument, NULL, 0 },
-    { "qp",          required_argument, NULL, 'q' },
-    { "qpmin",       required_argument, NULL, 0 },
-    { "qpmax",       required_argument, NULL, 0 },
-    { "qpstep",      required_argument, NULL, 0 },
-    { "crf",         required_argument, NULL, 0 },
-    { "rc-lookahead",required_argument, NULL, 0 },
-    { "ref",         required_argument, NULL, 'r' },
-    { "asm",         required_argument, NULL, 0 },
-    { "no-asm",            no_argument, NULL, 0 },
-    { "opencl",            no_argument, NULL, 1 },
-    { "opencl-clbin",required_argument, NULL, 0 },
-    { "opencl-device",required_argument, NULL, 0 },
-    { "sar",         required_argument, NULL, 0 },
-    { "fps",         required_argument, NULL, OPT_FPS },
-    { "frames",      required_argument, NULL, OPT_FRAMES },
-    { "seek",        required_argument, NULL, OPT_SEEK },
-    { "output",      required_argument, NULL, 'o' },
-    { "muxer",       required_argument, NULL, OPT_MUXER },
-    { "demuxer",     required_argument, NULL, OPT_DEMUXER },
-    { "stdout",      required_argument, NULL, OPT_MUXER },
-    { "stdin",       required_argument, NULL, OPT_DEMUXER },
-    { "index",       required_argument, NULL, OPT_INDEX },
-    { "analyse",     required_argument, NULL, 0 },
-    { "partitions",  required_argument, NULL, 'A' },
-    { "direct",      required_argument, NULL, 0 },
-    { "weightb",           no_argument, NULL, 'w' },
-    { "no-weightb",        no_argument, NULL, 0 },
-    { "weightp",     required_argument, NULL, 0 },
-    { "me",          required_argument, NULL, 0 },
-    { "merange",     required_argument, NULL, 0 },
-    { "mvrange",     required_argument, NULL, 0 },
-    { "mvrange-thread", required_argument, NULL, 0 },
-    { "subme",       required_argument, NULL, 'm' },
-    { "psy-rd",      required_argument, NULL, 0 },
-    { "no-psy",            no_argument, NULL, 0 },
-    { "psy",               no_argument, NULL, 0 },
-    { "mixed-refs",        no_argument, NULL, 0 },
-    { "no-mixed-refs",     no_argument, NULL, 0 },
-    { "no-chroma-me",      no_argument, NULL, 0 },
-    { "8x8dct",            no_argument, NULL, '8' },
-    { "no-8x8dct",         no_argument, NULL, 0 },
-    { "trellis",     required_argument, NULL, 't' },
-    { "fast-pskip",        no_argument, NULL, 0 },
-    { "no-fast-pskip",     no_argument, NULL, 0 },
-    { "no-dct-decimate",   no_argument, NULL, 0 },
-    { "aq-strength", required_argument, NULL, 0 },
-    { "aq-mode",     required_argument, NULL, 0 },
-    { "deadzone-inter", required_argument, NULL, 0 },
-    { "deadzone-intra", required_argument, NULL, 0 },
-    { "level",       required_argument, NULL, 0 },
-    { "ratetol",     required_argument, NULL, 0 },
-    { "vbv-maxrate", required_argument, NULL, 0 },
-    { "vbv-bufsize", required_argument, NULL, 0 },
-    { "vbv-init",    required_argument, NULL, 0 },
-    { "crf-max",     required_argument, NULL, 0 },
-    { "ipratio",     required_argument, NULL, 0 },
-    { "pbratio",     required_argument, NULL, 0 },
-    { "chroma-qp-offset", required_argument, NULL, 0 },
-    { "pass",        required_argument, NULL, 'p' },
-    { "stats",       required_argument, NULL, 0 },
-    { "qcomp",       required_argument, NULL, 0 },
-    { "mbtree",            no_argument, NULL, 0 },
-    { "no-mbtree",         no_argument, NULL, 0 },
-    { "qblur",       required_argument, NULL, 0 },
-    { "cplxblur",    required_argument, NULL, 0 },
-    { "zones",       required_argument, NULL, 0 },
-    { "qpfile",      required_argument, NULL, OPT_QPFILE },
-    { "threads",     required_argument, NULL, 0 },
-    { "lookahead-threads", required_argument, NULL, 0 },
-    { "sliced-threads",    no_argument, NULL, 0 },
-    { "no-sliced-threads", no_argument, NULL, 0 },
-    { "slice-max-size",    required_argument, NULL, 0 },
-    { "slice-max-mbs",     required_argument, NULL, 0 },
-    { "slice-min-mbs",     required_argument, NULL, 0 },
-    { "slices",            required_argument, NULL, 0 },
-    { "slices-max",        required_argument, NULL, 0 },
-    { "thread-input",      no_argument, NULL, OPT_THREAD_INPUT },
-    { "sync-lookahead",    required_argument, NULL, 0 },
-    { "non-deterministic", no_argument, NULL, 0 },
-    { "cpu-independent",   no_argument, NULL, 0 },
-    { "psnr",              no_argument, NULL, 0 },
-    { "ssim",              no_argument, NULL, 0 },
-    { "quiet",             no_argument, NULL, OPT_QUIET },
-    { "verbose",           no_argument, NULL, 'v' },
-    { "log-level",   required_argument, NULL, OPT_LOG_LEVEL },
-    { "no-progress",       no_argument, NULL, OPT_NOPROGRESS },
-    { "dump-yuv",    required_argument, NULL, 0 },
-    { "sps-id",      required_argument, NULL, 0 },
-    { "aud",               no_argument, NULL, 0 },
-    { "nr",          required_argument, NULL, 0 },
-    { "cqm",         required_argument, NULL, 0 },
-    { "cqmfile",     required_argument, NULL, 0 },
-    { "cqm4",        required_argument, NULL, 0 },
-    { "cqm4i",       required_argument, NULL, 0 },
-    { "cqm4iy",      required_argument, NULL, 0 },
-    { "cqm4ic",      required_argument, NULL, 0 },
-    { "cqm4p",       required_argument, NULL, 0 },
-    { "cqm4py",      required_argument, NULL, 0 },
-    { "cqm4pc",      required_argument, NULL, 0 },
-    { "cqm8",        required_argument, NULL, 0 },
-    { "cqm8i",       required_argument, NULL, 0 },
-    { "cqm8p",       required_argument, NULL, 0 },
-    { "overscan",    required_argument, NULL, 0 },
-    { "videoformat", required_argument, NULL, 0 },
-    { "range",       required_argument, NULL, OPT_RANGE },
-    { "colorprim",   required_argument, NULL, 0 },
-    { "transfer",    required_argument, NULL, 0 },
-    { "colormatrix", required_argument, NULL, 0 },
-    { "chromaloc",   required_argument, NULL, 0 },
-    { "force-cfr",         no_argument, NULL, 0 },
-    { "tcfile-in",   required_argument, NULL, OPT_TCFILE_IN },
-    { "tcfile-out",  required_argument, NULL, OPT_TCFILE_OUT },
-    { "timebase",    required_argument, NULL, OPT_TIMEBASE },
-    { "pic-struct",        no_argument, NULL, 0 },
-    { "crop-rect",   required_argument, NULL, 0 },
-    { "nal-hrd",     required_argument, NULL, 0 },
-    { "pulldown",    required_argument, NULL, OPT_PULLDOWN },
-    { "fake-interlaced",   no_argument, NULL, 0 },
-    { "frame-packing",     required_argument, NULL, 0 },
-    { "vf",          required_argument, NULL, OPT_VIDEO_FILTER },
-    { "video-filter", required_argument, NULL, OPT_VIDEO_FILTER },
-    { "input-fmt",   required_argument, NULL, OPT_INPUT_FMT },
-    { "input-res",   required_argument, NULL, OPT_INPUT_RES },
-    { "input-csp",   required_argument, NULL, OPT_INPUT_CSP },
-    { "input-depth", required_argument, NULL, OPT_INPUT_DEPTH },
-    { "dts-compress",      no_argument, NULL, OPT_DTS_COMPRESSION },
-    { "output-csp",  required_argument, NULL, OPT_OUTPUT_CSP },
-    { "input-range", required_argument, NULL, OPT_INPUT_RANGE },
-    { "stitchable",        no_argument, NULL, 0 },
-    { "filler",            no_argument, NULL, 0 },
-    {0, 0, 0, 0}
-};
-
-static int select_output( const char *muxer, char *filename, x264_param_t *param )
-{
-    const char *ext = get_filename_extension( filename );
-    if( !strcmp( filename, "-" ) || strcasecmp( muxer, "auto" ) )
-        ext = muxer;
-
-    if( !strcasecmp( ext, "mp4" ) )
-    {
-#if HAVE_GPAC || HAVE_LSMASH
-        cli_output = mp4_output;
-        param->b_annexb = 0;
-        param->b_repeat_headers = 0;
-        if( param->i_nal_hrd == X264_NAL_HRD_CBR )
-        {
-            x264_cli_log( "x264", X264_LOG_WARNING, "cbr nal-hrd is not compatible with mp4\n" );
-            param->i_nal_hrd = X264_NAL_HRD_VBR;
-        }
-#else
-        x264_cli_log( "x264", X264_LOG_ERROR, "not compiled with MP4 output support\n" );
-        return -1;
-#endif
-    }
-    else if( !strcasecmp( ext, "mkv" ) )
-    {
-        cli_output = mkv_output;
-        param->b_annexb = 0;
-        param->b_repeat_headers = 0;
-    }
-    else if( !strcasecmp( ext, "flv" ) )
-    {
-        cli_output = flv_output;
-        param->b_annexb = 0;
-        param->b_repeat_headers = 0;
-    }
-    else
-        cli_output = raw_output;
-    return 0;
-}
-
-static int select_input( const char *demuxer, char *used_demuxer, char *filename,
-                         hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
-{
-    int b_auto = !strcasecmp( demuxer, "auto" );
-    const char *ext = b_auto ? get_filename_extension( filename ) : "";
-    int b_regular = strcmp( filename, "-" );
-    if( !b_regular && b_auto )
-        ext = "raw";
-    b_regular = b_regular && x264_is_regular_file_path( filename );
-    if( b_regular )
-    {
-        FILE *f = x264_fopen( filename, "r" );
-        if( f )
-        {
-            b_regular = x264_is_regular_file( f );
-            fclose( f );
-        }
-    }
-    const char *module = b_auto ? ext : demuxer;
-
-    if( !strcasecmp( module, "avs" ) || !strcasecmp( ext, "d2v" ) || !strcasecmp( ext, "dga" ) )
-    {
-#if HAVE_AVS
-        cli_input = avs_input;
-        module = "avs";
-#else
-        x264_cli_log( "x264", X264_LOG_ERROR, "not compiled with AVS input support\n" );
-        return -1;
-#endif
-    }
-    else if( !strcasecmp( module, "y4m" ) )
-        cli_input = y4m_input;
-    else if( !strcasecmp( module, "raw" ) || !strcasecmp( ext, "yuv" ) )
-        cli_input = raw_input;
-    else
-    {
-#if HAVE_FFMS
-        if( b_regular && (b_auto || !strcasecmp( demuxer, "ffms" )) &&
-            !ffms_input.open_file( filename, p_handle, info, opt ) )
-        {
-            module = "ffms";
-            b_auto = 0;
-            cli_input = ffms_input;
-        }
-#endif
-#if HAVE_LAVF
-        if( (b_auto || !strcasecmp( demuxer, "lavf" )) &&
-            !lavf_input.open_file( filename, p_handle, info, opt ) )
-        {
-            module = "lavf";
-            b_auto = 0;
-            cli_input = lavf_input;
-        }
-#endif
-#if HAVE_AVS
-        if( b_regular && (b_auto || !strcasecmp( demuxer, "avs" )) &&
-            !avs_input.open_file( filename, p_handle, info, opt ) )
-        {
-            module = "avs";
-            b_auto = 0;
-            cli_input = avs_input;
-        }
-#endif
-        if( b_auto && !raw_input.open_file( filename, p_handle, info, opt ) )
-        {
-            module = "raw";
-            b_auto = 0;
-            cli_input = raw_input;
-        }
-
-        FAIL_IF_ERROR( !(*p_handle), "could not open input file `%s' via any method!\n", filename );
-    }
-    strcpy( used_demuxer, module );
-
-    return 0;
-}
-
-static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info, x264_param_t *param, int output_csp )
-{
-    x264_register_vid_filters();
-
-    /* intialize baseline filters */
-    if( x264_init_vid_filter( "source", handle, &filter, info, param, NULL ) ) /* wrap demuxer into a filter */
-        return -1;
-    if( x264_init_vid_filter( "resize", handle, &filter, info, param, "normcsp" ) ) /* normalize csps to be of a known/supported format */
-        return -1;
-    if( x264_init_vid_filter( "fix_vfr_pts", handle, &filter, info, param, NULL ) ) /* fix vfr pts */
-        return -1;
-
-    /* parse filter chain */
-    for( char *p = sequence; p && *p; )
-    {
-        int tok_len = strcspn( p, "/" );
-        int p_len = strlen( p );
-        p[tok_len] = 0;
-        int name_len = strcspn( p, ":" );
-        p[name_len] = 0;
-        name_len += name_len != tok_len;
-        if( x264_init_vid_filter( p, handle, &filter, info, param, p + name_len ) )
-            return -1;
-        p += X264_MIN( tok_len+1, p_len );
-    }
-
-    /* force end result resolution */
-    if( !param->i_width && !param->i_height )
-    {
-        param->i_height = info->height;
-        param->i_width  = info->width;
-    }
-    /* force the output csp to what the user specified (or the default) */
-    param->i_csp = info->csp;
-    int csp = info->csp & X264_CSP_MASK;
-    if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp >= X264_CSP_I422) )
-        param->i_csp = X264_CSP_I420;
-    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp >= X264_CSP_I444) )
-        param->i_csp = X264_CSP_I422;
-    else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp >= X264_CSP_BGR) )
-        param->i_csp = X264_CSP_I444;
-    else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
-        param->i_csp = X264_CSP_RGB;
-    param->i_csp |= info->csp & X264_CSP_HIGH_DEPTH;
-    /* if the output range is not forced, assign it to the input one now */
-    if( param->vui.b_fullrange == RANGE_AUTO )
-        param->vui.b_fullrange = info->fullrange;
-
-    if( x264_init_vid_filter( "resize", handle, &filter, info, param, NULL ) )
-        return -1;
-
-    char args[20];
-    sprintf( args, "bit_depth=%d", x264_bit_depth );
-
-    if( x264_init_vid_filter( "depth", handle, &filter, info, param, args ) )
-        return -1;
-
-    return 0;
-}
-
-static int parse_enum_name( const char *arg, const char * const *names, const char **dst )
-{
-    for( int i = 0; names[i]; i++ )
-        if( !strcasecmp( arg, names[i] ) )
-        {
-            *dst = names[i];
-            return 0;
-        }
-    return -1;
-}
-
-static int parse_enum_value( const char *arg, const char * const *names, int *dst )
-{
-    for( int i = 0; names[i]; i++ )
-        if( !strcasecmp( arg, names[i] ) )
-        {
-            *dst = i;
-            return 0;
-        }
-    return -1;
-}
-
-static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
-{
-    char *input_filename = NULL;
-    const char *demuxer = demuxer_names[0];
-    char *output_filename = NULL;
-    const char *muxer = muxer_names[0];
-    char *tcfile_name = NULL;
-    x264_param_t defaults;
-    char *profile = NULL;
-    char *vid_filters = NULL;
-    int b_thread_input = 0;
-    int b_turbo = 1;
-    int b_user_ref = 0;
-    int b_user_fps = 0;
-    int b_user_interlaced = 0;
-    cli_input_opt_t input_opt;
-    cli_output_opt_t output_opt;
-    char *preset = NULL;
-    char *tune = NULL;
-
-    x264_param_default( &defaults );
-    cli_log_level = defaults.i_log_level;
-
-    memset( &input_opt, 0, sizeof(cli_input_opt_t) );
-    memset( &output_opt, 0, sizeof(cli_output_opt_t) );
-    input_opt.bit_depth = 8;
-    input_opt.input_range = input_opt.output_range = param->vui.b_fullrange = RANGE_AUTO;
-    int output_csp = defaults.i_csp;
-    opt->b_progress = 1;
-
-    /* Presets are applied before all other options. */
-    for( optind = 0;; )
-    {
-        int c = getopt_long( argc, argv, short_options, long_options, NULL );
-        if( c == -1 )
-            break;
-        if( c == OPT_PRESET )
-            preset = optarg;
-        if( c == OPT_TUNE )
-            tune = optarg;
-        else if( c == '?' )
-            return -1;
-    }
-
-    if( preset && !strcasecmp( preset, "placebo" ) )
-        b_turbo = 0;
-
-    if( x264_param_default_preset( param, preset, tune ) < 0 )
-        return -1;
-
-    /* Parse command line options */
-    for( optind = 0;; )
-    {
-        int b_error = 0;
-        int long_options_index = -1;
-
-        int c = getopt_long( argc, argv, short_options, long_options, &long_options_index );
-
-        if( c == -1 )
-        {
-            break;
-        }
-
-        switch( c )
-        {
-            case 'h':
-                help( &defaults, 0 );
-                exit(0);
-            case OPT_LONGHELP:
-                help( &defaults, 1 );
-                exit(0);
-            case OPT_FULLHELP:
-                help( &defaults, 2 );
-                exit(0);
-            case 'V':
-                print_version_info();
-                exit(0);
-            case OPT_FRAMES:
-                param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
-                break;
-            case OPT_SEEK:
-                opt->i_seek = X264_MAX( atoi( optarg ), 0 );
-                break;
-            case 'o':
-                output_filename = optarg;
-                break;
-            case OPT_MUXER:
-                FAIL_IF_ERROR( parse_enum_name( optarg, muxer_names, &muxer ), "Unknown muxer `%s'\n", optarg );
-                break;
-            case OPT_DEMUXER:
-                FAIL_IF_ERROR( parse_enum_name( optarg, demuxer_names, &demuxer ), "Unknown demuxer `%s'\n", optarg );
-                break;
-            case OPT_INDEX:
-                input_opt.index_file = optarg;
-                break;
-            case OPT_QPFILE:
-                opt->qpfile = x264_fopen( optarg, "rb" );
-                FAIL_IF_ERROR( !opt->qpfile, "can't open qpfile `%s'\n", optarg );
-                if( !x264_is_regular_file( opt->qpfile ) )
-                {
-                    x264_cli_log( "x264", X264_LOG_ERROR, "qpfile incompatible with non-regular file `%s'\n", optarg );
-                    fclose( opt->qpfile );
-                    return -1;
-                }
-                break;
-            case OPT_THREAD_INPUT:
-                b_thread_input = 1;
-                break;
-            case OPT_QUIET:
-                cli_log_level = param->i_log_level = X264_LOG_NONE;
-                break;
-            case 'v':
-                cli_log_level = param->i_log_level = X264_LOG_DEBUG;
-                break;
-            case OPT_LOG_LEVEL:
-                if( !parse_enum_value( optarg, log_level_names, &cli_log_level ) )
-                    cli_log_level += X264_LOG_NONE;
-                else
-                    cli_log_level = atoi( optarg );
-                param->i_log_level = cli_log_level;
-                break;
-            case OPT_NOPROGRESS:
-                opt->b_progress = 0;
-                break;
-            case OPT_TUNE:
-            case OPT_PRESET:
-                break;
-            case OPT_PROFILE:
-                profile = optarg;
-                break;
-            case OPT_SLOWFIRSTPASS:
-                b_turbo = 0;
-                break;
-            case 'r':
-                b_user_ref = 1;
-                goto generic_option;
-            case OPT_FPS:
-                b_user_fps = 1;
-                param->b_vfr_input = 0;
-                goto generic_option;
-            case OPT_INTERLACED:
-                b_user_interlaced = 1;
-                goto generic_option;
-            case OPT_TCFILE_IN:
-                tcfile_name = optarg;
-                break;
-            case OPT_TCFILE_OUT:
-                opt->tcfile_out = x264_fopen( optarg, "wb" );
-                FAIL_IF_ERROR( !opt->tcfile_out, "can't open `%s'\n", optarg );
-                break;
-            case OPT_TIMEBASE:
-                input_opt.timebase = optarg;
-                break;
-            case OPT_PULLDOWN:
-                FAIL_IF_ERROR( parse_enum_value( optarg, pulldown_names, &opt->i_pulldown ), "Unknown pulldown `%s'\n", optarg );
-                break;
-            case OPT_VIDEO_FILTER:
-                vid_filters = optarg;
-                break;
-            case OPT_INPUT_FMT:
-                input_opt.format = optarg;
-                break;
-            case OPT_INPUT_RES:
-                input_opt.resolution = optarg;
-                break;
-            case OPT_INPUT_CSP:
-                input_opt.colorspace = optarg;
-                break;
-            case OPT_INPUT_DEPTH:
-                input_opt.bit_depth = atoi( optarg );
-                break;
-            case OPT_DTS_COMPRESSION:
-                output_opt.use_dts_compress = 1;
-                break;
-            case OPT_OUTPUT_CSP:
-                FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg );
-                // correct the parsed value to the libx264 csp value
-#if X264_CHROMA_FORMAT
-                static const uint8_t output_csp_fix[] = { X264_CHROMA_FORMAT, X264_CSP_RGB };
-#else
-                static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
-#endif
-                param->i_csp = output_csp = output_csp_fix[output_csp];
-                break;
-            case OPT_INPUT_RANGE:
-                FAIL_IF_ERROR( parse_enum_value( optarg, range_names, &input_opt.input_range ), "Unknown input range `%s'\n", optarg );
-                input_opt.input_range += RANGE_AUTO;
-                break;
-            case OPT_RANGE:
-                FAIL_IF_ERROR( parse_enum_value( optarg, range_names, &param->vui.b_fullrange ), "Unknown range `%s'\n", optarg );
-                input_opt.output_range = param->vui.b_fullrange += RANGE_AUTO;
-                break;
-            default:
-generic_option:
-            {
-                if( long_options_index < 0 )
-                {
-                    for( int i = 0; long_options[i].name; i++ )
-                        if( long_options[i].val == c )
-                        {
-                            long_options_index = i;
-                            break;
-                        }
-                    if( long_options_index < 0 )
-                    {
-                        /* getopt_long already printed an error message */
-                        return -1;
-                    }
-                }
-
-                b_error |= x264_param_parse( param, long_options[long_options_index].name, optarg );
-            }
-        }
-
-        if( b_error )
-        {
-            const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind-2];
-            x264_cli_log( "x264", X264_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg );
-            return -1;
-        }
-    }
-
-    /* If first pass mode is used, apply faster settings. */
-    if( b_turbo )
-        x264_param_apply_fastfirstpass( param );
-
-    /* Apply profile restrictions. */
-    if( x264_param_apply_profile( param, profile ) < 0 )
-        return -1;
-
-    /* Get the file name */
-    FAIL_IF_ERROR( optind > argc - 1 || !output_filename, "No %s file. Run x264 --help for a list of options.\n",
-                   optind > argc - 1 ? "input" : "output" );
-
-    if( select_output( muxer, output_filename, param ) )
-        return -1;
-    FAIL_IF_ERROR( cli_output.open_file( output_filename, &opt->hout, &output_opt ), "could not open output file `%s'\n", output_filename );
-
-    input_filename = argv[optind++];
-    video_info_t info = {0};
-    char demuxername[5];
-
-    /* set info flags to be overwritten by demuxer as necessary. */
-    info.csp        = param->i_csp;
-    info.fps_num    = param->i_fps_num;
-    info.fps_den    = param->i_fps_den;
-    info.fullrange  = input_opt.input_range == RANGE_PC;
-    info.interlaced = param->b_interlaced;
-    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
-    {
-        info.sar_width  = param->vui.i_sar_width;
-        info.sar_height = param->vui.i_sar_height;
-    }
-    info.tff        = param->b_tff;
-    info.vfr        = param->b_vfr_input;
-
-    input_opt.seek = opt->i_seek;
-    input_opt.progress = opt->b_progress;
-    input_opt.output_csp = output_csp;
-
-    if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
-        return -1;
-
-    FAIL_IF_ERROR( !opt->hin && cli_input.open_file( input_filename, &opt->hin, &info, &input_opt ),
-                   "could not open input file `%s'\n", input_filename );
-
-    x264_reduce_fraction( &info.sar_width, &info.sar_height );
-    x264_reduce_fraction( &info.fps_num, &info.fps_den );
-    x264_cli_log( demuxername, X264_LOG_INFO, "%dx%d%c %u:%u @ %u/%u fps (%cfr)\n", info.width,
-                  info.height, info.interlaced ? 'i' : 'p', info.sar_width, info.sar_height,
-                  info.fps_num, info.fps_den, info.vfr ? 'v' : 'c' );
-
-    if( tcfile_name )
-    {
-        FAIL_IF_ERROR( b_user_fps, "--fps + --tcfile-in is incompatible.\n" );
-        FAIL_IF_ERROR( timecode_input.open_file( tcfile_name, &opt->hin, &info, &input_opt ), "timecode input failed\n" );
-        cli_input = timecode_input;
-    }
-    else FAIL_IF_ERROR( !info.vfr && input_opt.timebase, "--timebase is incompatible with cfr input\n" );
-
-    /* init threaded input while the information about the input video is unaltered by filtering */
-#if HAVE_THREAD
-    if( info.thread_safe && (b_thread_input || param->i_threads > 1
-        || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1)) )
-    {
-        if( thread_input.open_file( NULL, &opt->hin, &info, NULL ) )
-        {
-            fprintf( stderr, "x264 [error]: threaded input failed\n" );
-            return -1;
-        }
-        cli_input = thread_input;
-    }
-#endif
-
-    /* override detected values by those specified by the user */
-    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
-    {
-        info.sar_width  = param->vui.i_sar_width;
-        info.sar_height = param->vui.i_sar_height;
-    }
-    if( b_user_fps )
-    {
-        info.fps_num = param->i_fps_num;
-        info.fps_den = param->i_fps_den;
-    }
-    if( !info.vfr )
-    {
-        info.timebase_num = info.fps_den;
-        info.timebase_den = info.fps_num;
-    }
-    if( !tcfile_name && input_opt.timebase )
-    {
-        uint64_t i_user_timebase_num;
-        uint64_t i_user_timebase_den;
-        int ret = sscanf( input_opt.timebase, "%"SCNu64"/%"SCNu64, &i_user_timebase_num, &i_user_timebase_den );
-        FAIL_IF_ERROR( !ret, "invalid argument: timebase = %s\n", input_opt.timebase );
-        if( ret == 1 )
-        {
-            i_user_timebase_num = info.timebase_num;
-            i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
-        }
-        FAIL_IF_ERROR( i_user_timebase_num > UINT32_MAX || i_user_timebase_den > UINT32_MAX,
-                       "timebase you specified exceeds H.264 maximum\n" );
-        opt->timebase_convert_multiplier = ((double)i_user_timebase_den / info.timebase_den)
-                                         * ((double)info.timebase_num / i_user_timebase_num);
-        info.timebase_num = i_user_timebase_num;
-        info.timebase_den = i_user_timebase_den;
-        info.vfr = 1;
-    }
-    if( b_user_interlaced )
-    {
-        info.interlaced = param->b_interlaced;
-        info.tff = param->b_tff;
-    }
-    if( input_opt.input_range != RANGE_AUTO )
-        info.fullrange = input_opt.input_range;
-
-    if( init_vid_filters( vid_filters, &opt->hin, &info, param, output_csp ) )
-        return -1;
-
-    /* set param flags from the post-filtered video */
-    param->b_vfr_input = info.vfr;
-    param->i_fps_num = info.fps_num;
-    param->i_fps_den = info.fps_den;
-    param->i_timebase_num = info.timebase_num;
-    param->i_timebase_den = info.timebase_den;
-    param->vui.i_sar_width  = info.sar_width;
-    param->vui.i_sar_height = info.sar_height;
-
-    info.num_frames = X264_MAX( info.num_frames - opt->i_seek, 0 );
-    if( (!info.num_frames || param->i_frame_total < info.num_frames)
-        && param->i_frame_total > 0 )
-        info.num_frames = param->i_frame_total;
-    param->i_frame_total = info.num_frames;
-
-    if( !b_user_interlaced && info.interlaced )
-    {
-#if HAVE_INTERLACED
-        x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, enabling %cff interlaced mode.\n"
-                      "                If you want otherwise, use --no-interlaced or --%cff\n",
-                      info.tff ? 't' : 'b', info.tff ? 'b' : 't' );
-        param->b_interlaced = 1;
-        param->b_tff = !!info.tff;
-#else
-        x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, but not compiled with interlaced support\n" );
-#endif
-    }
-    /* if the user never specified the output range and the input is now rgb, default it to pc */
-    int csp = param->i_csp & X264_CSP_MASK;
-    if( csp >= X264_CSP_BGR && csp <= X264_CSP_RGB )
-    {
-        if( input_opt.output_range == RANGE_AUTO )
-            param->vui.b_fullrange = RANGE_PC;
-        /* otherwise fail if they specified tv */
-        FAIL_IF_ERROR( !param->vui.b_fullrange, "RGB must be PC range" );
-    }
-
-    /* Automatically reduce reference frame count to match the user's target level
-     * if the user didn't explicitly set a reference frame count. */
-    if( !b_user_ref )
-    {
-        int mbs = (((param->i_width)+15)>>4) * (((param->i_height)+15)>>4);
-        for( int i = 0; x264_levels[i].level_idc != 0; i++ )
-            if( param->i_level_idc == x264_levels[i].level_idc )
-            {
-                while( mbs * param->i_frame_reference > x264_levels[i].dpb && param->i_frame_reference > 1 )
-                    param->i_frame_reference--;
-                break;
-            }
-    }
-
-
-    return 0;
-}
-
-static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
-{
-    int num = -1;
-    char type;
-    while( num < i_frame )
-    {
-        int64_t file_pos = ftell( opt->qpfile );
-        int qp = -1;
-        int ret = fscanf( opt->qpfile, "%d %c%*[ \t]%d\n", &num, &type, &qp );
-        pic->i_type = X264_TYPE_AUTO;
-        pic->i_qpplus1 = X264_QP_AUTO;
-        if( num > i_frame || ret == EOF )
-        {
-            if( file_pos < 0 || fseek( opt->qpfile, file_pos, SEEK_SET ) )
-            {
-                x264_cli_log( "x264", X264_LOG_ERROR, "qpfile seeking failed\n" );
-                fclose( opt->qpfile );
-                opt->qpfile = NULL;
-            }
-            break;
-        }
-        if( num < i_frame && ret >= 2 )
-            continue;
-        if( ret == 3 && qp >= 0 )
-            pic->i_qpplus1 = qp+1;
-        if     ( type == 'I' ) pic->i_type = X264_TYPE_IDR;
-        else if( type == 'i' ) pic->i_type = X264_TYPE_I;
-        else if( type == 'K' ) pic->i_type = X264_TYPE_KEYFRAME;
-        else if( type == 'P' ) pic->i_type = X264_TYPE_P;
-        else if( type == 'B' ) pic->i_type = X264_TYPE_BREF;
-        else if( type == 'b' ) pic->i_type = X264_TYPE_B;
-        else ret = 0;
-        if( ret < 2 || qp < -1 || qp > QP_MAX )
-        {
-            x264_cli_log( "x264", X264_LOG_ERROR, "can't parse qpfile for frame %d\n", i_frame );
-            fclose( opt->qpfile );
-            opt->qpfile = NULL;
-            break;
-        }
-    }
-}
-
-static int encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_dts )
-{
-    x264_picture_t pic_out;
-    x264_nal_t *nal;
-    int i_nal;
-    int i_frame_size = 0;
-
-    i_frame_size = x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out );
-
-    FAIL_IF_ERROR( i_frame_size < 0, "x264_encoder_encode failed\n" );
-
-    if( i_frame_size )
-    {
-        i_frame_size = cli_output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
-        *last_dts = pic_out.i_dts;
-    }
-
-    return i_frame_size;
-}
-
-static int64_t print_status( int64_t i_start, int64_t i_previous, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_ts )
-{
-    char buf[200];
-    int64_t i_time = x264_mdate();
-    if( i_previous && i_time - i_previous < UPDATE_INTERVAL )
-        return i_previous;
-    int64_t i_elapsed = i_time - i_start;
-    double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
-    double bitrate;
-    if( last_ts )
-        bitrate = (double) i_file * 8 / ( (double) last_ts * 1000 * param->i_timebase_num / param->i_timebase_den );
-    else
-        bitrate = (double) i_file * 8 / ( (double) 1000 * param->i_fps_den / param->i_fps_num );
-    if( i_frame_total )
-    {
-        int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
-        sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
-                 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
-                 eta/3600, (eta/60)%60, eta%60 );
-    }
-    else
-        sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
-    fprintf( stderr, "%s  \r", buf+5 );
-    x264_cli_set_console_title( buf );
-    fflush( stderr ); // needed in windows
-    return i_time;
-}
-
-static void convert_cli_to_lib_pic( x264_picture_t *lib, cli_pic_t *cli )
-{
-    memcpy( lib->img.i_stride, cli->img.stride, sizeof(cli->img.stride) );
-    memcpy( lib->img.plane, cli->img.plane, sizeof(cli->img.plane) );
-    lib->img.i_plane = cli->img.planes;
-    lib->img.i_csp = cli->img.csp;
-    lib->i_pts = cli->pts;
-}
-
-#define FAIL_IF_ERROR2( cond, ... )\
-do\
-{\
-    if( cond )\
-    {\
-        x264_cli_log( "x264", X264_LOG_ERROR, __VA_ARGS__ );\
-        retval = -1;\
-        goto fail;\
-    }\
-} while( 0 )
-
-static int encode( x264_param_t *param, cli_opt_t *opt )
-{
-    x264_t *h = NULL;
-    x264_picture_t pic;
-    cli_pic_t cli_pic;
-    const cli_pulldown_t *pulldown = NULL; // shut up gcc
-
-    int     i_frame = 0;
-    int     i_frame_output = 0;
-    int64_t i_end, i_previous = 0, i_start = 0;
-    int64_t i_file = 0;
-    int     i_frame_size;
-    int64_t last_dts = 0;
-    int64_t prev_dts = 0;
-    int64_t first_dts = 0;
-#   define  MAX_PTS_WARNING 3 /* arbitrary */
-    int     pts_warning_cnt = 0;
-    int64_t largest_pts = -1;
-    int64_t second_largest_pts = -1;
-    int64_t ticks_per_frame;
-    double  duration;
-    double  pulldown_pts = 0;
-    int     retval = 0;
-
-    opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;
-
-    /* set up pulldown */
-    if( opt->i_pulldown && !param->b_vfr_input )
-    {
-        param->b_pulldown = 1;
-        param->b_pic_struct = 1;
-        pulldown = &pulldown_values[opt->i_pulldown];
-        param->i_timebase_num = param->i_fps_den;
-        FAIL_IF_ERROR2( fmod( param->i_fps_num * pulldown->fps_factor, 1 ),
-                        "unsupported framerate for chosen pulldown\n" );
-        param->i_timebase_den = param->i_fps_num * pulldown->fps_factor;
-    }
-
-    h = x264_encoder_open( param );
-    FAIL_IF_ERROR2( !h, "x264_encoder_open failed\n" );
-
-    x264_encoder_parameters( h, param );
-
-    FAIL_IF_ERROR2( cli_output.set_param( opt->hout, param ), "can't set outfile param\n" );
-
-    i_start = x264_mdate();
-
-    /* ticks/frame = ticks/second / frames/second */
-    ticks_per_frame = (int64_t)param->i_timebase_den * param->i_fps_den / param->i_timebase_num / param->i_fps_num;
-    FAIL_IF_ERROR2( ticks_per_frame < 1 && !param->b_vfr_input, "ticks_per_frame invalid: %"PRId64"\n", ticks_per_frame );
-    ticks_per_frame = X264_MAX( ticks_per_frame, 1 );
-
-    if( !param->b_repeat_headers )
-    {
-        // Write SPS/PPS/SEI
-        x264_nal_t *headers;
-        int i_nal;
-
-        FAIL_IF_ERROR2( x264_encoder_headers( h, &headers, &i_nal ) < 0, "x264_encoder_headers failed\n" );
-        FAIL_IF_ERROR2( (i_file = cli_output.write_headers( opt->hout, headers )) < 0, "error writing headers to output file\n" );
-    }
-
-    if( opt->tcfile_out )
-        fprintf( opt->tcfile_out, "# timecode format v2\n" );
-
-    /* Encode frames */
-    for( ; !b_ctrl_c && (i_frame < param->i_frame_total || !param->i_frame_total); i_frame++ )
-    {
-        if( filter.get_frame( opt->hin, &cli_pic, i_frame + opt->i_seek ) )
-            break;
-        x264_picture_init( &pic );
-        convert_cli_to_lib_pic( &pic, &cli_pic );
-
-        if( !param->b_vfr_input )
-            pic.i_pts = i_frame;
-
-        if( opt->i_pulldown && !param->b_vfr_input )
-        {
-            pic.i_pic_struct = pulldown->pattern[ i_frame % pulldown->mod ];
-            pic.i_pts = (int64_t)( pulldown_pts + 0.5 );
-            pulldown_pts += pulldown_frame_duration[pic.i_pic_struct];
-        }
-        else if( opt->timebase_convert_multiplier )
-            pic.i_pts = (int64_t)( pic.i_pts * opt->timebase_convert_multiplier + 0.5 );
-
-        if( pic.i_pts <= largest_pts )
-        {
-            if( cli_log_level >= X264_LOG_DEBUG || pts_warning_cnt < MAX_PTS_WARNING )
-                x264_cli_log( "x264", X264_LOG_WARNING, "non-strictly-monotonic pts at frame %d (%"PRId64" <= %"PRId64")\n",
-                             i_frame, pic.i_pts, largest_pts );
-            else if( pts_warning_cnt == MAX_PTS_WARNING )
-                x264_cli_log( "x264", X264_LOG_WARNING, "too many nonmonotonic pts warnings, suppressing further ones\n" );
-            pts_warning_cnt++;
-            pic.i_pts = largest_pts + ticks_per_frame;
-        }
-
-        second_largest_pts = largest_pts;
-        largest_pts = pic.i_pts;
-        if( opt->tcfile_out )
-            fprintf( opt->tcfile_out, "%.6f\n", pic.i_pts * ((double)param->i_timebase_num / param->i_timebase_den) * 1e3 );
-
-        if( opt->qpfile )
-            parse_qpfile( opt, &pic, i_frame + opt->i_seek );
-
-        prev_dts = last_dts;
-        i_frame_size = encode_frame( h, opt->hout, &pic, &last_dts );
-        if( i_frame_size < 0 )
-        {
-            b_ctrl_c = 1; /* lie to exit the loop */
-            retval = -1;
-        }
-        else if( i_frame_size )
-        {
-            i_file += i_frame_size;
-            i_frame_output++;
-            if( i_frame_output == 1 )
-                first_dts = prev_dts = last_dts;
-        }
-
-        if( filter.release_frame( opt->hin, &cli_pic, i_frame + opt->i_seek ) )
-            break;
-
-        /* update status line (up to 1000 times per input file) */
-        if( opt->b_progress && i_frame_output )
-            i_previous = print_status( i_start, i_previous, i_frame_output, param->i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
-    }
-    /* Flush delayed frames */
-    while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
-    {
-        prev_dts = last_dts;
-        i_frame_size = encode_frame( h, opt->hout, NULL, &last_dts );
-        if( i_frame_size < 0 )
-        {
-            b_ctrl_c = 1; /* lie to exit the loop */
-            retval = -1;
-        }
-        else if( i_frame_size )
-        {
-            i_file += i_frame_size;
-            i_frame_output++;
-            if( i_frame_output == 1 )
-                first_dts = prev_dts = last_dts;
-        }
-        if( opt->b_progress && i_frame_output )
-            i_previous = print_status( i_start, i_previous, i_frame_output, param->i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
-    }
-fail:
-    if( pts_warning_cnt >= MAX_PTS_WARNING && cli_log_level < X264_LOG_DEBUG )
-        x264_cli_log( "x264", X264_LOG_WARNING, "%d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
-
-    /* duration algorithm fails when only 1 frame is output */
-    if( i_frame_output == 1 )
-        duration = (double)param->i_fps_den / param->i_fps_num;
-    else if( b_ctrl_c )
-        duration = (double)(2 * last_dts - prev_dts - first_dts) * param->i_timebase_num / param->i_timebase_den;
-    else
-        duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;
-
-    i_end = x264_mdate();
-    /* Erase progress indicator before printing encoding stats. */
-    if( opt->b_progress )
-        fprintf( stderr, "                                                                               \r" );
-    if( h )
-        x264_encoder_close( h );
-    fprintf( stderr, "\n" );
-
-    if( b_ctrl_c )
-        fprintf( stderr, "aborted at input frame %d, output frame %d\n", opt->i_seek + i_frame, i_frame_output );
-
-    cli_output.close_file( opt->hout, largest_pts, second_largest_pts );
-    opt->hout = NULL;
-
-    if( i_frame_output > 0 )
-    {
-        double fps = (double)i_frame_output * (double)1000000 /
-                     (double)( i_end - i_start );
-
-        fprintf( stderr, "encoded %d frames, %.2f fps, %.2f kb/s\n", i_frame_output, fps,
-                 (double) i_file * 8 / ( 1000 * duration ) );
-    }
-
-    return retval;
-}
diff --git a/android/src/main/libenc/jni/libx264/x264.h b/android/src/main/libenc/jni/libx264/x264.h
deleted file mode 100755
index 2b59b92..0000000
--- a/android/src/main/libenc/jni/libx264/x264.h
+++ /dev/null
@@ -1,961 +0,0 @@
-/*****************************************************************************
- * x264.h: x264 public header
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *          Fiona Glaser <fiona@x264.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_X264_H
-#define X264_X264_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\
-    !defined(_SYS_STDINT_H_) && !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES)
-# ifdef _MSC_VER
-#  pragma message("You must include stdint.h or inttypes.h before x264.h")
-# else
-#  warning You must include stdint.h or inttypes.h before x264.h
-# endif
-#endif
-
-#include <stdarg.h>
-
-#include "x264_config.h"
-
-#define X264_BUILD 148
-
-/* Application developers planning to link against a shared library version of
- * libx264 from a Microsoft Visual Studio or similar development environment
- * will need to define X264_API_IMPORTS before including this header.
- * This clause does not apply to MinGW, similar development environments, or non
- * Windows platforms. */
-#ifdef X264_API_IMPORTS
-#define X264_API __declspec(dllimport)
-#else
-#define X264_API
-#endif
-
-/* x264_t:
- *      opaque handler for encoder */
-typedef struct x264_t x264_t;
-
-/****************************************************************************
- * NAL structure and functions
- ****************************************************************************/
-
-enum nal_unit_type_e
-{
-    NAL_UNKNOWN     = 0,
-    NAL_SLICE       = 1,
-    NAL_SLICE_DPA   = 2,
-    NAL_SLICE_DPB   = 3,
-    NAL_SLICE_DPC   = 4,
-    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
-    NAL_SEI         = 6,    /* ref_idc == 0 */
-    NAL_SPS         = 7,
-    NAL_PPS         = 8,
-    NAL_AUD         = 9,
-    NAL_FILLER      = 12,
-    /* ref_idc == 0 for 6,9,10,11,12 */
-};
-enum nal_priority_e
-{
-    NAL_PRIORITY_DISPOSABLE = 0,
-    NAL_PRIORITY_LOW        = 1,
-    NAL_PRIORITY_HIGH       = 2,
-    NAL_PRIORITY_HIGHEST    = 3,
-};
-
-/* The data within the payload is already NAL-encapsulated; the ref_idc and type
- * are merely in the struct for easy access by the calling application.
- * All data returned in an x264_nal_t, including the data in p_payload, is no longer
- * valid after the next call to x264_encoder_encode.  Thus it must be used or copied
- * before calling x264_encoder_encode or x264_encoder_headers again. */
-typedef struct x264_nal_t
-{
-    int i_ref_idc;  /* nal_priority_e */
-    int i_type;     /* nal_unit_type_e */
-    int b_long_startcode;
-    int i_first_mb; /* If this NAL is a slice, the index of the first MB in the slice. */
-    int i_last_mb;  /* If this NAL is a slice, the index of the last MB in the slice. */
-
-    /* Size of payload (including any padding) in bytes. */
-    int     i_payload;
-    /* If param->b_annexb is set, Annex-B bytestream with startcode.
-     * Otherwise, startcode is replaced with a 4-byte size.
-     * This size is the size used in mp4/similar muxing; it is equal to i_payload-4 */
-    uint8_t *p_payload;
-
-    /* Size of padding in bytes. */
-    int i_padding;
-} x264_nal_t;
-
-/****************************************************************************
- * Encoder parameters
- ****************************************************************************/
-/* CPU flags */
-
-/* x86 */
-#define X264_CPU_CMOV            0x0000001
-#define X264_CPU_MMX             0x0000002
-#define X264_CPU_MMX2            0x0000004  /* MMX2 aka MMXEXT aka ISSE */
-#define X264_CPU_MMXEXT          X264_CPU_MMX2
-#define X264_CPU_SSE             0x0000008
-#define X264_CPU_SSE2            0x0000010
-#define X264_CPU_SSE3            0x0000020
-#define X264_CPU_SSSE3           0x0000040
-#define X264_CPU_SSE4            0x0000080  /* SSE4.1 */
-#define X264_CPU_SSE42           0x0000100  /* SSE4.2 */
-#define X264_CPU_LZCNT           0x0000200  /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_AVX             0x0000400  /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X264_CPU_XOP             0x0000800  /* AMD XOP */
-#define X264_CPU_FMA4            0x0001000  /* AMD FMA4 */
-#define X264_CPU_FMA3            0x0002000  /* FMA3 */
-#define X264_CPU_AVX2            0x0004000  /* AVX2 */
-#define X264_CPU_BMI1            0x0008000  /* BMI1 */
-#define X264_CPU_BMI2            0x0010000  /* BMI2 */
-/* x86 modifiers */
-#define X264_CPU_CACHELINE_32    0x0020000  /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64    0x0040000  /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_SSE2_IS_SLOW    0x0080000  /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST    0x0100000  /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SLOW_SHUFFLE    0x0200000  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X264_CPU_STACK_MOD4      0x0400000  /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SLOW_CTZ        0x0800000  /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM       0x1000000  /* The Atom is terrible: slow SSE unaligned loads, slow
-                                             * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
-                                             * cacheline split penalties -- gather everything here that
-                                             * isn't shared by other CPUs to avoid making half a dozen
-                                             * new SLOW flags. */
-#define X264_CPU_SLOW_PSHUFB     0x2000000  /* such as on the Intel Atom */
-#define X264_CPU_SLOW_PALIGNR    0x4000000  /* such as on the AMD Bobcat */
-
-/* PowerPC */
-#define X264_CPU_ALTIVEC         0x0000001
-
-/* ARM and AArch64 */
-#define X264_CPU_ARMV6           0x0000001
-#define X264_CPU_NEON            0x0000002  /* ARM NEON */
-#define X264_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
-#define X264_CPU_ARMV8           0x0000008
-
-/* MIPS */
-#define X264_CPU_MSA             0x0000001  /* MIPS MSA */
-
-/* Analyse flags */
-#define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
-#define X264_ANALYSE_I8x8       0x0002  /* Analyse i8x8 (requires 8x8 transform) */
-#define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
-#define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
-#define X264_ANALYSE_BSUB16x16  0x0100  /* Analyse b16x8, b8x16 and b8x8 */
-#define X264_DIRECT_PRED_NONE        0
-#define X264_DIRECT_PRED_SPATIAL     1
-#define X264_DIRECT_PRED_TEMPORAL    2
-#define X264_DIRECT_PRED_AUTO        3
-#define X264_ME_DIA                  0
-#define X264_ME_HEX                  1
-#define X264_ME_UMH                  2
-#define X264_ME_ESA                  3
-#define X264_ME_TESA                 4
-#define X264_CQM_FLAT                0
-#define X264_CQM_JVT                 1
-#define X264_CQM_CUSTOM              2
-#define X264_RC_CQP                  0
-#define X264_RC_CRF                  1
-#define X264_RC_ABR                  2
-#define X264_QP_AUTO                 0
-#define X264_AQ_NONE                 0
-#define X264_AQ_VARIANCE             1
-#define X264_AQ_AUTOVARIANCE         2
-#define X264_AQ_AUTOVARIANCE_BIASED  3
-#define X264_B_ADAPT_NONE            0
-#define X264_B_ADAPT_FAST            1
-#define X264_B_ADAPT_TRELLIS         2
-#define X264_WEIGHTP_NONE            0
-#define X264_WEIGHTP_SIMPLE          1
-#define X264_WEIGHTP_SMART           2
-#define X264_B_PYRAMID_NONE          0
-#define X264_B_PYRAMID_STRICT        1
-#define X264_B_PYRAMID_NORMAL        2
-#define X264_KEYINT_MIN_AUTO         0
-#define X264_KEYINT_MAX_INFINITE     (1<<30)
-
-static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
-static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
-static const char * const x264_b_pyramid_names[] = { "none", "strict", "normal", 0 };
-static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 };
-static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
-static const char * const x264_fullrange_names[] = { "off", "on", 0 };
-static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", "smpte428",
-                                                     "smpte431", "smpte432", 0 };
-static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316",
-                                                    "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", "smpte2084", "smpte428", 0 };
-static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c",
-                                                     "smpte2085", 0 };
-static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
-
-/* Colorspace type */
-#define X264_CSP_MASK           0x00ff  /* */
-#define X264_CSP_NONE           0x0000  /* Invalid mode     */
-#define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
-#define X264_CSP_YV12           0x0002  /* yvu 4:2:0 planar */
-#define X264_CSP_NV12           0x0003  /* yuv 4:2:0, with one y plane and one packed u+v */
-#define X264_CSP_NV21           0x0004  /* yuv 4:2:0, with one y plane and one packed v+u */
-#define X264_CSP_I422           0x0005  /* yuv 4:2:2 planar */
-#define X264_CSP_YV16           0x0006  /* yvu 4:2:2 planar */
-#define X264_CSP_NV16           0x0007  /* yuv 4:2:2, with one y plane and one packed u+v */
-#define X264_CSP_V210           0x0008  /* 10-bit yuv 4:2:2 packed in 32 */
-#define X264_CSP_I444           0x0009  /* yuv 4:4:4 planar */
-#define X264_CSP_YV24           0x000a  /* yvu 4:4:4 planar */
-#define X264_CSP_BGR            0x000b  /* packed bgr 24bits   */
-#define X264_CSP_BGRA           0x000c  /* packed bgr 32bits   */
-#define X264_CSP_RGB            0x000d  /* packed rgb 24bits   */
-#define X264_CSP_MAX            0x000e  /* end of list */
-#define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
-#define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */
-
-/* Slice type */
-#define X264_TYPE_AUTO          0x0000  /* Let x264 choose the right type */
-#define X264_TYPE_IDR           0x0001
-#define X264_TYPE_I             0x0002
-#define X264_TYPE_P             0x0003
-#define X264_TYPE_BREF          0x0004  /* Non-disposable B-frame */
-#define X264_TYPE_B             0x0005
-#define X264_TYPE_KEYFRAME      0x0006  /* IDR or I depending on b_open_gop option */
-#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR || (x)==X264_TYPE_KEYFRAME)
-#define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF)
-
-/* Log level */
-#define X264_LOG_NONE          (-1)
-#define X264_LOG_ERROR          0
-#define X264_LOG_WARNING        1
-#define X264_LOG_INFO           2
-#define X264_LOG_DEBUG          3
-
-/* Threading */
-#define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */
-#define X264_SYNC_LOOKAHEAD_AUTO (-1) /* Automatically select optimal lookahead thread buffer size */
-
-/* HRD */
-#define X264_NAL_HRD_NONE            0
-#define X264_NAL_HRD_VBR             1
-#define X264_NAL_HRD_CBR             2
-
-/* Zones: override ratecontrol or other options for specific sections of the video.
- * See x264_encoder_reconfig() for which options can be changed.
- * If zones overlap, whichever comes later in the list takes precedence. */
-typedef struct x264_zone_t
-{
-    int i_start, i_end; /* range of frame numbers */
-    int b_force_qp; /* whether to use qp vs bitrate factor */
-    int i_qp;
-    float f_bitrate_factor;
-    struct x264_param_t *param;
-} x264_zone_t;
-
-typedef struct x264_param_t
-{
-    /* CPU flags */
-    unsigned int cpu;
-    int         i_threads;           /* encode multiple frames in parallel */
-    int         i_lookahead_threads; /* multiple threads for lookahead analysis */
-    int         b_sliced_threads;  /* Whether to use slice-based threading. */
-    int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
-    int         b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */
-    int         i_sync_lookahead; /* threaded lookahead buffer */
-
-    /* Video Properties */
-    int         i_width;
-    int         i_height;
-    int         i_csp;         /* CSP of encoded bitstream */
-    int         i_level_idc;
-    int         i_frame_total; /* number of frames to encode if known, else 0 */
-
-    /* NAL HRD
-     * Uses Buffering and Picture Timing SEIs to signal HRD
-     * The HRD in H.264 was not designed with VFR in mind.
-     * It is therefore not recommendeded to use NAL HRD with VFR.
-     * Furthermore, reconfiguring the VBV (via x264_encoder_reconfig)
-     * will currently generate invalid HRD. */
-    int         i_nal_hrd;
-
-    struct
-    {
-        /* they will be reduced to be 0 < x <= 65535 and prime */
-        int         i_sar_height;
-        int         i_sar_width;
-
-        int         i_overscan;    /* 0=undef, 1=no overscan, 2=overscan */
-
-        /* see h264 annex E for the values of the following */
-        int         i_vidformat;
-        int         b_fullrange;
-        int         i_colorprim;
-        int         i_transfer;
-        int         i_colmatrix;
-        int         i_chroma_loc;    /* both top & bottom */
-    } vui;
-
-    /* Bitstream parameters */
-    int         i_frame_reference;  /* Maximum number of reference frames */
-    int         i_dpb_size;         /* Force a DPB size larger than that implied by B-frames and reference frames.
-                                     * Useful in combination with interactive error resilience. */
-    int         i_keyint_max;       /* Force an IDR keyframe at this interval */
-    int         i_keyint_min;       /* Scenecuts closer together than this are coded as I, not IDR. */
-    int         i_scenecut_threshold; /* how aggressively to insert extra I frames */
-    int         b_intra_refresh;    /* Whether or not to use periodic intra refresh instead of IDR frames. */
-
-    int         i_bframe;   /* how many b-frame between 2 references pictures */
-    int         i_bframe_adaptive;
-    int         i_bframe_bias;
-    int         i_bframe_pyramid;   /* Keep some B-frames as references: 0=off, 1=strict hierarchical, 2=normal */
-    int         b_open_gop;
-    int         b_bluray_compat;
-    int         i_avcintra_class;
-
-    int         b_deblocking_filter;
-    int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
-    int         i_deblocking_filter_beta;       /* [-6, 6]  idem */
-
-    int         b_cabac;
-    int         i_cabac_init_idc;
-
-    int         b_interlaced;
-    int         b_constrained_intra;
-
-    int         i_cqm_preset;
-    char        *psz_cqm_file;      /* filename (in UTF-8) of CQM file, JM format */
-    uint8_t     cqm_4iy[16];        /* used only if i_cqm_preset == X264_CQM_CUSTOM */
-    uint8_t     cqm_4py[16];
-    uint8_t     cqm_4ic[16];
-    uint8_t     cqm_4pc[16];
-    uint8_t     cqm_8iy[64];
-    uint8_t     cqm_8py[64];
-    uint8_t     cqm_8ic[64];
-    uint8_t     cqm_8pc[64];
-
-    /* Log */
-    void        (*pf_log)( void *, int i_level, const char *psz, va_list );
-    void        *p_log_private;
-    int         i_log_level;
-    int         b_full_recon;   /* fully reconstruct frames, even when not necessary for encoding.  Implied by psz_dump_yuv */
-    char        *psz_dump_yuv;  /* filename (in UTF-8) for reconstructed frames */
-
-    /* Encoder analyser parameters */
-    struct
-    {
-        unsigned int intra;     /* intra partitions */
-        unsigned int inter;     /* inter partitions */
-
-        int          b_transform_8x8;
-        int          i_weighted_pred; /* weighting for P-frames */
-        int          b_weighted_bipred; /* implicit weighting for B-frames */
-        int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
-        int          i_chroma_qp_offset;
-
-        int          i_me_method; /* motion estimation algorithm to use (X264_ME_*) */
-        int          i_me_range; /* integer pixel motion estimation search range (from predicted mv) */
-        int          i_mv_range; /* maximum length of a mv (in pixels). -1 = auto, based on level */
-        int          i_mv_range_thread; /* minimum space between threads. -1 = auto, based on number of threads. */
-        int          i_subpel_refine; /* subpixel motion estimation quality */
-        int          b_chroma_me; /* chroma ME for subpel and mode decision in P-frames */
-        int          b_mixed_references; /* allow each mb partition to have its own reference number */
-        int          i_trellis;  /* trellis RD quantization */
-        int          b_fast_pskip; /* early SKIP detection on P-frames */
-        int          b_dct_decimate; /* transform coefficient thresholding on P-frames */
-        int          i_noise_reduction; /* adaptive pseudo-deadzone */
-        float        f_psy_rd; /* Psy RD strength */
-        float        f_psy_trellis; /* Psy trellis strength */
-        int          b_psy; /* Toggle all psy optimizations */
-
-        int          b_mb_info;            /* Use input mb_info data in x264_picture_t */
-        int          b_mb_info_update; /* Update the values in mb_info according to the results of encoding. */
-
-        /* the deadzone size that will be used in luma quantization */
-        int          i_luma_deadzone[2]; /* {inter, intra} */
-
-        int          b_psnr;    /* compute and print PSNR stats */
-        int          b_ssim;    /* compute and print SSIM stats */
-    } analyse;
-
-    /* Rate control parameters */
-    struct
-    {
-        int         i_rc_method;    /* X264_RC_* */
-
-        int         i_qp_constant;  /* 0 to (51 + 6*(x264_bit_depth-8)). 0=lossless */
-        int         i_qp_min;       /* min allowed QP value */
-        int         i_qp_max;       /* max allowed QP value */
-        int         i_qp_step;      /* max QP step between frames */
-
-        int         i_bitrate;
-        float       f_rf_constant;  /* 1pass VBR, nominal QP */
-        float       f_rf_constant_max;  /* In CRF mode, maximum CRF as caused by VBV */
-        float       f_rate_tolerance;
-        int         i_vbv_max_bitrate;
-        int         i_vbv_buffer_size;
-        float       f_vbv_buffer_init; /* <=1: fraction of buffer_size. >1: kbit */
-        float       f_ip_factor;
-        float       f_pb_factor;
-
-        /* VBV filler: force CBR VBV and use filler bytes to ensure hard-CBR.
-         * Implied by NAL-HRD CBR. */
-        int         b_filler;
-
-        int         i_aq_mode;      /* psy adaptive QP. (X264_AQ_*) */
-        float       f_aq_strength;
-        int         b_mb_tree;      /* Macroblock-tree ratecontrol. */
-        int         i_lookahead;
-
-        /* 2pass */
-        int         b_stat_write;   /* Enable stat writing in psz_stat_out */
-        char        *psz_stat_out;  /* output filename (in UTF-8) of the 2pass stats file */
-        int         b_stat_read;    /* Read stat from psz_stat_in and use it */
-        char        *psz_stat_in;   /* input filename (in UTF-8) of the 2pass stats file */
-
-        /* 2pass params (same as ffmpeg ones) */
-        float       f_qcompress;    /* 0.0 => cbr, 1.0 => constant qp */
-        float       f_qblur;        /* temporally blur quants */
-        float       f_complexity_blur; /* temporally blur complexity */
-        x264_zone_t *zones;         /* ratecontrol overrides */
-        int         i_zones;        /* number of zone_t's */
-        char        *psz_zones;     /* alternate method of specifying zones */
-    } rc;
-
-    /* Cropping Rectangle parameters: added to those implicitly defined by
-       non-mod16 video resolutions. */
-    struct
-    {
-        unsigned int i_left;
-        unsigned int i_top;
-        unsigned int i_right;
-        unsigned int i_bottom;
-    } crop_rect;
-
-    /* frame packing arrangement flag */
-    int i_frame_packing;
-
-    /* Muxing parameters */
-    int b_aud;                  /* generate access unit delimiters */
-    int b_repeat_headers;       /* put SPS/PPS before each keyframe */
-    int b_annexb;               /* if set, place start codes (4 bytes) before NAL units,
-                                 * otherwise place size (4 bytes) before NAL units. */
-    int i_sps_id;               /* SPS and PPS id number */
-    int b_vfr_input;            /* VFR input.  If 1, use timebase and timestamps for ratecontrol purposes.
-                                 * If 0, use fps only. */
-    int b_pulldown;             /* use explicity set timebase for CFR */
-    uint32_t i_fps_num;
-    uint32_t i_fps_den;
-    uint32_t i_timebase_num;    /* Timebase numerator */
-    uint32_t i_timebase_den;    /* Timebase denominator */
-
-    int b_tff;
-
-    /* Pulldown:
-     * The correct pic_struct must be passed with each input frame.
-     * The input timebase should be the timebase corresponding to the output framerate. This should be constant.
-     * e.g. for 3:2 pulldown timebase should be 1001/30000
-     * The PTS passed with each frame must be the PTS of the frame after pulldown is applied.
-     * Frame doubling and tripling require b_vfr_input set to zero (see H.264 Table D-1)
-     *
-     * Pulldown changes are not clearly defined in H.264. Therefore, it is the calling app's responsibility to manage this.
-     */
-
-    int b_pic_struct;
-
-    /* Fake Interlaced.
-     *
-     * Used only when b_interlaced=0. Setting this flag makes it possible to flag the stream as PAFF interlaced yet
-     * encode all frames progessively. It is useful for encoding 25p and 30p Blu-Ray streams.
-     */
-
-    int b_fake_interlaced;
-
-    /* Don't optimize header parameters based on video content, e.g. ensure that splitting an input video, compressing
-     * each part, and stitching them back together will result in identical SPS/PPS. This is necessary for stitching
-     * with container formats that don't allow multiple SPS/PPS. */
-    int b_stitchable;
-
-    int b_opencl;            /* use OpenCL when available */
-    int i_opencl_device;     /* specify count of GPU devices to skip, for CLI users */
-    void *opencl_device_id;  /* pass explicit cl_device_id as void*, for API users */
-    char *psz_clbin_file;    /* filename (in UTF-8) of the compiled OpenCL kernel cache file */
-
-    /* Slicing parameters */
-    int i_slice_max_size;    /* Max size per slice in bytes; includes estimated NAL overhead. */
-    int i_slice_max_mbs;     /* Max number of MBs per slice; overrides i_slice_count. */
-    int i_slice_min_mbs;     /* Min number of MBs per slice */
-    int i_slice_count;       /* Number of slices per frame: forces rectangular slices. */
-    int i_slice_count_max;   /* Absolute cap on slices per frame; stops applying slice-max-size
-                              * and slice-max-mbs if this is reached. */
-
-    /* Optional callback for freeing this x264_param_t when it is done being used.
-     * Only used when the x264_param_t sits in memory for an indefinite period of time,
-     * i.e. when an x264_param_t is passed to x264_t in an x264_picture_t or in zones.
-     * Not used when x264_encoder_reconfig is called directly. */
-    void (*param_free)( void* );
-
-    /* Optional low-level callback for low-latency encoding.  Called for each output NAL unit
-     * immediately after the NAL unit is finished encoding.  This allows the calling application
-     * to begin processing video data (e.g. by sending packets over a network) before the frame
-     * is done encoding.
-     *
-     * This callback MUST do the following in order to work correctly:
-     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64.
-     * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer.
-     * After these steps, the content of nal is valid and can be used in the same way as if
-     * the NAL unit were output by x264_encoder_encode.
-     *
-     * This does not need to be synchronous with the encoding process: the data pointed to
-     * by nal (both before and after x264_nal_encode) will remain valid until the next
-     * x264_encoder_encode call.  The callback must be re-entrant.
-     *
-     * This callback does not work with frame-based threads; threads must be disabled
-     * or sliced-threads enabled.  This callback also does not work as one would expect
-     * with HRD -- since the buffering period SEI cannot be calculated until the frame
-     * is finished encoding, it will not be sent via this callback.
-     *
-     * Note also that the NALs are not necessarily returned in order when sliced threads is
-     * enabled.  Accordingly, the variable i_first_mb and i_last_mb are available in
-     * x264_nal_t to help the calling application reorder the slices if necessary.
-     *
-     * When this callback is enabled, x264_encoder_encode does not return valid NALs;
-     * the calling application is expected to acquire all output NALs through the callback.
-     *
-     * It is generally sensible to combine this callback with a use of slice-max-mbs or
-     * slice-max-size.
-     *
-     * The opaque pointer is the opaque pointer from the input frame associated with this
-     * NAL unit. This helps distinguish between nalu_process calls from different sources,
-     * e.g. if doing multiple encodes in one process.
-     */
-    void (*nalu_process) ( x264_t *h, x264_nal_t *nal, void *opaque );
-} x264_param_t;
-
-void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
-
-/****************************************************************************
- * H.264 level restriction information
- ****************************************************************************/
-
-typedef struct x264_level_t
-{
-    int level_idc;
-    int mbps;        /* max macroblock processing rate (macroblocks/sec) */
-    int frame_size;  /* max frame size (macroblocks) */
-    int dpb;         /* max decoded picture buffer (mbs) */
-    int bitrate;     /* max bitrate (kbit/sec) */
-    int cpb;         /* max vbv buffer (kbit) */
-    int mv_range;    /* max vertical mv component range (pixels) */
-    int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
-    int slice_rate;  /* ?? */
-    int mincr;       /* min compression ratio */
-    int bipred8x8;   /* limit bipred to >=8x8 */
-    int direct8x8;   /* limit b_direct to >=8x8 */
-    int frame_only;  /* forbid interlacing */
-} x264_level_t;
-
-/* all of the levels defined in the standard, terminated by .level_idc=0 */
-X264_API extern const x264_level_t x264_levels[];
-
-/****************************************************************************
- * Basic parameter handling functions
- ****************************************************************************/
-
-/* x264_param_default:
- *      fill x264_param_t with default values and do CPU detection */
-void    x264_param_default( x264_param_t * );
-
-/* x264_param_parse:
- *  set one parameter by name.
- *  returns 0 on success, or returns one of the following errors.
- *  note: BAD_VALUE occurs only if it can't even parse the value,
- *  numerical range is not checked until x264_encoder_open() or
- *  x264_encoder_reconfig().
- *  value=NULL means "true" for boolean options, but is a BAD_VALUE for non-booleans. */
-#define X264_PARAM_BAD_NAME  (-1)
-#define X264_PARAM_BAD_VALUE (-2)
-int x264_param_parse( x264_param_t *, const char *name, const char *value );
-
-/****************************************************************************
- * Advanced parameter handling functions
- ****************************************************************************/
-
-/* These functions expose the full power of x264's preset-tune-profile system for
- * easy adjustment of large numbers of internal parameters.
- *
- * In order to replicate x264CLI's option handling, these functions MUST be called
- * in the following order:
- * 1) x264_param_default_preset
- * 2) Custom user options (via param_parse or directly assigned variables)
- * 3) x264_param_apply_fastfirstpass
- * 4) x264_param_apply_profile
- *
- * Additionally, x264CLI does not apply step 3 if the preset chosen is "placebo"
- * or --slow-firstpass is set. */
-
-/* x264_param_default_preset:
- *      The same as x264_param_default, but also use the passed preset and tune
- *      to modify the default settings.
- *      (either can be NULL, which implies no preset or no tune, respectively)
- *
- *      Currently available presets are, ordered from fastest to slowest: */
-static const char * const x264_preset_names[] = { "ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo", 0 };
-
-/*      The presets can also be indexed numerically, as in:
- *      x264_param_default_preset( &param, "3", ... )
- *      with ultrafast mapping to "0" and placebo mapping to "9".  This mapping may
- *      of course change if new presets are added in between, but will always be
- *      ordered from fastest to slowest.
- *
- *      Warning: the speed of these presets scales dramatically.  Ultrafast is a full
- *      100 times faster than placebo!
- *
- *      Currently available tunings are: */
-static const char * const x264_tune_names[] = { "film", "animation", "grain", "stillimage", "psnr", "ssim", "fastdecode", "zerolatency", 0 };
-
-/*      Multiple tunings can be used if separated by a delimiter in ",./-+",
- *      however multiple psy tunings cannot be used.
- *      film, animation, grain, stillimage, psnr, and ssim are psy tunings.
- *
- *      returns 0 on success, negative on failure (e.g. invalid preset/tune name). */
-int     x264_param_default_preset( x264_param_t *, const char *preset, const char *tune );
-
-/* x264_param_apply_fastfirstpass:
- *      If first-pass mode is set (rc.b_stat_read == 0, rc.b_stat_write == 1),
- *      modify the encoder settings to disable options generally not useful on
- *      the first pass. */
-void    x264_param_apply_fastfirstpass( x264_param_t * );
-
-/* x264_param_apply_profile:
- *      Applies the restrictions of the given profile.
- *      Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", "high422", "high444", 0 };
-
-/*      (can be NULL, in which case the function will do nothing)
- *
- *      Does NOT guarantee that the given profile will be used: if the restrictions
- *      of "High" are applied to settings that are already Baseline-compatible, the
- *      stream will remain baseline.  In short, it does not increase settings, only
- *      decrease them.
- *
- *      returns 0 on success, negative on failure (e.g. invalid profile name). */
-int     x264_param_apply_profile( x264_param_t *, const char *profile );
-
-/****************************************************************************
- * Picture structures and functions
- ****************************************************************************/
-
-/* x264_bit_depth:
- *      Specifies the number of bits per pixel that x264 uses. This is also the
- *      bit depth that x264 encodes in. If this value is > 8, x264 will read
- *      two bytes of input data for each pixel sample, and expect the upper
- *      (16-x264_bit_depth) bits to be zero.
- *      Note: The flag X264_CSP_HIGH_DEPTH must be used to specify the
- *      colorspace depth as well. */
-X264_API extern const int x264_bit_depth;
-
-/* x264_chroma_format:
- *      Specifies the chroma formats that x264 supports encoding. When this
- *      value is non-zero, then it represents a X264_CSP_* that is the only
- *      chroma format that x264 supports encoding. If the value is 0 then
- *      there are no restrictions. */
-X264_API extern const int x264_chroma_format;
-
-enum pic_struct_e
-{
-    PIC_STRUCT_AUTO              = 0, // automatically decide (default)
-    PIC_STRUCT_PROGRESSIVE       = 1, // progressive frame
-    // "TOP" and "BOTTOM" are not supported in x264 (PAFF only)
-    PIC_STRUCT_TOP_BOTTOM        = 4, // top field followed by bottom
-    PIC_STRUCT_BOTTOM_TOP        = 5, // bottom field followed by top
-    PIC_STRUCT_TOP_BOTTOM_TOP    = 6, // top field, bottom field, top field repeated
-    PIC_STRUCT_BOTTOM_TOP_BOTTOM = 7, // bottom field, top field, bottom field repeated
-    PIC_STRUCT_DOUBLE            = 8, // double frame
-    PIC_STRUCT_TRIPLE            = 9, // triple frame
-};
-
-typedef struct x264_hrd_t
-{
-    double cpb_initial_arrival_time;
-    double cpb_final_arrival_time;
-    double cpb_removal_time;
-
-    double dpb_output_time;
-} x264_hrd_t;
-
-/* Arbitrary user SEI:
- * Payload size is in bytes and the payload pointer must be valid.
- * Payload types and syntax can be found in Annex D of the H.264 Specification.
- * SEI payload alignment bits as described in Annex D must be included at the
- * end of the payload if needed.
- * The payload should not be NAL-encapsulated.
- * Payloads are written first in order of input, apart from in the case when HRD
- * is enabled where payloads are written after the Buffering Period SEI. */
-
-typedef struct x264_sei_payload_t
-{
-    int payload_size;
-    int payload_type;
-    uint8_t *payload;
-} x264_sei_payload_t;
-
-typedef struct x264_sei_t
-{
-    int num_payloads;
-    x264_sei_payload_t *payloads;
-    /* In: optional callback to free each payload AND x264_sei_payload_t when used. */
-    void (*sei_free)( void* );
-} x264_sei_t;
-
-typedef struct x264_image_t
-{
-    int     i_csp;       /* Colorspace */
-    int     i_plane;     /* Number of image planes */
-    int     i_stride[4]; /* Strides for each plane */
-    uint8_t *plane[4];   /* Pointers to each plane */
-} x264_image_t;
-
-typedef struct x264_image_properties_t
-{
-    /* All arrays of data here are ordered as follows:
-     * each array contains one offset per macroblock, in raster scan order.  In interlaced
-     * mode, top-field MBs and bottom-field MBs are interleaved at the row level.
-     * Macroblocks are 16x16 blocks of pixels (with respect to the luma plane).  For the
-     * purposes of calculating the number of macroblocks, width and height are rounded up to
-     * the nearest 16.  If in interlaced mode, height is rounded up to the nearest 32 instead. */
-
-    /* In: an array of quantizer offsets to be applied to this image during encoding.
-     *     These are added on top of the decisions made by x264.
-     *     Offsets can be fractional; they are added before QPs are rounded to integer.
-     *     Adaptive quantization must be enabled to use this feature.  Behavior if quant
-     *     offsets differ between encoding passes is undefined. */
-    float *quant_offsets;
-    /* In: optional callback to free quant_offsets when used.
-     *     Useful if one wants to use a different quant_offset array for each frame. */
-    void (*quant_offsets_free)( void* );
-
-    /* In: optional array of flags for each macroblock.
-     *     Allows specifying additional information for the encoder such as which macroblocks
-     *     remain unchanged.  Usable flags are listed below.
-     *     x264_param_t.analyse.b_mb_info must be set to use this, since x264 needs to track
-     *     extra data internally to make full use of this information.
-     *
-     * Out: if b_mb_info_update is set, x264 will update this array as a result of encoding.
-     *
-     *      For "MBINFO_CONSTANT", it will remove this flag on any macroblock whose decoded
-     *      pixels have changed.  This can be useful for e.g. noting which areas of the
-     *      frame need to actually be blitted. Note: this intentionally ignores the effects
-     *      of deblocking for the current frame, which should be fine unless one needs exact
-     *      pixel-perfect accuracy.
-     *
-     *      Results for MBINFO_CONSTANT are currently only set for P-frames, and are not
-     *      guaranteed to enumerate all blocks which haven't changed.  (There may be false
-     *      negatives, but no false positives.)
-     */
-    uint8_t *mb_info;
-    /* In: optional callback to free mb_info when used. */
-    void (*mb_info_free)( void* );
-
-    /* The macroblock is constant and remains unchanged from the previous frame. */
-    #define X264_MBINFO_CONSTANT   (1<<0)
-    /* More flags may be added in the future. */
-
-    /* Out: SSIM of the the frame luma (if x264_param_t.b_ssim is set) */
-    double f_ssim;
-    /* Out: Average PSNR of the frame (if x264_param_t.b_psnr is set) */
-    double f_psnr_avg;
-    /* Out: PSNR of Y, U, and V (if x264_param_t.b_psnr is set) */
-    double f_psnr[3];
-
-    /* Out: Average effective CRF of the encoded frame */
-    double f_crf_avg;
-} x264_image_properties_t;
-
-typedef struct x264_picture_t
-{
-    /* In: force picture type (if not auto)
-     *     If x264 encoding parameters are violated in the forcing of picture types,
-     *     x264 will correct the input picture type and log a warning.
-     * Out: type of the picture encoded */
-    int     i_type;
-    /* In: force quantizer for != X264_QP_AUTO */
-    int     i_qpplus1;
-    /* In: pic_struct, for pulldown/doubling/etc...used only if b_pic_struct=1.
-     *     use pic_struct_e for pic_struct inputs
-     * Out: pic_struct element associated with frame */
-    int     i_pic_struct;
-    /* Out: whether this frame is a keyframe.  Important when using modes that result in
-     * SEI recovery points being used instead of IDR frames. */
-    int     b_keyframe;
-    /* In: user pts, Out: pts of encoded picture (user)*/
-    int64_t i_pts;
-    /* Out: frame dts. When the pts of the first frame is close to zero,
-     *      initial frames may have a negative dts which must be dealt with by any muxer */
-    int64_t i_dts;
-    /* In: custom encoding parameters to be set from this frame forwards
-           (in coded order, not display order). If NULL, continue using
-           parameters from the previous frame.  Some parameters, such as
-           aspect ratio, can only be changed per-GOP due to the limitations
-           of H.264 itself; in this case, the caller must force an IDR frame
-           if it needs the changed parameter to apply immediately. */
-    x264_param_t *param;
-    /* In: raw image data */
-    /* Out: reconstructed image data.  x264 may skip part of the reconstruction process,
-            e.g. deblocking, in frames where it isn't necessary.  To force complete
-            reconstruction, at a small speed cost, set b_full_recon. */
-    x264_image_t img;
-    /* In: optional information to modify encoder decisions for this frame
-     * Out: information about the encoded frame */
-    x264_image_properties_t prop;
-    /* Out: HRD timing information. Output only when i_nal_hrd is set. */
-    x264_hrd_t hrd_timing;
-    /* In: arbitrary user SEI (e.g subtitles, AFDs) */
-    x264_sei_t extra_sei;
-    /* private user data. copied from input to output frames. */
-    void *opaque;
-} x264_picture_t;
-
-/* x264_picture_init:
- *  initialize an x264_picture_t.  Needs to be done if the calling application
- *  allocates its own x264_picture_t as opposed to using x264_picture_alloc. */
-void x264_picture_init( x264_picture_t *pic );
-
-/* x264_picture_alloc:
- *  alloc data for a picture. You must call x264_picture_clean on it.
- *  returns 0 on success, or -1 on malloc failure or invalid colorspace. */
-int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
-
-/* x264_picture_clean:
- *  free associated resource for a x264_picture_t allocated with
- *  x264_picture_alloc ONLY */
-void x264_picture_clean( x264_picture_t *pic );
-
-/****************************************************************************
- * Encoder functions
- ****************************************************************************/
-
-/* Force a link error in the case of linking against an incompatible API version.
- * Glue #defines exist to force correct macro expansion; the final output of the macro
- * is x264_encoder_open_##X264_BUILD (for purposes of dlopen). */
-#define x264_encoder_glue1(x,y) x##y
-#define x264_encoder_glue2(x,y) x264_encoder_glue1(x,y)
-#define x264_encoder_open x264_encoder_glue2(x264_encoder_open_,X264_BUILD)
-
-/* x264_encoder_open:
- *      create a new encoder handler, all parameters from x264_param_t are copied */
-x264_t *x264_encoder_open( x264_param_t * );
-
-/* x264_encoder_reconfig:
- *      various parameters from x264_param_t are copied.
- *      this takes effect immediately, on whichever frame is encoded next;
- *      due to delay, this may not be the next frame passed to encoder_encode.
- *      if the change should apply to some particular frame, use x264_picture_t->param instead.
- *      returns 0 on success, negative on parameter validation error.
- *      not all parameters can be changed; see the actual function for a detailed breakdown.
- *
- *      since not all parameters can be changed, moving from preset to preset may not always
- *      fully copy all relevant parameters, but should still work usably in practice. however,
- *      more so than for other presets, many of the speed shortcuts used in ultrafast cannot be
- *      switched out of; using reconfig to switch between ultrafast and other presets is not
- *      recommended without a more fine-grained breakdown of parameters to take this into account. */
-int     x264_encoder_reconfig( x264_t *, x264_param_t * );
-/* x264_encoder_parameters:
- *      copies the current internal set of parameters to the pointer provided
- *      by the caller.  useful when the calling application needs to know
- *      how x264_encoder_open has changed the parameters, or the current state
- *      of the encoder after multiple x264_encoder_reconfig calls.
- *      note that the data accessible through pointers in the returned param struct
- *      (e.g. filenames) should not be modified by the calling application. */
-void    x264_encoder_parameters( x264_t *, x264_param_t * );
-/* x264_encoder_headers:
- *      return the SPS and PPS that will be used for the whole stream.
- *      *pi_nal is the number of NAL units outputted in pp_nal.
- *      returns the number of bytes in the returned NALs.
- *      returns negative on error.
- *      the payloads of all output NALs are guaranteed to be sequential in memory. */
-int     x264_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
-/* x264_encoder_encode:
- *      encode one picture.
- *      *pi_nal is the number of NAL units outputted in pp_nal.
- *      returns the number of bytes in the returned NALs.
- *      returns negative on error and zero if no NAL units returned.
- *      the payloads of all output NALs are guaranteed to be sequential in memory. */
-int     x264_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
-/* x264_encoder_close:
- *      close an encoder handler */
-void    x264_encoder_close  ( x264_t * );
-/* x264_encoder_delayed_frames:
- *      return the number of currently delayed (buffered) frames
- *      this should be used at the end of the stream, to know when you have all the encoded frames. */
-int     x264_encoder_delayed_frames( x264_t * );
-/* x264_encoder_maximum_delayed_frames( x264_t *h ):
- *      return the maximum number of delayed (buffered) frames that can occur with the current
- *      parameters. */
-int     x264_encoder_maximum_delayed_frames( x264_t *h );
-/* x264_encoder_intra_refresh:
- *      If an intra refresh is not in progress, begin one with the next P-frame.
- *      If an intra refresh is in progress, begin one as soon as the current one finishes.
- *      Requires that b_intra_refresh be set.
- *
- *      Useful for interactive streaming where the client can tell the server that packet loss has
- *      occurred.  In this case, keyint can be set to an extremely high value so that intra refreshes
- *      only occur when calling x264_encoder_intra_refresh.
- *
- *      In multi-pass encoding, if x264_encoder_intra_refresh is called differently in each pass,
- *      behavior is undefined.
- *
- *      Should not be called during an x264_encoder_encode. */
-void    x264_encoder_intra_refresh( x264_t * );
-/* x264_encoder_invalidate_reference:
- *      An interactive error resilience tool, designed for use in a low-latency one-encoder-few-clients
- *      system.  When the client has packet loss or otherwise incorrectly decodes a frame, the encoder
- *      can be told with this command to "forget" the frame and all frames that depend on it, referencing
- *      only frames that occurred before the loss.  This will force a keyframe if no frames are left to
- *      reference after the aforementioned "forgetting".
- *
- *      It is strongly recommended to use a large i_dpb_size in this case, which allows the encoder to
- *      keep around extra, older frames to fall back on in case more recent frames are all invalidated.
- *      Unlike increasing i_frame_reference, this does not increase the number of frames used for motion
- *      estimation and thus has no speed impact.  It is also recommended to set a very large keyframe
- *      interval, so that keyframes are not used except as necessary for error recovery.
- *
- *      x264_encoder_invalidate_reference is not currently compatible with the use of B-frames or intra
- *      refresh.
- *
- *      In multi-pass encoding, if x264_encoder_invalidate_reference is called differently in each pass,
- *      behavior is undefined.
- *
- *      Should not be called during an x264_encoder_encode, but multiple calls can be made simultaneously.
- *
- *      Returns 0 on success, negative on failure. */
-int x264_encoder_invalidate_reference( x264_t *, int64_t pts );
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/x264cli.h b/android/src/main/libenc/jni/libx264/x264cli.h
deleted file mode 100755
index 71be64d..0000000
--- a/android/src/main/libenc/jni/libx264/x264cli.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*****************************************************************************
- * x264cli.h: x264cli common
- *****************************************************************************
- * Copyright (C) 2003-2016 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_CLI_H
-#define X264_CLI_H
-
-#include "common/common.h"
-
-/* In microseconds */
-#define UPDATE_INTERVAL 250000
-
-typedef void *hnd_t;
-
-static inline uint64_t gcd( uint64_t a, uint64_t b )
-{
-    while( 1 )
-    {
-        int64_t c = a % b;
-        if( !c )
-            return b;
-        a = b;
-        b = c;
-    }
-}
-
-static inline uint64_t lcm( uint64_t a, uint64_t b )
-{
-    return ( a / gcd( a, b ) ) * b;
-}
-
-static inline char *get_filename_extension( char *filename )
-{
-    char *ext = filename + strlen( filename );
-    while( *ext != '.' && ext > filename )
-        ext--;
-    ext += *ext == '.';
-    return ext;
-}
-
-void x264_cli_log( const char *name, int i_level, const char *fmt, ... );
-void x264_cli_printf( int i_level, const char *fmt, ... );
-
-#ifdef _WIN32
-void x264_cli_set_console_title( const char *title );
-int x264_ansi_filename( const char *filename, char *ansi_filename, int size, int create_file );
-#else
-#define x264_cli_set_console_title( title )
-#endif
-
-#define RETURN_IF_ERR( cond, name, ret, ... )\
-do\
-{\
-    if( cond )\
-    {\
-        x264_cli_log( name, X264_LOG_ERROR, __VA_ARGS__ );\
-        return ret;\
-    }\
-} while( 0 )
-
-#define FAIL_IF_ERR( cond, name, ... ) RETURN_IF_ERR( cond, name, -1, __VA_ARGS__ )
-
-typedef enum
-{
-    RANGE_AUTO = -1,
-    RANGE_TV,
-    RANGE_PC
-} range_enum;
-
-#endif
diff --git a/android/src/main/libenc/jni/libx264/x264dll.c b/android/src/main/libenc/jni/libx264/x264dll.c
deleted file mode 100755
index 762bc3b..0000000
--- a/android/src/main/libenc/jni/libx264/x264dll.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*****************************************************************************
- * x264dll: x264 DLLMain for win32
- *****************************************************************************
- * Copyright (C) 2009-2016 x264 project
- *
- * Authors: Anton Mitrofanov <BugMaster@narod.ru>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include "common/common.h"
-#include <windows.h>
-
-/* Callback for our DLL so we can initialize pthread */
-BOOL WINAPI DllMain( HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved )
-{
-#if PTW32_STATIC_LIB
-    switch( fdwReason )
-    {
-        case DLL_PROCESS_ATTACH:
-            pthread_win32_process_attach_np();
-
-        case DLL_THREAD_ATTACH:
-            pthread_win32_thread_attach_np();
-            break;
-
-        case DLL_THREAD_DETACH:
-            pthread_win32_thread_detach_np();
-            break;
-
-        case DLL_PROCESS_DETACH:
-            pthread_win32_thread_detach_np();
-            pthread_win32_process_detach_np();
-            break;
-    }
-#endif
-
-    return TRUE;
-}
diff --git a/android/src/main/libenc/jni/libx264/x264res.rc b/android/src/main/libenc/jni/libx264/x264res.rc
deleted file mode 100755
index 89bd6cc..0000000
--- a/android/src/main/libenc/jni/libx264/x264res.rc
+++ /dev/null
@@ -1,82 +0,0 @@
-/*****************************************************************************
- * x264res.rc: windows resource file
- *****************************************************************************
- * Copyright (C) 2012-2016 x264 project
- *
- * Authors: Henrik Gramner <henrik@gramner.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#include <windows.h>
-#include <stdint.h>
-#include "x264.h"
-
-#ifndef X264_REV
-#define X264_REV 0
-#define X264_REV_DIFF 0
-#endif
-
-#define str(s) #s
-#define xstr(s) str(s)
-
-VS_VERSION_INFO VERSIONINFO
-FILEVERSION     0, X264_BUILD, X264_REV, X264_REV_DIFF
-PRODUCTVERSION  0, X264_BUILD, X264_REV, X264_REV_DIFF
-FILEFLAGSMASK   VS_FFI_FILEFLAGSMASK
-#ifdef DEBUG
-FILEFLAGS       VS_FF_DEBUG
-#else
-FILEFLAGS       0
-#endif
-FILEOS          VOS_NT_WINDOWS32 /* Identical for x86-64 */
-#ifdef DLL
-FILETYPE        VFT_DLL
-#else
-FILETYPE        VFT_APP
-#endif
-FILESUBTYPE     VFT2_UNKNOWN
-BEGIN
-    BLOCK "StringFileInfo"
-    BEGIN
-        BLOCK "040904B0"
-        BEGIN
-            VALUE "CompanyName",      "x264 project"
-#ifdef DLL
-            VALUE "FileDescription",  "H.264 (MPEG-4 AVC) encoder library"
-#else
-            VALUE "FileDescription",  "H.264 (MPEG-4 AVC) encoder"
-#endif
-            VALUE "FileVersion",      X264_POINTVER
-            VALUE "InternalName",     "x264"
-            VALUE "LegalCopyright",   "Copyright (C) 2003-2016 x264 project"
-#ifdef DLL
-            VALUE "OriginalFilename", "libx264-" xstr(X264_BUILD) ".dll"
-#else
-            VALUE "OriginalFilename", "x264.exe"
-#endif
-            VALUE "ProductName",      "x264"
-            VALUE "ProductVersion",   X264_POINTVER
-        END
-    END
-
-    BLOCK "VarFileInfo"
-    BEGIN
-        VALUE "Translation", 0x0409, 0x04B0 /* U.S. English (Unicode) */
-    END
-END
diff --git a/android/src/main/libenc/jni/libyuv/LICENSE b/android/src/main/libenc/jni/libyuv/LICENSE
deleted file mode 100755
index c911747..0000000
--- a/android/src/main/libenc/jni/libyuv/LICENSE
+++ /dev/null
@@ -1,29 +0,0 @@
-Copyright 2011 The LibYuv Project Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in
-    the documentation and/or other materials provided with the
-    distribution.
-
-  * Neither the name of Google nor the names of its contributors may
-    be used to endorse or promote products derived from this software
-    without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/android/src/main/libenc/jni/libyuv/jni/Android.mk b/android/src/main/libenc/jni/libyuv/jni/Android.mk
deleted file mode 100755
index 34f15c1..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/Android.mk
+++ /dev/null
@@ -1,71 +0,0 @@
-# This is the Android makefile for libyuv for both platform and NDK.
-LOCAL_PATH:= $(call my-dir)
-
-include $(CLEAR_VARS)
-
-LOCAL_CPP_EXTENSION := .cc
-
-LOCAL_SRC_FILES := \
-    source/compare.cc           \
-    source/compare_common.cc    \
-    source/convert.cc           \
-    source/convert_argb.cc      \
-    source/convert_from.cc      \
-    source/convert_from_argb.cc \
-    source/convert_to_argb.cc   \
-    source/convert_to_i420.cc   \
-    source/cpu_id.cc            \
-    source/planar_functions.cc  \
-    source/rotate.cc            \
-    source/rotate_any.cc        \
-    source/rotate_argb.cc       \
-    source/rotate_common.cc     \
-    source/row_any.cc           \
-    source/row_common.cc        \
-    source/scale.cc             \
-    source/scale_any.cc         \
-    source/scale_argb.cc        \
-    source/scale_common.cc      \
-    source/video_common.cc
-
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-    LOCAL_CFLAGS += -DLIBYUV_NEON
-    LOCAL_SRC_FILES += \
-        source/compare_neon.cc.neon    \
-        source/rotate_neon.cc.neon     \
-        source/row_neon.cc.neon        \
-        source/scale_neon.cc.neon
-endif
-
-ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-    LOCAL_CFLAGS += -DLIBYUV_NEON
-    LOCAL_SRC_FILES += \
-        source/compare_neon64.cc    \
-        source/rotate_neon64.cc     \
-        source/row_neon64.cc        \
-        source/scale_neon64.cc 
-endif
-
-ifeq ($(TARGET_ARCH_ABI),$(filter $(TARGET_ARCH_ABI), x86 x86_64))
-    LOCAL_SRC_FILES += \
-        source/compare_gcc.cc       \
-        source/rotate_gcc.cc        \
-        source/row_gcc.cc           \
-        source/scale_gcc.cc
-endif
-
-ifeq ($(TARGET_ARCH_ABI),$(filter $(TARGET_ARCH_ABI), mips mips_64))
-    LOCAL_SRC_FILES += \
-        source/rotate_mips.cc        \
-        source/row_mips.cc           \
-        source/scale_mips.cc
-endif
-
-LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
-LOCAL_LDLIBS := -llog
-LOCAL_MODULE := libyuv
-LOCAL_MODULE_TAGS := optional
-
-include $(BUILD_SHARED_LIBRARY)
-#include $(BUILD_STATIC_LIBRARY)
\ No newline at end of file
diff --git a/android/src/main/libenc/jni/libyuv/jni/Application.mk b/android/src/main/libenc/jni/libyuv/jni/Application.mk
deleted file mode 100755
index baad0ac..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/Application.mk
+++ /dev/null
@@ -1,2 +0,0 @@
-APP_ABI := armeabi-v7a x86
-APP_PLATFORM := android-19
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv.h
deleted file mode 100755
index de65283..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_H_  // NOLINT
-#define INCLUDE_LIBYUV_H_
-
-#include "libyuv/basic_types.h"
-#include "libyuv/compare.h"
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-#include "libyuv/convert_from.h"
-#include "libyuv/convert_from_argb.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/mjpeg_decoder.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/scale.h"
-#include "libyuv/scale_argb.h"
-#include "libyuv/scale_row.h"
-#include "libyuv/version.h"
-#include "libyuv/video_common.h"
-
-#endif  // INCLUDE_LIBYUV_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/basic_types.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/basic_types.h
deleted file mode 100755
index beb750b..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/basic_types.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
-#define INCLUDE_LIBYUV_BASIC_TYPES_H_
-
-#include <stddef.h>  // for NULL, size_t
-
-#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
-#include <sys/types.h>  // for uintptr_t on x86
-#else
-#include <stdint.h>  // for uintptr_t
-#endif
-
-#ifndef GG_LONGLONG
-#ifndef INT_TYPES_DEFINED
-#define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef unsigned __int64 uint64;
-typedef __int64 int64;
-#ifndef INT64_C
-#define INT64_C(x) x ## I64
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UI64
-#endif
-#define INT64_F "I64"
-#else  // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64;  // NOLINT
-typedef long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## L
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UL
-#endif
-#define INT64_F "l"
-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64;  // NOLINT
-typedef long long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## LL
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## ULL
-#endif
-#define INT64_F "ll"
-#endif  // __LP64__
-#endif  // COMPILER_MSVC
-typedef unsigned int uint32;
-typedef int int32;
-typedef unsigned short uint16;  // NOLINT
-typedef short int16;  // NOLINT
-typedef unsigned char uint8;
-typedef signed char int8;
-#endif  // INT_TYPES_DEFINED
-#endif  // GG_LONGLONG
-
-// Detect compiler is for x86 or x64.
-#if defined(__x86_64__) || defined(_M_X64) || \
-    defined(__i386__) || defined(_M_IX86)
-#define CPU_X86 1
-#endif
-// Detect compiler is for ARM.
-#if defined(__arm__) || defined(_M_ARM)
-#define CPU_ARM 1
-#endif
-
-#ifndef ALIGNP
-#ifdef __cplusplus
-#define ALIGNP(p, t) \
-    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
-    ((t) - 1)) & ~((t) - 1))))
-#else
-#define ALIGNP(p, t) \
-    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
-#endif
-#endif
-
-#if !defined(LIBYUV_API)
-#if defined(_WIN32) || defined(__CYGWIN__)
-#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
-#define LIBYUV_API __declspec(dllexport)
-#elif defined(LIBYUV_USING_SHARED_LIBRARY)
-#define LIBYUV_API __declspec(dllimport)
-#else
-#define LIBYUV_API
-#endif  // LIBYUV_BUILDING_SHARED_LIBRARY
-#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
-    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
-    defined(LIBYUV_USING_SHARED_LIBRARY))
-#define LIBYUV_API __attribute__ ((visibility ("default")))
-#else
-#define LIBYUV_API
-#endif  // __GNUC__
-#endif  // LIBYUV_API
-
-#define LIBYUV_BOOL int
-#define LIBYUV_FALSE 0
-#define LIBYUV_TRUE 1
-
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
-  defined(__i386__) || defined(_M_IX86) || \
-  defined(__arm__) || defined(_M_ARM) || \
-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/compare.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/compare.h
deleted file mode 100755
index 08b2bb2..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/compare.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
-#define INCLUDE_LIBYUV_COMPARE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Compute a hash for specified memory. Seed of 5381 recommended.
-LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
-
-// Scan an opaque argb image and return fourcc based on alpha offset.
-// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
-LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
-
-// Sum Square Error - used to compute Mean Square Error or PSNR.
-LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a,
-                             const uint8* src_b, int count);
-
-LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height);
-
-static const int kMaxPsnr = 128;
-
-LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count);
-
-LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
-
-LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
-
-LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
-
-LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/compare_row.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/compare_row.h
deleted file mode 100755
index 38a957b..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/compare_row.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_  // NOLINT
-#define INCLUDE_LIBYUV_COMPARE_ROW_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-
-// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
-#define VISUALC_HAS_AVX2 1
-#endif  // VisualStudio >= 2012
-
-// clang >= 3.4.0 required for AVX2.
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
-#define CLANG_HAS_AVX2 1
-#endif  // clang >= 3.4
-#endif  // __clang__
-
-#if !defined(LIBYUV_DISABLE_X86) && \
-    defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_HASHDJB2_AVX2
-#endif
-
-// The following are available for Visual C and GCC:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
-#define HAS_HASHDJB2_SSE41
-#define HAS_SUMSQUAREERROR_SSE2
-#endif
-
-// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
-    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_HASHDJB2_AVX2
-#define HAS_SUMSQUAREERROR_AVX2
-#endif
-
-// The following are available for Neon:
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SUMSQUAREERROR_NEON
-#endif
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_COMPARE_ROW_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert.h
deleted file mode 100755
index a8d3fa0..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_H_
-
-#include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes.
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert I444 to I420.
-LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert I422 to I420.
-LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert I411 to I420.
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Copy I420 to I420.
-#define I420ToI420 I420Copy
-LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Convert I400 (grey) to I420.
-LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-#define J400ToJ420 I400ToI420
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert NV21 to I420.
-LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert YUY2 to I420.
-LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert UYVY to I420.
-LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// ARGB little endian (bgra in memory) to I420.
-LIBYUV_API
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// BGRA little endian (argb in memory) to I420.
-LIBYUV_API
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// ABGR little endian (rgba in memory) to I420.
-LIBYUV_API
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// RGBA little endian (abgr in memory) to I420.
-LIBYUV_API
-int RGBAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// RGB little endian (bgr in memory) to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height);
-
-// RGB big endian (rgb in memory) to I420.
-LIBYUV_API
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
-
-// RGB16 (RGBP fourcc) little endian to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height);
-
-// RGB15 (RGBO fourcc) little endian to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
-
-// RGB12 (R444 fourcc) little endian to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
-
-#ifdef HAVE_JPEG
-// src_width/height provided by capture.
-// dst_width/height for clipping determine final size.
-LIBYUV_API
-int MJPGToI420(const uint8* sample, size_t sample_size,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
-
-// Query size of MJPG in pixels.
-LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height);
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
-// "dst_stride_y" number of bytes in a row of the dst_y plane.
-//   Normally this would be the same as dst_width, with recommended alignment
-//   to 16 bytes for better efficiency.
-//   If rotation of 90 or 270 is used, stride is affected. The caller should
-//   allocate the I420 buffer according to rotation.
-// "dst_stride_u" number of bytes in a row of the dst_u plane.
-//   Normally this would be the same as (dst_width + 1) / 2, with
-//   recommended alignment to 16 bytes for better efficiency.
-//   If rotation of 90 or 270 is used, stride is affected.
-// "crop_x" and "crop_y" are starting position for cropping.
-//   To center, crop_x = (src_width - dst_width) / 2
-//              crop_y = (src_height - dst_height) / 2
-// "src_width" / "src_height" is size of src_frame in pixels.
-//   "src_height" can be negative indicating a vertically flipped image source.
-// "crop_width" / "crop_height" is the size to crop the src to.
-//    Must be less than or equal to src_width/src_height
-//    Cropping parameters are pre-rotation.
-// "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
-LIBYUV_API
-int ConvertToI420(const uint8* src_frame, size_t src_size,
-                  uint8* dst_y, int dst_stride_y,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
-                  enum RotationMode rotation,
-                  uint32 format);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_argb.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_argb.h
deleted file mode 100755
index ce4e3d0..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_argb.h
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
-
-#include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Alias.
-#define ARGBToARGB ARGBCopy
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert I420 with Alpha to preattenuated ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate);
-
-// Convert I420 with Alpha to preattenuated ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate);
-
-// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
-LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert J400 (jpeg grey) to ARGB.
-LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Alias.
-#define YToARGB I400ToARGB
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// BGRA little endian (argb in memory) to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// ABGR little endian (rgba in memory) to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// RGBA little endian (abgr in memory) to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Deprecated function name.
-#define BG24ToARGB RGB24ToARGB
-
-// RGB little endian (bgr in memory) to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
-
-// RGB big endian (rgb in memory) to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
-
-// RGB16 (RGBP fourcc) little endian to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
-
-// RGB15 (RGBO fourcc) little endian to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
-
-// RGB12 (R444 fourcc) little endian to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
-
-#ifdef HAVE_JPEG
-// src_width/height provided by capture
-// dst_width/height for clipping determine final size.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
-#endif
-
-// Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
-// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
-//   Normally this would be the same as dst_width, with recommended alignment
-//   to 16 bytes for better efficiency.
-//   If rotation of 90 or 270 is used, stride is affected. The caller should
-//   allocate the I420 buffer according to rotation.
-// "dst_stride_u" number of bytes in a row of the dst_u plane.
-//   Normally this would be the same as (dst_width + 1) / 2, with
-//   recommended alignment to 16 bytes for better efficiency.
-//   If rotation of 90 or 270 is used, stride is affected.
-// "crop_x" and "crop_y" are starting position for cropping.
-//   To center, crop_x = (src_width - dst_width) / 2
-//              crop_y = (src_height - dst_height) / 2
-// "src_width" / "src_height" is size of src_frame in pixels.
-//   "src_height" can be negative indicating a vertically flipped image source.
-// "crop_width" / "crop_height" is the size to crop the src to.
-//    Must be less than or equal to src_width/src_height
-//    Cropping parameters are pre-rotation.
-// "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
-LIBYUV_API
-int ConvertToARGB(const uint8* src_frame, size_t src_size,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
-                  enum RotationMode rotation,
-                  uint32 format);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_from.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_from.h
deleted file mode 100755
index 9fd8d4d..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_from.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_FROM_H_
-
-#include "libyuv/basic_types.h"
-#include "libyuv/rotate.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// See Also convert.h for conversions from formats to I420.
-
-// I420Copy in convert to I420ToI420.
-
-LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
-LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height);
-
-// TODO(fbarchard): I420ToM420
-
-LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
-
-LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_frame, int dst_stride_frame,
-              int width, int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_frame, int dst_stride_frame,
-                 int width, int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_frame, int dst_stride_frame,
-                       const uint8* dither4x4, int width, int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
-
-// Convert I420 to specified format.
-// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
-//    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
-LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 format);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_from_argb.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_from_argb.h
deleted file mode 100755
index 1df5320..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/convert_from_argb.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB to ARGB.
-#define ARGBToARGB ARGBCopy
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
-
-// Convert ARGB To BGRA.
-LIBYUV_API
-int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
-
-// Convert ARGB To ABGR.
-LIBYUV_API
-int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert ARGB To RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height);
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_rgb, int dst_stride_rgb,
-              int width, int height);
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
-
-// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
-// const uint8(*dither)[4][4];
-LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height);
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height);
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height);
-
-// Convert ARGB To I444.
-LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB To I422.
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB To I420. (also in convert.h)
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB to J422.
-LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB to J400. (JPeg full range).
-LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height);
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
-LIBYUV_API
-int ARGBToG(const uint8* src_argb, int src_stride_argb,
-            uint8* dst_g, int dst_stride_g,
-            int width, int height);
-
-// Convert ARGB To NV12.
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
-
-// Convert ARGB To YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height);
-
-// Convert ARGB To UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/cpu_id.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/cpu_id.h
deleted file mode 100755
index 2ccc3e7..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/cpu_id.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
-#define INCLUDE_LIBYUV_CPU_ID_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Internal flag to indicate cpuid requires initialization.
-static const int kCpuInitialized = 0x1;
-
-// These flags are only valid on ARM processors.
-static const int kCpuHasARM = 0x2;
-static const int kCpuHasNEON = 0x4;
-// 0x8 reserved for future ARM flag.
-
-// These flags are only valid on x86 processors.
-static const int kCpuHasX86 = 0x10;
-static const int kCpuHasSSE2 = 0x20;
-static const int kCpuHasSSSE3 = 0x40;
-static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100;
-static const int kCpuHasAVX = 0x200;
-static const int kCpuHasAVX2 = 0x400;
-static const int kCpuHasERMS = 0x800;
-static const int kCpuHasFMA3 = 0x1000;
-static const int kCpuHasAVX3 = 0x2000;
-// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
-
-// These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasDSPR2 = 0x20000;
-
-// Internal function used to auto-init.
-LIBYUV_API
-int InitCpuFlags(void);
-
-// Internal function for parsing /proc/cpuinfo.
-LIBYUV_API
-int ArmCpuCaps(const char* cpuinfo_name);
-
-// Detect CPU has SSE2 etc.
-// Test_flag parameter should be one of kCpuHas constants above.
-// returns non-zero if instruction set is detected
-static __inline int TestCpuFlag(int test_flag) {
-  LIBYUV_API extern int cpu_info_;
-  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
-}
-
-// For testing, allow CPU flags to be disabled.
-// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
-// MaskCpuFlags(-1) to enable all cpu specific optimizations.
-// MaskCpuFlags(0) to disable all cpu specific optimizations.
-LIBYUV_API
-void MaskCpuFlags(int enable_flags);
-
-// Low level cpuid for X86. Returns zeros on other CPUs.
-// eax is the info type that you want.
-// ecx is typically the cpu number, and should normally be zero.
-LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/mjpeg_decoder.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/mjpeg_decoder.h
deleted file mode 100755
index 8423121..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/mjpeg_decoder.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
-#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-// NOTE: For a simplified public API use convert.h MJPGToI420().
-
-struct jpeg_common_struct;
-struct jpeg_decompress_struct;
-struct jpeg_source_mgr;
-
-namespace libyuv {
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-static const uint32 kUnknownDataSize = 0xFFFFFFFF;
-
-enum JpegSubsamplingType {
-  kJpegYuv420,
-  kJpegYuv422,
-  kJpegYuv411,
-  kJpegYuv444,
-  kJpegYuv400,
-  kJpegUnknown
-};
-
-struct Buffer {
-  const uint8* data;
-  int len;
-};
-
-struct BufferVector {
-  Buffer* buffers;
-  int len;
-  int pos;
-};
-
-struct SetJmpErrorMgr;
-
-// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
-// simply independent JPEG images with a fixed huffman table (which is omitted).
-// It is rarely used in video transmission, but is common as a camera capture
-// format, especially in Logitech devices. This class implements a decoder for
-// MJPEG frames.
-//
-// See http://tools.ietf.org/html/rfc2435
-class LIBYUV_API MJpegDecoder {
- public:
-  typedef void (*CallbackFunction)(void* opaque,
-                                   const uint8* const* data,
-                                   const int* strides,
-                                   int rows);
-
-  static const int kColorSpaceUnknown;
-  static const int kColorSpaceGrayscale;
-  static const int kColorSpaceRgb;
-  static const int kColorSpaceYCbCr;
-  static const int kColorSpaceCMYK;
-  static const int kColorSpaceYCCK;
-
-  MJpegDecoder();
-  ~MJpegDecoder();
-
-  // Loads a new frame, reads its headers, and determines the uncompressed
-  // image format.
-  // Returns LIBYUV_TRUE if image looks valid and format is supported.
-  // If return value is LIBYUV_TRUE, then the values for all the following
-  // getters are populated.
-  // src_len is the size of the compressed mjpeg frame in bytes.
-  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
-
-  // Returns width of the last loaded frame in pixels.
-  int GetWidth();
-
-  // Returns height of the last loaded frame in pixels.
-  int GetHeight();
-
-  // Returns format of the last loaded frame. The return value is one of the
-  // kColorSpace* constants.
-  int GetColorSpace();
-
-  // Number of color components in the color space.
-  int GetNumComponents();
-
-  // Sample factors of the n-th component.
-  int GetHorizSampFactor(int component);
-
-  int GetVertSampFactor(int component);
-
-  int GetHorizSubSampFactor(int component);
-
-  int GetVertSubSampFactor(int component);
-
-  // Public for testability.
-  int GetImageScanlinesPerImcuRow();
-
-  // Public for testability.
-  int GetComponentScanlinesPerImcuRow(int component);
-
-  // Width of a component in bytes.
-  int GetComponentWidth(int component);
-
-  // Height of a component.
-  int GetComponentHeight(int component);
-
-  // Width of a component in bytes with padding for DCTSIZE. Public for testing.
-  int GetComponentStride(int component);
-
-  // Size of a component in bytes.
-  int GetComponentSize(int component);
-
-  // Call this after LoadFrame() if you decide you don't want to decode it
-  // after all.
-  LIBYUV_BOOL UnloadFrame();
-
-  // Decodes the entire image into a one-buffer-per-color-component format.
-  // dst_width must match exactly. dst_height must be <= to image height; if
-  // less, the image is cropped. "planes" must have size equal to at least
-  // GetNumComponents() and they must point to non-overlapping buffers of size
-  // at least GetComponentSize(i). The pointers in planes are incremented
-  // to point to after the end of the written data.
-  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
-
-  // Decodes the entire image and passes the data via repeated calls to a
-  // callback function. Each call will get the data for a whole number of
-  // image scanlines.
-  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
-                        int dst_width, int dst_height);
-
-  // The helper function which recognizes the jpeg sub-sampling type.
-  static JpegSubsamplingType JpegSubsamplingTypeHelper(
-     int* subsample_x, int* subsample_y, int number_of_components);
-
- private:
-  void AllocOutputBuffers(int num_outbufs);
-  void DestroyOutputBuffers();
-
-  LIBYUV_BOOL StartDecode();
-  LIBYUV_BOOL FinishDecode();
-
-  void SetScanlinePointers(uint8** data);
-  LIBYUV_BOOL DecodeImcuRow();
-
-  int GetComponentScanlinePadding(int component);
-
-  // A buffer holding the input data for a frame.
-  Buffer buf_;
-  BufferVector buf_vec_;
-
-  jpeg_decompress_struct* decompress_struct_;
-  jpeg_source_mgr* source_mgr_;
-  SetJmpErrorMgr* error_mgr_;
-
-  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
-  // GetComponentScanlinePadding() != 0.)
-  LIBYUV_BOOL has_scanline_padding_;
-
-  // Temporaries used to point to scanline outputs.
-  int num_outbufs_;  // Outermost size of all arrays below.
-  uint8*** scanlines_;
-  int* scanlines_sizes_;
-  // Temporary buffer used for decoding when we can't decode directly to the
-  // output buffers. Large enough for just one iMCU row.
-  uint8** databuf_;
-  int* databuf_strides_;
-};
-
-}  // namespace libyuv
-
-#endif  //  __cplusplus
-#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/planar_functions.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/planar_functions.h
deleted file mode 100755
index 9c19a59..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/planar_functions.h
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
-#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
-
-#include "libyuv/basic_types.h"
-
-// TODO(fbarchard): Remove the following headers includes.
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy a plane of data.
-LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height);
-
-// Set a plane of data to a 32 bit value.
-LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value);
-
-// Copy I400.  Supports inverting.
-LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-#define J400ToJ400 I400ToI400
-
-// Copy I422 to I422.
-#define I422ToI422 I422Copy
-LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Copy I444 to I444.
-#define I444ToI444 I444Copy
-LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Convert YUY2 to I422.
-LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert UYVY to I422.
-LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-// Convert I420 to I400. (calls CopyPlane ignoring u/v).
-LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Alias
-#define J420ToJ400 I420ToI400
-#define I420ToI420Mirror I420Mirror
-
-// I420 mirror.
-LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Alias
-#define I400ToI400Mirror I400Mirror
-
-// I400 mirror.  A single plane is mirrored horizontally.
-// Pass negative height to achieve 180 degree rotation.
-LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Alias
-#define ARGBToARGBMirror ARGBMirror
-
-// ARGB mirror.
-LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
-
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
-
-// Alias
-#define RGB24ToRAW RAWToRGB24
-
-LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height);
-
-// Draw a rectangle into I420.
-LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y, int width, int height,
-             int value_y, int value_u, int value_v);
-
-// Draw a rectangle into ARGB.
-LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height, uint32 value);
-
-// Convert ARGB to gray scale ARGB.
-LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Make a rectangle of ARGB gray scale.
-LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height);
-
-// Make a rectangle of ARGB Sepia tone.
-LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int x, int y, int width, int height);
-
-// Apply a matrix rotation to each ARGB pixel.
-// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
-// The next 4 coefficients apply to B, G, R, A and produce R of the output.
-// The last 4 coefficients apply to B, G, R, A and produce A of the output.
-LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height);
-
-// Deprecated. Use ARGBColorMatrix instead.
-// Apply a matrix rotation to each ARGB pixel.
-// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
-// The last 4 coefficients apply to B, G, R, A and produce R of the output.
-LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int x, int y, int width, int height);
-
-// Apply a color table each ARGB pixel.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int x, int y, int width, int height);
-
-// Apply a color table each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int x, int y, int width, int height);
-
-// Apply a luma/color table each ARGB pixel but preserve destination alpha.
-// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
-// RGB (YJ style) and C is an 8 bit color component (R, G or B).
-LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma_rgb_table,
-                       int width, int height);
-
-// Apply a 3 term polynomial to ARGB values.
-// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
-// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
-// g squared, r squared and a squared.  The 4rd row is coefficients for b to
-// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
-// result clamped to 0 to 255.
-// A polynomial approximation can be dirived using software such as 'R'.
-
-LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
-                   const float* poly,
-                   int width, int height);
-
-// Quantize a rectangle of ARGB. Alpha unaffected.
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
-// interval_size should be a value between 1 and 255.
-// interval_offset should be a value between 0 and 255.
-LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int x, int y, int width, int height);
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
-
-// Copy Alpha channel of ARGB to alpha of ARGB.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
-
-// Copy Y channel to Alpha of ARGB.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height);
-
-typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
-                             uint8* dst_argb, int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
-// Alpha Blend ARGB images and store to destination.
-// Source is pre-multiplied by alpha using ARGBAttenuate.
-// Alpha of destination is set to 255.
-LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
-
-// Alpha Blend plane and store to destination.
-// Source is not pre-multiplied by alpha.
-LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Alpha Blend YUV images and store to destination.
-// Source is not pre-multiplied by alpha.
-// Alpha is full width x height and subsampled to half size to apply to UV.
-LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
-
-// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
-LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
-
-// Add ARGB image with ARGB image. Saturates to 255.
-LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height);
-
-// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
-LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
-
-// Convert I422 to YUY2.
-LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-// Convert I422 to UYVY.
-LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-// Convert unattentuated ARGB to preattenuated ARGB.
-LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
-
-// Convert preattentuated ARGB to unattenuated ARGB.
-LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-// Internal function - do not call directly.
-// Computes table of cumulative sum for image where the value is the sum
-// of all values above and to the left of the entry. Used by ARGBBlur.
-LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height);
-
-// Blur ARGB image.
-// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
-//   16 byte boundary.
-// dst_stride32_cumsum is number of ints in a row (width * 4).
-// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
-// Blur is optimized for radius of 5 (11x11) or less.
-LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius);
-
-// Multiply ARGB image by ARGB value.
-LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value);
-
-// Interpolate between two images using specified amount of interpolation
-// (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
-// and 255 means 1% src0 and 99% src1.
-LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation);
-
-// Interpolate between two ARGB images using specified amount of interpolation
-// Internally calls InterpolatePlane with width * 4 (bpp).
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation);
-
-// Interpolate between two YUV images using specified amount of interpolation
-// Internally calls InterpolatePlane on each plane where the U and V planes
-// are half width and half height.
-LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation);
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBAFFINEROW_SSE2
-#endif
-
-// Row function for copying pixels from a source with a slope to a row
-// of destination. Useful for scaling, rotation, mirror, texture mapping.
-LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
-
-// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
-// shuffler is 16 bytes and must be aligned.
-LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height);
-
-// Sobel ARGB effect with planar output.
-LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height);
-
-// Sobel ARGB effect.
-LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
-
-// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
-LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate.h
deleted file mode 100755
index 8af60b8..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
-#define INCLUDE_LIBYUV_ROTATE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Supported rotation.
-typedef enum RotationMode {
-  kRotate0 = 0,  // No rotation.
-  kRotate90 = 90,  // Rotate 90 degrees clockwise.
-  kRotate180 = 180,  // Rotate 180 degrees.
-  kRotate270 = 270,  // Rotate 270 degrees clockwise.
-
-  // Deprecated.
-  kRotateNone = 0,
-  kRotateClockwise = 90,
-  kRotateCounterClockwise = 270,
-} RotationModeEnum;
-
-// Rotate I420 frame.
-LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height, enum RotationMode mode);
-
-// Rotate NV12 input and store in I420.
-LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int src_width, int src_height, enum RotationMode mode);
-
-// Rotate a plane by 0, 90, 180, or 270.
-LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int src_width, int src_height, enum RotationMode mode);
-
-// Rotate planes by 90, 180, 270. Deprecated.
-LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height);
-
-LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
-
-LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
-
-LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height);
-
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
-LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
-
-LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
-
-// The 90 and 270 functions are based on transposes.
-// Doing a transpose with reversing the read/write
-// order will result in a rotation by +- 90 degrees.
-// Deprecated.
-LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
-
-LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate_argb.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate_argb.h
deleted file mode 100755
index 660ff55..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate_argb.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
-#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
-
-#include "libyuv/basic_types.h"
-#include "libyuv/rotate.h"  // For RotationMode.
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Rotate ARGB frame
-LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height, enum RotationMode mode);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate_row.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate_row.h
deleted file mode 100755
index ebc487f..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/rotate_row.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
-#define INCLUDE_LIBYUV_ROTATE_ROW_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-#define HAS_TRANSPOSEWX8_SSSE3
-#define HAS_TRANSPOSEUVWX8_SSE2
-#endif
-
-// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
-#define HAS_TRANSPOSEWX8_SSSE3
-#endif
-
-// The following are available for 64 bit GCC but not NaCL:
-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
-    defined(__x86_64__)
-#define HAS_TRANSPOSEWX8_FAST_SSSE3
-#define HAS_TRANSPOSEUVWX8_SSE2
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_TRANSPOSEWX8_NEON
-#define HAS_TRANSPOSEUVWX8_NEON
-#endif
-
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSEWX8_DSPR2
-#define HAS_TRANSPOSEUVWX8_DSPR2
-#endif  // defined(__mips__)
-
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height);
-
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width);
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width);
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-
-void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
-                                 uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
-
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height);
-
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                          uint8* dst_a, int dst_stride_a,
-                          uint8* dst_b, int dst_stride_b, int width);
-
-void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
-                              uint8* dst_a, int dst_stride_a,
-                              uint8* dst_b, int dst_stride_b, int width);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/row.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/row.h
deleted file mode 100755
index b5d9aaa..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/row.h
+++ /dev/null
@@ -1,1929 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
-#define INCLUDE_LIBYUV_ROW_H_
-
-#include <stdlib.h>  // For malloc.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#ifdef __cplusplus
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
-  uint8* var = reinterpret_cast<uint8*>                                        \
-      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
-#else
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
-  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
-#endif
-
-#define free_aligned_buffer_64(var) \
-  free(var##_mem);  \
-  var = 0
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-// True if compiling for SSSE3 as a requirement.
-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
-#define LIBYUV_SSSE3_ONLY
-#endif
-
-#if defined(__native_client__)
-#define LIBYUV_DISABLE_NEON
-#endif
-// clang >= 3.5.0 required for Arm64.
-#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
-#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
-#define LIBYUV_DISABLE_NEON
-#endif  // clang >= 3.5
-#endif  // __clang__
-
-// GCC >= 4.7.0 required for AVX2.
-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-#define GCC_HAS_AVX2 1
-#endif  // GNUC >= 4.7
-#endif  // __GNUC__
-
-// clang >= 3.4.0 required for AVX2.
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
-#define CLANG_HAS_AVX2 1
-#endif  // clang >= 3.4
-#endif  // __clang__
-
-// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
-#define VISUALC_HAS_AVX2 1
-#endif  // VisualStudio >= 2012
-
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-// Conversions:
-#define HAS_ABGRTOUVROW_SSSE3
-#define HAS_ABGRTOYROW_SSSE3
-#define HAS_ARGB1555TOARGBROW_SSE2
-#define HAS_ARGB4444TOARGBROW_SSE2
-#define HAS_ARGBSETROW_X86
-#define HAS_ARGBSHUFFLEROW_SSE2
-#define HAS_ARGBSHUFFLEROW_SSSE3
-#define HAS_ARGBTOARGB1555ROW_SSE2
-#define HAS_ARGBTOARGB4444ROW_SSE2
-#define HAS_ARGBTORAWROW_SSSE3
-#define HAS_ARGBTORGB24ROW_SSSE3
-#define HAS_ARGBTORGB565DITHERROW_SSE2
-#define HAS_ARGBTORGB565ROW_SSE2
-#define HAS_ARGBTOUV444ROW_SSSE3
-#define HAS_ARGBTOUVJROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_ARGBTOYJROW_SSSE3
-#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_BGRATOYROW_SSSE3
-#define HAS_COPYROW_ERMS
-#define HAS_COPYROW_SSE2
-#define HAS_H422TOARGBROW_SSSE3
-#define HAS_I400TOARGBROW_SSE2
-#define HAS_I422TOARGB1555ROW_SSSE3
-#define HAS_I422TOARGB4444ROW_SSSE3
-#define HAS_I422TOARGBROW_SSSE3
-#define HAS_I422TORGB24ROW_SSSE3
-#define HAS_I422TORGB565ROW_SSSE3
-#define HAS_I422TORGBAROW_SSSE3
-#define HAS_I422TOUYVYROW_SSE2
-#define HAS_I422TOYUY2ROW_SSE2
-#define HAS_I444TOARGBROW_SSSE3
-#define HAS_J400TOARGBROW_SSE2
-#define HAS_J422TOARGBROW_SSSE3
-#define HAS_MERGEUVROW_SSE2
-#define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
-#define HAS_NV12TOARGBROW_SSSE3
-#define HAS_NV12TORGB565ROW_SSSE3
-#define HAS_NV21TOARGBROW_SSSE3
-#define HAS_RAWTOARGBROW_SSSE3
-#define HAS_RAWTORGB24ROW_SSSE3
-#define HAS_RAWTOYROW_SSSE3
-#define HAS_RGB24TOARGBROW_SSSE3
-#define HAS_RGB24TOYROW_SSSE3
-#define HAS_RGB565TOARGBROW_SSE2
-#define HAS_RGBATOUVROW_SSSE3
-#define HAS_RGBATOYROW_SSSE3
-#define HAS_SETROW_ERMS
-#define HAS_SETROW_X86
-#define HAS_SPLITUVROW_SSE2
-#define HAS_UYVYTOARGBROW_SSSE3
-#define HAS_UYVYTOUV422ROW_SSE2
-#define HAS_UYVYTOUVROW_SSE2
-#define HAS_UYVYTOYROW_SSE2
-#define HAS_YUY2TOARGBROW_SSSE3
-#define HAS_YUY2TOUV422ROW_SSE2
-#define HAS_YUY2TOUVROW_SSE2
-#define HAS_YUY2TOYROW_SSE2
-
-// Effects:
-#define HAS_ARGBADDROW_SSE2
-#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
-#define HAS_ARGBBLENDROW_SSSE3
-#define HAS_ARGBCOLORMATRIXROW_SSSE3
-#define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYYTOALPHAROW_SSE2
-#define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSE2
-#define HAS_ARGBMULTIPLYROW_SSE2
-#define HAS_ARGBPOLYNOMIALROW_SSE2
-#define HAS_ARGBQUANTIZEROW_SSE2
-#define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBUNATTENUATEROW_SSE2
-#define HAS_BLENDPLANEROW_SSSE3
-#define HAS_COMPUTECUMULATIVESUMROW_SSE2
-#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
-#define HAS_RGBCOLORTABLEROW_X86
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELTOPLANEROW_SSE2
-#define HAS_SOBELXROW_SSE2
-#define HAS_SOBELXYROW_SSE2
-#define HAS_SOBELYROW_SSE2
-
-// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
-// caveat: clangcl uses row_win.cc which works.
-#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
-    !defined(__i386__) || defined(_MSC_VER)
-// TODO(fbarchard): fix build error on x86 debug
-// https://code.google.com/p/libyuv/issues/detail?id=524
-#define HAS_I411TOARGBROW_SSSE3
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_SSSE3
-#endif
-#endif
-
-// The following are available on all x86 platforms, but
-// require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-#define HAS_ARGBCOPYALPHAROW_AVX2
-#define HAS_ARGBCOPYYTOALPHAROW_AVX2
-#define HAS_ARGBMIRRORROW_AVX2
-#define HAS_ARGBPOLYNOMIALROW_AVX2
-#define HAS_ARGBSHUFFLEROW_AVX2
-#define HAS_ARGBTORGB565DITHERROW_AVX2
-#define HAS_ARGBTOUVJROW_AVX2
-#define HAS_ARGBTOUVROW_AVX2
-#define HAS_ARGBTOYJROW_AVX2
-#define HAS_ARGBTOYROW_AVX2
-#define HAS_COPYROW_AVX
-#define HAS_H422TOARGBROW_AVX2
-#define HAS_I400TOARGBROW_AVX2
-#if !(defined(_DEBUG) && defined(__i386__))
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_AVX2
-#endif
-#define HAS_I411TOARGBROW_AVX2
-#define HAS_I422TOARGB1555ROW_AVX2
-#define HAS_I422TOARGB4444ROW_AVX2
-#define HAS_I422TOARGBROW_AVX2
-#define HAS_I422TORGB24ROW_AVX2
-#define HAS_I422TORGB565ROW_AVX2
-#define HAS_I422TORGBAROW_AVX2
-#define HAS_I444TOARGBROW_AVX2
-#define HAS_INTERPOLATEROW_AVX2
-#define HAS_J422TOARGBROW_AVX2
-#define HAS_MERGEUVROW_AVX2
-#define HAS_MIRRORROW_AVX2
-#define HAS_NV12TOARGBROW_AVX2
-#define HAS_NV12TORGB565ROW_AVX2
-#define HAS_NV21TOARGBROW_AVX2
-#define HAS_SPLITUVROW_AVX2
-#define HAS_UYVYTOARGBROW_AVX2
-#define HAS_UYVYTOUV422ROW_AVX2
-#define HAS_UYVYTOUVROW_AVX2
-#define HAS_UYVYTOYROW_AVX2
-#define HAS_YUY2TOARGBROW_AVX2
-#define HAS_YUY2TOUV422ROW_AVX2
-#define HAS_YUY2TOUVROW_AVX2
-#define HAS_YUY2TOYROW_AVX2
-
-// Effects:
-#define HAS_ARGBADDROW_AVX2
-#define HAS_ARGBATTENUATEROW_AVX2
-#define HAS_ARGBMULTIPLYROW_AVX2
-#define HAS_ARGBSUBTRACTROW_AVX2
-#define HAS_ARGBUNATTENUATEROW_AVX2
-#define HAS_BLENDPLANEROW_AVX2
-#endif
-
-// The following are available for AVX2 Visual C and clangcl 32 bit:
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
-    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_ARGB1555TOARGBROW_AVX2
-#define HAS_ARGB4444TOARGBROW_AVX2
-#define HAS_ARGBTOARGB1555ROW_AVX2
-#define HAS_ARGBTOARGB4444ROW_AVX2
-#define HAS_ARGBTORGB565ROW_AVX2
-#define HAS_J400TOARGBROW_AVX2
-#define HAS_RGB565TOARGBROW_AVX2
-#endif
-
-// The following are also available on x64 Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
-    (!defined(__clang__) || defined(__SSSE3__))
-#define HAS_I422ALPHATOARGBROW_SSSE3
-#define HAS_I422TOARGBROW_SSSE3
-#endif
-
-// The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_ABGRTOUVROW_NEON
-#define HAS_ABGRTOYROW_NEON
-#define HAS_ARGB1555TOARGBROW_NEON
-#define HAS_ARGB1555TOUVROW_NEON
-#define HAS_ARGB1555TOYROW_NEON
-#define HAS_ARGB4444TOARGBROW_NEON
-#define HAS_ARGB4444TOUVROW_NEON
-#define HAS_ARGB4444TOYROW_NEON
-#define HAS_ARGBSETROW_NEON
-#define HAS_ARGBTOARGB1555ROW_NEON
-#define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTORAWROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORGB565DITHERROW_NEON
-#define HAS_ARGBTORGB565ROW_NEON
-#define HAS_ARGBTOUV411ROW_NEON
-#define HAS_ARGBTOUV444ROW_NEON
-#define HAS_ARGBTOUVJROW_NEON
-#define HAS_ARGBTOUVROW_NEON
-#define HAS_ARGBTOYJROW_NEON
-#define HAS_ARGBTOYROW_NEON
-#define HAS_BGRATOUVROW_NEON
-#define HAS_BGRATOYROW_NEON
-#define HAS_COPYROW_NEON
-#define HAS_I400TOARGBROW_NEON
-#define HAS_I411TOARGBROW_NEON
-#define HAS_I422ALPHATOARGBROW_NEON
-#define HAS_I422TOARGB1555ROW_NEON
-#define HAS_I422TOARGB4444ROW_NEON
-#define HAS_I422TOARGBROW_NEON
-#define HAS_I422TORGB24ROW_NEON
-#define HAS_I422TORGB565ROW_NEON
-#define HAS_I422TORGBAROW_NEON
-#define HAS_I422TOUYVYROW_NEON
-#define HAS_I422TOYUY2ROW_NEON
-#define HAS_I444TOARGBROW_NEON
-#define HAS_J400TOARGBROW_NEON
-#define HAS_MERGEUVROW_NEON
-#define HAS_MIRRORROW_NEON
-#define HAS_MIRRORUVROW_NEON
-#define HAS_NV12TOARGBROW_NEON
-#define HAS_NV12TORGB565ROW_NEON
-#define HAS_NV21TOARGBROW_NEON
-#define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGB24ROW_NEON
-#define HAS_RAWTOUVROW_NEON
-#define HAS_RAWTOYROW_NEON
-#define HAS_RGB24TOARGBROW_NEON
-#define HAS_RGB24TOUVROW_NEON
-#define HAS_RGB24TOYROW_NEON
-#define HAS_RGB565TOARGBROW_NEON
-#define HAS_RGB565TOUVROW_NEON
-#define HAS_RGB565TOYROW_NEON
-#define HAS_RGBATOUVROW_NEON
-#define HAS_RGBATOYROW_NEON
-#define HAS_SETROW_NEON
-#define HAS_SPLITUVROW_NEON
-#define HAS_UYVYTOARGBROW_NEON
-#define HAS_UYVYTOUV422ROW_NEON
-#define HAS_UYVYTOUVROW_NEON
-#define HAS_UYVYTOYROW_NEON
-#define HAS_YUY2TOARGBROW_NEON
-#define HAS_YUY2TOUV422ROW_NEON
-#define HAS_YUY2TOUVROW_NEON
-#define HAS_YUY2TOYROW_NEON
-
-// Effects:
-#define HAS_ARGBADDROW_NEON
-#define HAS_ARGBATTENUATEROW_NEON
-#define HAS_ARGBBLENDROW_NEON
-#define HAS_ARGBCOLORMATRIXROW_NEON
-#define HAS_ARGBGRAYROW_NEON
-#define HAS_ARGBMIRRORROW_NEON
-#define HAS_ARGBMULTIPLYROW_NEON
-#define HAS_ARGBQUANTIZEROW_NEON
-#define HAS_ARGBSEPIAROW_NEON
-#define HAS_ARGBSHADEROW_NEON
-#define HAS_ARGBSHUFFLEROW_NEON
-#define HAS_ARGBSUBTRACTROW_NEON
-#define HAS_INTERPOLATEROW_NEON
-#define HAS_SOBELROW_NEON
-#define HAS_SOBELTOPLANEROW_NEON
-#define HAS_SOBELXROW_NEON
-#define HAS_SOBELXYROW_NEON
-#define HAS_SOBELYROW_NEON
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-#define HAS_COPYROW_MIPS
-#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_I422TOARGBROW_DSPR2
-#define HAS_INTERPOLATEROW_DSPR2
-#define HAS_MIRRORROW_DSPR2
-#define HAS_MIRRORUVROW_DSPR2
-#define HAS_SPLITUVROW_DSPR2
-#endif
-#endif
-
-#if defined(_MSC_VER) && !defined(__CLR_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define SIMD_ALIGNED32(var) __declspec(align(64)) var
-typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) int32 vec32[4];
-typedef __declspec(align(16)) int8 vec8[16];
-typedef __declspec(align(16)) uint16 uvec16[8];
-typedef __declspec(align(16)) uint32 uvec32[4];
-typedef __declspec(align(16)) uint8 uvec8[16];
-typedef __declspec(align(32)) int16 lvec16[16];
-typedef __declspec(align(32)) int32 lvec32[8];
-typedef __declspec(align(32)) int8 lvec8[32];
-typedef __declspec(align(32)) uint16 ulvec16[16];
-typedef __declspec(align(32)) uint32 ulvec32[8];
-typedef __declspec(align(32)) uint8 ulvec8[32];
-#elif defined(__GNUC__) && !defined(__pnacl__)
-// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
-typedef int16 __attribute__((vector_size(16))) vec16;
-typedef int32 __attribute__((vector_size(16))) vec32;
-typedef int8 __attribute__((vector_size(16))) vec8;
-typedef uint16 __attribute__((vector_size(16))) uvec16;
-typedef uint32 __attribute__((vector_size(16))) uvec32;
-typedef uint8 __attribute__((vector_size(16))) uvec8;
-typedef int16 __attribute__((vector_size(32))) lvec16;
-typedef int32 __attribute__((vector_size(32))) lvec32;
-typedef int8 __attribute__((vector_size(32))) lvec8;
-typedef uint16 __attribute__((vector_size(32))) ulvec16;
-typedef uint32 __attribute__((vector_size(32))) ulvec32;
-typedef uint8 __attribute__((vector_size(32))) ulvec8;
-#else
-#define SIMD_ALIGNED(var) var
-#define SIMD_ALIGNED32(var) var
-typedef int16 vec16[8];
-typedef int32 vec32[4];
-typedef int8 vec8[16];
-typedef uint16 uvec16[8];
-typedef uint32 uvec32[4];
-typedef uint8 uvec8[16];
-typedef int16 lvec16[16];
-typedef int32 lvec32[8];
-typedef int8 lvec8[32];
-typedef uint16 ulvec16[16];
-typedef uint32 ulvec32[8];
-typedef uint8 ulvec8[32];
-#endif
-
-#if defined(__aarch64__)
-// This struct is for Arm64 color conversion.
-struct YuvConstants {
-  uvec16 kUVToRB;
-  uvec16 kUVToRB2;
-  uvec16 kUVToG;
-  uvec16 kUVToG2;
-  vec16 kUVBiasBGR;
-  vec32 kYToRgb;
-};
-#elif defined(__arm__)
-// This struct is for ArmV7 color conversion.
-struct YuvConstants {
-  uvec8 kUVToRB;
-  uvec8 kUVToG;
-  vec16 kUVBiasBGR;
-  vec32 kYToRgb;
-};
-#else
-// This struct is for Intel color conversion.
-struct YuvConstants {
-  lvec8 kUVToB;
-  lvec8 kUVToG;
-  lvec8 kUVToR;
-  lvec16 kUVBiasB;
-  lvec16 kUVBiasG;
-  lvec16 kUVBiasR;
-  lvec16 kYToRgb;
-};
-
-// Offsets into YuvConstants structure
-#define KUVTOB   0
-#define KUVTOG   32
-#define KUVTOR   64
-#define KUVBIASB 96
-#define KUVBIASG 128
-#define KUVBIASR 160
-#define KYTORGB  192
-#endif
-
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants kYuvI601Constants;  // BT.601
-extern const struct YuvConstants kYuvJPEGConstants;  // JPeg color space
-extern const struct YuvConstants kYuvH709Constants;  // BT.709
-
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants kYvuI601Constants;  // BT.601
-extern const struct YuvConstants kYvuJPEGConstants;  // JPeg color space
-extern const struct YuvConstants kYvuH709Constants;  // BT.709
-
-#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
-#define OMITFP
-#else
-#define OMITFP __attribute__((optimize("omit-frame-pointer")))
-#endif
-
-// NaCL macros for GCC x86 and x64.
-#if defined(__native_client__)
-#define LABELALIGN ".p2align 5\n"
-#else
-#define LABELALIGN
-#endif
-#if defined(__native_client__) && defined(__x86_64__)
-// r14 is used for MEMOP macros.
-#define NACL_R14 "r14",
-#define BUNDLELOCK ".bundle_lock\n"
-#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
-#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
-#define MEMLEA(offset, base) #offset "(%q" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%q" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%q" #base ",%q" #index "," #scale ")"
-#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
-#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg "\n" \
-    BUNDLEUNLOCK
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%" #arg "\n" \
-    BUNDLEUNLOCK
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
-    BUNDLEUNLOCK
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#else  // defined(__native_client__) && defined(__x86_64__)
-#define NACL_R14
-#define BUNDLEALIGN
-#define MEMACCESS(base) "(%" #base ")"
-#define MEMACCESS2(offset, base) #offset "(%" #base ")"
-#define MEMLEA(offset, base) #offset "(%" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%" #base ",%" #index "," #scale ")"
-#define MEMMOVESTRING(s, d)
-#define MEMSTORESTRING(reg, d)
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
-    #reg2 "\n"
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#endif  // defined(__native_client__) && defined(__x86_64__)
-
-#if defined(__arm__) || defined(__aarch64__)
-#undef MEMACCESS
-#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
-#else
-#define MEMACCESS(base)
-#endif
-#endif
-
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422AlphaToARGBRow_NEON(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width);
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width);
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width);
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
-                             int width);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
-                             int width);
-
-void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                            uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int width);
-void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int width);
-void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
-                          uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
-                          uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
-                         uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                            uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
-                              int src_stride_argb1555,
-                              uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
-                              int src_stride_argb4444,
-                              uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
-                    uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
-                  uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
-                              uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
-void MirrorRow_C(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-
-void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                       int width);
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width);
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                       int width);
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width);
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width);
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width);
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width);
-void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int width);
-void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int width);
-void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int width);
-void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                          int width);
-
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                  int width);
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width);
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width);
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width);
-void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                         int width);
-void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                         int width);
-void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                         int width);
-
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_NEON(const uint8* src, uint8* dst, int count);
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
-void CopyRow_C(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
-
-void CopyRow_16_C(const uint16* src, uint16* dst, int count);
-
-void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-
-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
-                                  int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
-                                  int width);
-
-void SetRow_C(uint8* dst, uint8 v8, int count);
-void SetRow_X86(uint8* dst, uint8 v8, int count);
-void SetRow_ERMS(uint8* dst, uint8 v8, int count);
-void SetRow_NEON(uint8* dst, uint8 v8, int count);
-void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
-void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
-
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
-
-// ARGBShufflers for BGRAToARGB etc.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width);
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width);
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
-
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
-                              int width);
-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-
-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                              int width);
-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                                int width);
-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                                int width);
-void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                              int width);
-void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                                int width);
-void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                                int width);
-
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
-                             int width);
-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
-                              int width);
-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                                int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                                int width);
-
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
-
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
-
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
-
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422AlphaToARGBRow_C(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          const uint8* a_buf,
-                          uint8* dst_argb,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_argb,
-                       const struct YuvConstants* yuvconstants,
-                       int width);
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_rgba,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* dst_rgb24,
-                      const struct YuvConstants* yuvconstants,
-                      int width);
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
-                       const struct YuvConstants* yuvconstants,
-                       int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToRGBARow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I411ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I411ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void NV12ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void NV12ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_argb,
-                           const struct YuvConstants* yuvconstants,
-                           int width);
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_argb,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToRGBARow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgba,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_argb,
-                           const struct YuvConstants* yuvconstants,
-                           int width);
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_argb,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void I422ToRGB24Row_SSSE3(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb24,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGBARow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I444ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  const uint8* a_buf,
-                                  uint8* dst_argb,
-                                  const struct YuvConstants* yuvconstants,
-                                  int width);
-void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 const uint8* a_buf,
-                                 uint8* dst_argb,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width);
-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I411ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_uv,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_vu,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_uv,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width);
-void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_rgba,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width);
-void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
-                                const struct YuvConstants* yuvconstants,
-                                int width);
-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width);
-void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
-                                const struct YuvConstants* yuvconstants,
-                                int width);
-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_rgba,
-                               const struct YuvConstants* yuvconstants,
-                               int width);
-void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_rgba,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
-
-// ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
-                    uint8* dst_argb, int width);
-
-// Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
-                             const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
-                            const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width);
-
-// ARGB multiply images. Same API as Blend, but these require
-// pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-
-// ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
-                  uint8* dst_argb, int width);
-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-
-// ARGB subtract images. Same API as Blend, but these require
-// pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-
-void I444ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 const uint8* src_a,
-                                 uint8* dst_argb,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width);
-void I411ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGBARow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width);
-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width);
-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_yuy2, int width);
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_uyvy, int width);
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
-
-// Effects related row functions.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-
-// Inverse table for unattenuate, shared by C and SSE2.
-extern const uint32 fixed_invtbl8[256];
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                                 int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                                 int width);
-
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-
-void ARGBSepiaRow_C(uint8* dst_argb, int width);
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
-
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width);
-
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
-
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
-
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width);
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
-
-// Used for blur.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width);
-
-void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width);
-
-LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
-
-// Used for I420Scale, ARGBScale, and ARGBInterpolate.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                      ptrdiff_t src_stride_ptr,
-                      int width, int source_y_fraction);
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
-                          int source_y_fraction);
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
-                         int source_y_fraction);
-void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
-                         int source_y_fraction);
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
-                          int source_y_fraction);
-void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
-                             int source_y_fraction);
-void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
-                              int source_y_fraction);
-void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
-                             int source_y_fraction);
-void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
-                              int source_y_fraction);
-
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         ptrdiff_t src_stride_ptr,
-                         int width, int source_y_fraction);
-
-// Sobel images.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width);
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width);
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width);
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width);
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width);
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb, const float* poly,
-                         int width);
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width);
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width);
-
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff);
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale.h
deleted file mode 100755
index 102158d..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
-#define INCLUDE_LIBYUV_SCALE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Supported filtering.
-typedef enum FilterMode {
-  kFilterNone = 0,  // Point sample; Fastest.
-  kFilterLinear = 1,  // Filter horizontally only.
-  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 3  // Highest quality.
-} FilterModeEnum;
-
-// Scale a YUV plane.
-LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
-                enum FilterMode filtering);
-
-LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                   int src_width, int src_height,
-                   uint16* dst, int dst_stride,
-                   int dst_width, int dst_height,
-                   enum FilterMode filtering);
-
-// Scales a YUV 4:2:0 image from the src width and height to the
-// dst width and height.
-// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
-// used. This produces basic (blocky) quality at the fastest speed.
-// If filtering is kFilterBilinear, interpolation is used to produce a better
-// quality image, at the expense of speed.
-// If filtering is kFilterBox, averaging is used to produce ever better
-// quality image, at further expense of speed.
-// Returns 0 if successful.
-
-LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
-              enum FilterMode filtering);
-
-LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
-                 enum FilterMode filtering);
-
-#ifdef __cplusplus
-// Legacy API.  Deprecated.
-LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
-          LIBYUV_BOOL interpolate);
-
-// Legacy API.  Deprecated.
-LIBYUV_API
-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
-                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate);
-
-// For testing, allow disabling of specialized scalers.
-LIBYUV_API
-void SetUseReferenceImpl(LIBYUV_BOOL use);
-#endif  // __cplusplus
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale_argb.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale_argb.h
deleted file mode 100755
index b56cf52..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale_argb.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
-#define INCLUDE_LIBYUV_SCALE_ARGB_H_
-
-#include "libyuv/basic_types.h"
-#include "libyuv/scale.h"  // For FilterMode
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
-              enum FilterMode filtering);
-
-// Clipped scale takes destination rectangle coordinates for clip values.
-LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
-                  enum FilterMode filtering);
-
-// Scale with YUV conversion to ARGB and clipping.
-LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
-                       enum FilterMode filtering);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale_row.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale_row.h
deleted file mode 100755
index df699e6..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/scale_row.h
+++ /dev/null
@@ -1,503 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
-#define INCLUDE_LIBYUV_SCALE_ROW_H_
-
-#include "libyuv/basic_types.h"
-#include "libyuv/scale.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-
-// GCC >= 4.7.0 required for AVX2.
-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-#define GCC_HAS_AVX2 1
-#endif  // GNUC >= 4.7
-#endif  // __GNUC__
-
-// clang >= 3.4.0 required for AVX2.
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
-#define CLANG_HAS_AVX2 1
-#endif  // clang >= 3.4
-#endif  // __clang__
-
-// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
-#define VISUALC_HAS_AVX2 1
-#endif  // VisualStudio >= 2012
-
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_FIXEDDIV1_X86
-#define HAS_FIXEDDIV_X86
-#define HAS_SCALEARGBCOLS_SSE2
-#define HAS_SCALEARGBCOLSUP2_SSE2
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-#define HAS_SCALEARGBROWDOWN2_SSE2
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-#define HAS_SCALECOLSUP2_SSE2
-#define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALEROWDOWN2_SSSE3
-#define HAS_SCALEROWDOWN34_SSSE3
-#define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEROWDOWN4_SSSE3
-#define HAS_SCALEADDROW_SSE2
-#endif
-
-// The following are available on all x86 platforms, but
-// require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-#define HAS_SCALEADDROW_AVX2
-#define HAS_SCALEROWDOWN2_AVX2
-#define HAS_SCALEROWDOWN4_AVX2
-#endif
-
-// The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEARGBCOLS_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-#define HAS_SCALEFILTERCOLS_NEON
-#define HAS_SCALEROWDOWN2_NEON
-#define HAS_SCALEROWDOWN34_NEON
-#define HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEARGBFILTERCOLS_NEON
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_DSPR2
-#define HAS_SCALEROWDOWN4_DSPR2
-#define HAS_SCALEROWDOWN34_DSPR2
-#define HAS_SCALEROWDOWN38_DSPR2
-#endif
-
-// Scale ARGB vertically with bilinear interpolation.
-void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering);
-
-void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering);
-
-// Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  enum FilterMode filtering);
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_C(int num, int div);
-int FixedDiv_X86(int num, int div);
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-int FixedDiv1_C(int num, int div);
-int FixedDiv1_X86(int num, int div);
-#ifdef HAS_FIXEDDIV_X86
-#define FixedDiv FixedDiv_X86
-#define FixedDiv1 FixedDiv1_X86
-#else
-#define FixedDiv FixedDiv_C
-#define FixedDiv1 FixedDiv1_C
-#endif
-
-// Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
-                enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy);
-
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width);
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx);
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx);
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int, int);
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int, int);
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                          int dst_width, int x, int dx);
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x, int dx);
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                            int dst_width, int x, int dx);
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
-                            ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
-void ScaleARGBRowDown2_C(const uint8* src_argb,
-                         ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            int src_stepx,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx);
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x, int dx);
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int, int);
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x, int dx);
-
-// Specialized scalers for x86.
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx);
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
-
-
-// ARGB Column functions
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx);
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx);
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                                  int dst_width, int x, int dx);
-void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                            int dst_width, int x, int dx);
-
-// ARGB Row functions
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst, int dst_width);
-
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                   int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
-                                      ptrdiff_t src_stride,
-                                      int src_stepx,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                   int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
-                                      ptrdiff_t src_stride,
-                                      int src_stepx,
-                                      uint8* dst_argb, int dst_width);
-
-// ScaleRowDown2Box also used by planar functions
-// NEON downscalers with interpolation.
-
-// Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-//  to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
-                         ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
-                         ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-// 32x3 -> 12x1
-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst, int dst_width);
-void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-// 32 -> 12
-void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-// 32x3 -> 12x1
-void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx);
-
-void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int x, int dx);
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width);
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width);
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width);
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/version.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/version.h
deleted file mode 100755
index a7a3cf5..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/version.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
-#define INCLUDE_LIBYUV_VERSION_H_
-
-#define LIBYUV_VERSION 1580
-
-#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/video_common.h b/android/src/main/libenc/jni/libyuv/jni/include/libyuv/video_common.h
deleted file mode 100755
index ad934e4..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/include/libyuv/video_common.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-// Common definitions for video, including fourcc and VideoFormat.
-
-#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
-#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-// Definition of FourCC codes
-//////////////////////////////////////////////////////////////////////////////
-
-// Convert four characters to a FourCC code.
-// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
-// constants are used in a switch.
-#ifdef __cplusplus
-#define FOURCC(a, b, c, d) ( \
-    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
-    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
-#else
-#define FOURCC(a, b, c, d) ( \
-    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
-    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
-#endif
-
-// Some pages discussing FourCC codes:
-//   http://www.fourcc.org/yuv.php
-//   http://v4l2spec.bytesex.org/spec/book1.htm
-//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
-//   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
-//   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
-
-// FourCC codes grouped according to implementation efficiency.
-// Primary formats should convert in 1 efficient step.
-// Secondary formats are converted in 2 steps.
-// Auxilliary formats call primary converters.
-enum FourCC {
-  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
-  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
-  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
-  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
-  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
-  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
-  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
-  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
-  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
-  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
-
-  // 2 Secondary YUV formats: row biplanar.
-  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
-
-  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
-  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
-  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
-  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
-  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
-  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
-  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
-  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
-  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
-  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
-
-  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
-  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
-  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
-  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
-  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
-  // 1 Primary Compressed YUV format.
-  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
-
-  // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
-  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
-  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
-  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
-  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
-  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
-  FOURCC_J400 = FOURCC('J', '4', '0', '0'),  // unofficial fourcc
-  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // unofficial fourcc
-
-  // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
-  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
-  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
-  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
-  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
-  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
-  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
-  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
-  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
-  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
-  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
-  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
-  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
-  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
-  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
-  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
-  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
-  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
-
-  // 1 Auxiliary compressed YUV format set aside for capturer.
-  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
-
-  // Match any fourcc.
-  FOURCC_ANY = -1,
-};
-
-enum FourCCBpp {
-  // Canonical fourcc codes used in our code.
-  FOURCC_BPP_I420 = 12,
-  FOURCC_BPP_I422 = 16,
-  FOURCC_BPP_I444 = 24,
-  FOURCC_BPP_I411 = 12,
-  FOURCC_BPP_I400 = 8,
-  FOURCC_BPP_NV21 = 12,
-  FOURCC_BPP_NV12 = 12,
-  FOURCC_BPP_YUY2 = 16,
-  FOURCC_BPP_UYVY = 16,
-  FOURCC_BPP_M420 = 12,
-  FOURCC_BPP_Q420 = 12,
-  FOURCC_BPP_ARGB = 32,
-  FOURCC_BPP_BGRA = 32,
-  FOURCC_BPP_ABGR = 32,
-  FOURCC_BPP_RGBA = 32,
-  FOURCC_BPP_24BG = 24,
-  FOURCC_BPP_RAW  = 24,
-  FOURCC_BPP_RGBP = 16,
-  FOURCC_BPP_RGBO = 16,
-  FOURCC_BPP_R444 = 16,
-  FOURCC_BPP_RGGB = 8,
-  FOURCC_BPP_BGGR = 8,
-  FOURCC_BPP_GRBG = 8,
-  FOURCC_BPP_GBRG = 8,
-  FOURCC_BPP_YV12 = 12,
-  FOURCC_BPP_YV16 = 16,
-  FOURCC_BPP_YV24 = 24,
-  FOURCC_BPP_YU12 = 12,
-  FOURCC_BPP_J420 = 12,
-  FOURCC_BPP_J400 = 8,
-  FOURCC_BPP_H420 = 12,
-  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
-  FOURCC_BPP_H264 = 0,
-  FOURCC_BPP_IYUV = 12,
-  FOURCC_BPP_YU16 = 16,
-  FOURCC_BPP_YU24 = 24,
-  FOURCC_BPP_YUYV = 16,
-  FOURCC_BPP_YUVS = 16,
-  FOURCC_BPP_HDYC = 16,
-  FOURCC_BPP_2VUY = 16,
-  FOURCC_BPP_JPEG = 1,
-  FOURCC_BPP_DMB1 = 1,
-  FOURCC_BPP_BA81 = 8,
-  FOURCC_BPP_RGB3 = 24,
-  FOURCC_BPP_BGR3 = 24,
-  FOURCC_BPP_CM32 = 32,
-  FOURCC_BPP_CM24 = 24,
-
-  // Match any fourcc.
-  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
-};
-
-// Converts fourcc aliases into canonical ones.
-LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/compare.cc b/android/src/main/libenc/jni/libyuv/jni/source/compare.cc
deleted file mode 100755
index e3846bd..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/compare.cc
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/compare.h"
-
-#include <float.h>
-#include <math.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "libyuv/basic_types.h"
-#include "libyuv/compare_row.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// hash seed of 5381 recommended.
-LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
-  const int kBlockSize = 1 << 15;  // 32768;
-  int remainder;
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
-      HashDjb2_C;
-#if defined(HAS_HASHDJB2_SSE41)
-  if (TestCpuFlag(kCpuHasSSE41)) {
-    HashDjb2_SSE = HashDjb2_SSE41;
-  }
-#endif
-#if defined(HAS_HASHDJB2_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    HashDjb2_SSE = HashDjb2_AVX2;
-  }
-#endif
-
-  while (count >= (uint64)(kBlockSize)) {
-    seed = HashDjb2_SSE(src, kBlockSize, seed);
-    src += kBlockSize;
-    count -= kBlockSize;
-  }
-  remainder = (int)(count) & ~15;
-  if (remainder) {
-    seed = HashDjb2_SSE(src, remainder, seed);
-    src += remainder;
-    count -= remainder;
-  }
-  remainder = (int)(count) & 15;
-  if (remainder) {
-    seed = HashDjb2_C(src, remainder, seed);
-  }
-  return seed;
-}
-
-static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
-      return FOURCC_BGRA;
-    }
-    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
-      return FOURCC_ARGB;
-    }
-    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
-      return FOURCC_BGRA;
-    }
-    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
-      return FOURCC_ARGB;
-    }
-    argb += 8;
-  }
-  if (width & 1) {
-    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
-      return FOURCC_BGRA;
-    }
-    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
-      return FOURCC_ARGB;
-    }
-  }
-  return 0;
-}
-
-// Scan an opaque argb image and return fourcc based on alpha offset.
-// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
-LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
-  uint32 fourcc = 0;
-  int h;
-
-  // Coalesce rows.
-  if (stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    stride_argb = 0;
-  }
-  for (h = 0; h < height && fourcc == 0; ++h) {
-    fourcc = ARGBDetectRow_C(argb, width);
-    argb += stride_argb;
-  }
-  return fourcc;
-}
-
-// TODO(fbarchard): Refactor into row function.
-LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
-                             int count) {
-  // SumSquareError returns values 0 to 65535 for each squared difference.
-  // Up to 65536 of those can be summed and remain within a uint32.
-  // After each block of 65536 pixels, accumulate into a uint64.
-  const int kBlockSize = 65536;
-  int remainder = count & (kBlockSize - 1) & ~31;
-  uint64 sse = 0;
-  int i;
-  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
-      SumSquareError_C;
-#if defined(HAS_SUMSQUAREERROR_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SumSquareError = SumSquareError_NEON;
-  }
-#endif
-#if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    // Note only used for multiples of 16 so count is not checked.
-    SumSquareError = SumSquareError_SSE2;
-  }
-#endif
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    // Note only used for multiples of 32 so count is not checked.
-    SumSquareError = SumSquareError_AVX2;
-  }
-#endif
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
-#endif
-  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
-    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
-  }
-  src_a += count & ~(kBlockSize - 1);
-  src_b += count & ~(kBlockSize - 1);
-  if (remainder) {
-    sse += SumSquareError(src_a, src_b, remainder);
-    src_a += remainder;
-    src_b += remainder;
-  }
-  remainder = count & 31;
-  if (remainder) {
-    sse += SumSquareError_C(src_a, src_b, remainder);
-  }
-  return sse;
-}
-
-LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height) {
-  uint64 sse = 0;
-  int h;
-  // Coalesce rows.
-  if (stride_a == width &&
-      stride_b == width) {
-    width *= height;
-    height = 1;
-    stride_a = stride_b = 0;
-  }
-  for (h = 0; h < height; ++h) {
-    sse += ComputeSumSquareError(src_a, src_b, width);
-    src_a += stride_a;
-    src_b += stride_b;
-  }
-  return sse;
-}
-
-LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
-  double psnr;
-  if (sse > 0) {
-    double mse = (double)(count) / (double)(sse);
-    psnr = 10.0 * log10(255.0 * 255.0 * mse);
-  } else {
-    psnr = kMaxPsnr;      // Limit to prevent divide by 0
-  }
-
-  if (psnr > kMaxPsnr)
-    psnr = kMaxPsnr;
-
-  return psnr;
-}
-
-LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
-  const uint64 samples = width * height;
-  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
-                                                src_b, stride_b,
-                                                width, height);
-  return SumSquareErrorToPsnr(sse, samples);
-}
-
-LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
-                                                  src_y_b, stride_y_b,
-                                                  width, height);
-  const int width_uv = (width + 1) >> 1;
-  const int height_uv = (height + 1) >> 1;
-  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
-                                                  src_u_b, stride_u_b,
-                                                  width_uv, height_uv);
-  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
-                                                  src_v_b, stride_v_b,
-                                                  width_uv, height_uv);
-  const uint64 samples = width * height + 2 * (width_uv * height_uv);
-  const uint64 sse = sse_y + sse_u + sse_v;
-  return SumSquareErrorToPsnr(sse, samples);
-}
-
-static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
-static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
-
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
-                        const uint8* src_b, int stride_b) {
-  int64 sum_a = 0;
-  int64 sum_b = 0;
-  int64 sum_sq_a = 0;
-  int64 sum_sq_b = 0;
-  int64 sum_axb = 0;
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    int j;
-    for (j = 0; j < 8; ++j) {
-      sum_a += src_a[j];
-      sum_b += src_b[j];
-      sum_sq_a += src_a[j] * src_a[j];
-      sum_sq_b += src_b[j] * src_b[j];
-      sum_axb += src_a[j] * src_b[j];
-    }
-
-    src_a += stride_a;
-    src_b += stride_b;
-  }
-
-  {
-    const int64 count = 64;
-    // scale the constants by number of pixels
-    const int64 c1 = (cc1 * count * count) >> 12;
-    const int64 c2 = (cc2 * count * count) >> 12;
-
-    const int64 sum_a_x_sum_b = sum_a * sum_b;
-
-    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
-                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
-
-    const int64 sum_a_sq = sum_a*sum_a;
-    const int64 sum_b_sq = sum_b*sum_b;
-
-    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
-                         (count * sum_sq_a - sum_a_sq +
-                          count * sum_sq_b - sum_b_sq + c2);
-
-    if (ssim_d == 0.0) {
-      return DBL_MAX;
-    }
-    return ssim_n * 1.0 / ssim_d;
-  }
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
-  int samples = 0;
-  double ssim_total = 0;
-  double (*Ssim8x8)(const uint8* src_a, int stride_a,
-                    const uint8* src_b, int stride_b) = Ssim8x8_C;
-
-  // sample point start with each 4x4 location
-  int i;
-  for (i = 0; i < height - 8; i += 4) {
-    int j;
-    for (j = 0; j < width - 8; j += 4) {
-      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
-      samples++;
-    }
-
-    src_a += stride_a * 4;
-    src_b += stride_b * 4;
-  }
-
-  ssim_total /= samples;
-  return ssim_total;
-}
-
-LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
-                                      src_y_b, stride_y_b, width, height);
-  const int width_uv = (width + 1) >> 1;
-  const int height_uv = (height + 1) >> 1;
-  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
-                                      src_u_b, stride_u_b,
-                                      width_uv, height_uv);
-  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
-                                      src_v_b, stride_v_b,
-                                      width_uv, height_uv);
-  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/compare_common.cc b/android/src/main/libenc/jni/libyuv/jni/source/compare_common.cc
deleted file mode 100755
index 42fc589..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/compare_common.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse = 0u;
-  int i;
-  for (i = 0; i < count; ++i) {
-    int diff = src_a[i] - src_b[i];
-    sse += (uint32)(diff * diff);
-  }
-  return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
-  int i;
-  for (i = 0; i < count; ++i) {
-    hash += (hash << 5) + src[i];
-  }
-  return hash;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/compare_gcc.cc b/android/src/main/libenc/jni/libyuv/jni/source/compare_gcc.cc
deleted file mode 100755
index 1b83edb..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/compare_gcc.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse;
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10, 1) ",%1          \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psubusb   %%xmm2,%%xmm1                   \n"
-    "psubusb   %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpckhbw %%xmm5,%%xmm2                   \n"
-    "pmaddwd   %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm2,%%xmm2                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-
-    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0,%3                       \n"
-
-  : "+r"(src_a),      // %0
-    "+r"(src_b),      // %1
-    "+r"(count),      // %2
-    "=g"(sse)         // %3
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-  return sse;
-}
-
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
-};
-static uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
-};
-static uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
-};
-static uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
-};
-
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  uint32 hash;
-  asm volatile (
-    "movd      %2,%%xmm0                       \n"
-    "pxor      %%xmm7,%%xmm7                   \n"
-    "movdqa    %4,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "pmulld    %%xmm6,%%xmm0                   \n"
-    "movdqa    %5,%%xmm5                       \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm7,%%xmm3                   \n"
-    "pmulld    %%xmm5,%%xmm3                   \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpckhwd %%xmm7,%%xmm4                   \n"
-    "pmulld    %%xmm5,%%xmm4                   \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "punpckhbw %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm7,%%xmm2                   \n"
-    "pmulld    %%xmm5,%%xmm2                   \n"
-    "movdqa    %8,%%xmm5                       \n"
-    "punpckhwd %%xmm7,%%xmm1                   \n"
-    "pmulld    %%xmm5,%%xmm1                   \n"
-    "paddd     %%xmm4,%%xmm3                   \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm1                   \n"
-    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%1                        \n"
-    "jg        1b                              \n"
-    "movd      %%xmm0,%3                       \n"
-  : "+r"(src),        // %0
-    "+r"(count),      // %1
-    "+rm"(seed),      // %2
-    "=g"(hash)        // %3
-  : "m"(kHash16x33),  // %4
-    "m"(kHashMul0),   // %5
-    "m"(kHashMul1),   // %6
-    "m"(kHashMul2),   // %7
-    "m"(kHashMul3)    // %8
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-  return hash;
-}
-#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/compare_neon.cc b/android/src/main/libenc/jni/libyuv/jni/source/compare_neon.cc
deleted file mode 100755
index 49aa3b4..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/compare_neon.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-    !defined(__aarch64__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "vmov.u8    q8, #0                         \n"
-    "vmov.u8    q10, #0                        \n"
-    "vmov.u8    q9, #0                         \n"
-    "vmov.u8    q11, #0                        \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"
-    "subs       %2, %2, #16                    \n"
-    "vsubl.u8   q2, d0, d2                     \n"
-    "vsubl.u8   q3, d1, d3                     \n"
-    "vmlal.s16  q8, d4, d4                     \n"
-    "vmlal.s16  q9, d6, d6                     \n"
-    "vmlal.s16  q10, d5, d5                    \n"
-    "vmlal.s16  q11, d7, d7                    \n"
-    "bgt        1b                             \n"
-
-    "vadd.u32   q8, q8, q9                     \n"
-    "vadd.u32   q10, q10, q11                  \n"
-    "vadd.u32   q11, q8, q10                   \n"
-    "vpaddl.u32 q1, q11                        \n"
-    "vadd.u64   d0, d2, d3                     \n"
-    "vmov.32    %3, d0[0]                      \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
-  return sse;
-}
-
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/compare_neon64.cc b/android/src/main/libenc/jni/libyuv/jni/source/compare_neon64.cc
deleted file mode 100755
index f9c7df9..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/compare_neon64.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "eor        v16.16b, v16.16b, v16.16b      \n"
-    "eor        v18.16b, v18.16b, v18.16b      \n"
-    "eor        v17.16b, v17.16b, v17.16b      \n"
-    "eor        v19.16b, v19.16b, v19.16b      \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"
-    "subs       %w2, %w2, #16                  \n"
-    "usubl      v2.8h, v0.8b, v1.8b            \n"
-    "usubl2     v3.8h, v0.16b, v1.16b          \n"
-    "smlal      v16.4s, v2.4h, v2.4h           \n"
-    "smlal      v17.4s, v3.4h, v3.4h           \n"
-    "smlal2     v18.4s, v2.8h, v2.8h           \n"
-    "smlal2     v19.4s, v3.8h, v3.8h           \n"
-    "b.gt       1b                             \n"
-
-    "add        v16.4s, v16.4s, v17.4s         \n"
-    "add        v18.4s, v18.4s, v19.4s         \n"
-    "add        v19.4s, v16.4s, v18.4s         \n"
-    "addv       s0, v19.4s                     \n"
-    "fmov       %w3, s0                        \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
-  return sse;
-}
-
-#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/compare_win.cc b/android/src/main/libenc/jni/libyuv/jni/source/compare_win.cc
deleted file mode 100755
index dc86fe2..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/compare_win.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-__declspec(naked)
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
-  __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
-    pxor       xmm0, xmm0
-    pxor       xmm5, xmm5
-
-  wloop:
-    movdqu     xmm1, [eax]
-    lea        eax,  [eax + 16]
-    movdqu     xmm2, [edx]
-    lea        edx,  [edx + 16]
-    movdqa     xmm3, xmm1  // abs trick
-    psubusb    xmm1, xmm2
-    psubusb    xmm2, xmm3
-    por        xmm1, xmm2
-    movdqa     xmm2, xmm1
-    punpcklbw  xmm1, xmm5
-    punpckhbw  xmm2, xmm5
-    pmaddwd    xmm1, xmm1
-    pmaddwd    xmm2, xmm2
-    paddd      xmm0, xmm1
-    paddd      xmm0, xmm2
-    sub        ecx, 16
-    jg         wloop
-
-    pshufd     xmm1, xmm0, 0xee
-    paddd      xmm0, xmm1
-    pshufd     xmm1, xmm0, 0x01
-    paddd      xmm0, xmm1
-    movd       eax, xmm0
-    ret
-  }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked)
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
-  __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
-    vpxor      ymm0, ymm0, ymm0  // sum
-    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
-    sub        edx, eax
-
-  wloop:
-    vmovdqu    ymm1, [eax]
-    vmovdqu    ymm2, [eax + edx]
-    lea        eax,  [eax + 32]
-    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
-    vpsubusb   ymm2, ymm2, ymm1
-    vpor       ymm1, ymm2, ymm3
-    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
-    vpunpckhbw ymm1, ymm1, ymm5
-    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
-    vpmaddwd   ymm1, ymm1, ymm1
-    vpaddd     ymm0, ymm0, ymm1
-    vpaddd     ymm0, ymm0, ymm2
-    sub        ecx, 32
-    jg         wloop
-
-    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
-    vpaddd     ymm0, ymm0, ymm1
-    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
-    vpaddd     ymm0, ymm0, ymm1
-    vpermq     ymm1, ymm0, 0x02  // high + low lane.
-    vpaddd     ymm0, ymm0, ymm1
-    vmovd      eax, xmm0
-    vzeroupper
-    ret
-  }
-}
-#endif  // _MSC_VER >= 1700
-
-uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
-};
-uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
-};
-uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
-};
-uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
-};
-
-__declspec(naked)
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
-    movd       xmm0, [esp + 12]  // seed
-
-    pxor       xmm7, xmm7        // constant 0 for unpck
-    movdqa     xmm6, xmmword ptr kHash16x33
-
-  wloop:
-    movdqu     xmm1, [eax]       // src[0-15]
-    lea        eax, [eax + 16]
-    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
-    movdqa     xmm5, xmmword ptr kHashMul0
-    movdqa     xmm2, xmm1
-    punpcklbw  xmm2, xmm7        // src[0-7]
-    movdqa     xmm3, xmm2
-    punpcklwd  xmm3, xmm7        // src[0-3]
-    pmulld     xmm3, xmm5
-    movdqa     xmm5, xmmword ptr kHashMul1
-    movdqa     xmm4, xmm2
-    punpckhwd  xmm4, xmm7        // src[4-7]
-    pmulld     xmm4, xmm5
-    movdqa     xmm5, xmmword ptr kHashMul2
-    punpckhbw  xmm1, xmm7        // src[8-15]
-    movdqa     xmm2, xmm1
-    punpcklwd  xmm2, xmm7        // src[8-11]
-    pmulld     xmm2, xmm5
-    movdqa     xmm5, xmmword ptr kHashMul3
-    punpckhwd  xmm1, xmm7        // src[12-15]
-    pmulld     xmm1, xmm5
-    paddd      xmm3, xmm4        // add 16 results
-    paddd      xmm1, xmm2
-    paddd      xmm1, xmm3
-
-    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
-    paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 0x01
-    paddd      xmm1, xmm2
-    paddd      xmm0, xmm1
-    sub        ecx, 16
-    jg         wloop
-
-    movd       eax, xmm0         // return hash
-    ret
-  }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-__declspec(naked)
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
-  __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
-    vmovd      xmm0, [esp + 12]  // seed
-
-  wloop:
-    vpmovzxbd  xmm3, [eax]  // src[0-3]
-    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
-    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
-    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
-    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
-    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
-    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
-    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
-    lea        eax, [eax + 16]
-    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
-    vpaddd     xmm3, xmm3, xmm4        // add 16 results
-    vpaddd     xmm1, xmm1, xmm2
-    vpaddd     xmm1, xmm1, xmm3
-    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
-    vpaddd     xmm1, xmm1,xmm2
-    vpshufd    xmm2, xmm1, 0x01
-    vpaddd     xmm1, xmm1, xmm2
-    vpaddd     xmm0, xmm0, xmm1
-    sub        ecx, 16
-    jg         wloop
-
-    vmovd      eax, xmm0         // return hash
-    vzeroupper
-    ret
-  }
-}
-#endif  // _MSC_VER >= 1700
-
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert.cc
deleted file mode 100755
index e332bc5..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert.cc
+++ /dev/null
@@ -1,1389 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/scale.h"  // For ScalePlane()
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-static __inline int Abs(int v) {
-  return v >= 0 ? v : -v;
-}
-
-// Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int src_uv_width, int src_uv_height) {
-  const int dst_y_width = Abs(src_y_width);
-  const int dst_y_height = Abs(src_y_height);
-  const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
-  const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      src_uv_width == 0 || src_uv_height == 0) {
-    return -1;
-  }
-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
-             kFilterBilinear);
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  return 0;
-}
-
-// Copy I420 with optional flipping
-// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
-// is does row coalescing.
-LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-
-  if (dst_y) {
-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-  // Copy UV planes.
-  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
-  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
-  return 0;
-}
-
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 1, 1);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
-}
-
-// 444 chroma is 1x width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    width, height);
-}
-
-// 411 chroma is 1/4 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 3, 2);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
-}
-
-// I400 is greyscale typically used in MJPG
-LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
-  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
-  return 0;
-}
-
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride,
-                       int width, int height) {
-  int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
-  }
-#endif
-#if defined(HAS_COPYROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
-
-  // Copy plane
-  for (y = 0; y < height - 1; y += 2) {
-    CopyRow(src, dst, width);
-    CopyRow(src + src_stride_0, dst + dst_stride, width);
-    src += src_stride_0 + src_stride_1;
-    dst += dst_stride * 2;
-  }
-  if (height & 1) {
-    CopyRow(src, dst, width);
-  }
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
-//   this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
-                      int src_stride_y0, int src_stride_y1,
-                      const uint8* src_uv, int src_stride_uv,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int width, int height) {
-  int y;
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) = SplitUVRow_C;
-  if (!src_y || !src_uv ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  // Coalesce rows.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
-  }
-  // Coalesce rows.
-  if (src_stride_uv == halfwidth * 2 &&
-      dst_stride_u == halfwidth &&
-      dst_stride_v == halfwidth) {
-    halfwidth *= halfheight;
-    halfheight = 1;
-    src_stride_uv = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_SPLITUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SplitUVRow = SplitUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      SplitUVRow = SplitUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
-      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-    SplitUVRow = SplitUVRow_Any_DSPR2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_DSPR2;
-    }
-  }
-#endif
-
-  if (dst_y) {
-    if (src_stride_y0 == src_stride_y1) {
-      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
-    } else {
-      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
-                 width, height);
-    }
-  }
-
-  for (y = 0; y < halfheight; ++y) {
-    // Copy a row of UV.
-    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-    src_uv += src_stride_uv;
-  }
-  return 0;
-}
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_uv, src_stride_uv,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height);
-}
-
-// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
-LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_vu, src_stride_vu,
-                    dst_y, dst_stride_y,
-                    dst_v, dst_stride_v,
-                    dst_u, dst_stride_u,
-                    width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height);
-}
-
-// Convert YUY2 to I420.
-LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-      uint8* dst_y, int width) = YUY2ToYRow_C;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
-    src_stride_yuy2 = -src_stride_yuy2;
-  }
-#if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
-    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUVRow = YUY2ToUVRow_SSE2;
-      YUY2ToYRow = YUY2ToYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
-    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      YUY2ToUVRow = YUY2ToUVRow_AVX2;
-      YUY2ToYRow = YUY2ToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToYRow = YUY2ToYRow_NEON;
-      YUY2ToUVRow = YUY2ToUVRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
-    YUY2ToYRow(src_yuy2, dst_y, width);
-    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
-    src_yuy2 += src_stride_yuy2 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
-    YUY2ToYRow(src_yuy2, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert UYVY to I420.
-LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-      uint8* dst_y, int width) = UYVYToYRow_C;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
-    src_stride_uyvy = -src_stride_uyvy;
-  }
-#if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
-    UYVYToYRow = UYVYToYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToUVRow = UYVYToUVRow_SSE2;
-      UYVYToYRow = UYVYToYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
-    UYVYToYRow = UYVYToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      UYVYToUVRow = UYVYToUVRow_AVX2;
-      UYVYToYRow = UYVYToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    UYVYToYRow = UYVYToYRow_Any_NEON;
-    UYVYToUVRow = UYVYToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToYRow = UYVYToYRow_NEON;
-      UYVYToUVRow = UYVYToUVRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
-    UYVYToYRow(src_uyvy, dst_y, width);
-    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
-    src_uyvy += src_stride_uyvy * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
-    UYVYToYRow(src_uyvy, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert ARGB to I420.
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert BGRA to I420.
-LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
-      BGRAToYRow_C;
-  if (!src_bgra ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
-    src_stride_bgra = -src_stride_bgra;
-  }
-#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
-    BGRAToYRow = BGRAToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      BGRAToUVRow = BGRAToUVRow_SSSE3;
-      BGRAToYRow = BGRAToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_BGRATOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    BGRAToYRow = BGRAToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      BGRAToYRow = BGRAToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_BGRATOUVROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      BGRAToUVRow = BGRAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        BGRAToUVRow = BGRAToUVRow_NEON;
-      }
-    }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
-    BGRAToYRow(src_bgra, dst_y, width);
-    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
-    src_bgra += src_stride_bgra * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
-    BGRAToYRow(src_bgra, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert ABGR to I420.
-LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
-      ABGRToYRow_C;
-  if (!src_abgr ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
-    src_stride_abgr = -src_stride_abgr;
-  }
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
-    ABGRToYRow = ABGRToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ABGRToUVRow = ABGRToUVRow_SSSE3;
-      ABGRToYRow = ABGRToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ABGRTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ABGRToYRow = ABGRToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ABGRToYRow = ABGRToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ABGRTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ABGRToUVRow = ABGRToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ABGRToUVRow = ABGRToUVRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
-    ABGRToYRow(src_abgr, dst_y, width);
-    ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
-    src_abgr += src_stride_abgr * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
-    ABGRToYRow(src_abgr, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert RGBA to I420.
-LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
-      RGBAToYRow_C;
-  if (!src_rgba ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
-    src_stride_rgba = -src_stride_rgba;
-  }
-#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
-    RGBAToYRow = RGBAToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RGBAToUVRow = RGBAToUVRow_SSSE3;
-      RGBAToYRow = RGBAToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_RGBATOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGBAToYRow = RGBAToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGBAToYRow = RGBAToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RGBATOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGBAToUVRow = RGBAToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGBAToUVRow = RGBAToUVRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
-    RGBAToYRow(src_rgba, dst_y, width);
-    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
-    src_rgba += src_stride_rgba * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
-    RGBAToYRow(src_rgba, dst_y, width);
-  }
-  return 0;
-}
-
-// Convert RGB24 to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height) {
-  int y;
-#if defined(HAS_RGB24TOYROW_NEON)
-  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
-      RGB24ToYRow_C;
-#else
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RGB24ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-#endif
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
-  }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
-    RGB24ToYRow = RGB24ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB24ToYRow = RGB24ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RGB24ToUVRow = RGB24ToUVRow_NEON;
-      }
-    }
-  }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-#endif
-
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
-      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-      RGB24ToYRow(src_rgb24, dst_y, width);
-      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
-      RGB24ToARGBRow(src_rgb24, row, width);
-      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-      src_rgb24 += src_stride_rgb24 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
-      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
-      RGB24ToYRow(src_rgb24, dst_y, width);
-#else
-      RGB24ToARGBRow(src_rgb24, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RGB24TOYROW_NEON)
-    free_aligned_buffer_64(row);
-  }
-#endif
-  return 0;
-}
-
-// Convert RAW to I420.
-LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
-  int y;
-#if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
-      RAWToYRow_C;
-#else
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RAWToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-#endif
-  if (!src_raw || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-
-// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RAWToUVRow = RAWToUVRow_Any_NEON;
-    RAWToYRow = RAWToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RAWToYRow = RAWToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RAWToUVRow = RAWToUVRow_NEON;
-      }
-    }
-  }
-// Other platforms do intermediate conversion from RAW to ARGB.
-#else
-#if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RAWToARGBRow = RAWToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-#endif
-
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
-      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-      RAWToYRow(src_raw, dst_y, width);
-      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
-      RAWToARGBRow(src_raw, row, width);
-      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-      src_raw += src_stride_raw * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
-      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
-      RAWToYRow(src_raw, dst_y, width);
-#else
-      RAWToARGBRow(src_raw, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RAWTOYROW_NEON)
-    free_aligned_buffer_64(row);
-  }
-#endif
-  return 0;
-}
-
-// Convert RGB565 to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
-  int y;
-#if defined(HAS_RGB565TOYROW_NEON)
-  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
-      RGB565ToYRow_C;
-#else
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RGB565ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-#endif
-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
-    src_stride_rgb565 = -src_stride_rgb565;
-  }
-
-// Neon version does direct RGB565 to YUV.
-#if defined(HAS_RGB565TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
-    RGB565ToYRow = RGB565ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToYRow = RGB565ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RGB565ToUVRow = RGB565ToUVRow_NEON;
-      }
-    }
-  }
-// Other platforms do intermediate conversion from RGB565 to ARGB.
-#else
-#if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-#endif
-
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
-      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
-      RGB565ToYRow(src_rgb565, dst_y, width);
-      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
-      RGB565ToARGBRow(src_rgb565, row, width);
-      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-      src_rgb565 += src_stride_rgb565 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
-      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
-      RGB565ToYRow(src_rgb565, dst_y, width);
-#else
-      RGB565ToARGBRow(src_rgb565, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RGB565TOYROW_NEON)
-    free_aligned_buffer_64(row);
-  }
-#endif
-  return 0;
-}
-
-// Convert ARGB1555 to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
-  int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
-  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
-      ARGB1555ToYRow_C;
-#else
-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      ARGB1555ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-#endif
-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
-    src_stride_argb1555 = -src_stride_argb1555;
-  }
-
-// Neon version does direct ARGB1555 to YUV.
-#if defined(HAS_ARGB1555TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
-    ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB1555ToYRow = ARGB1555ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
-      }
-    }
-  }
-// Other platforms do intermediate conversion from ARGB1555 to ARGB.
-#else
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-#endif
-
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
-      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
-      ARGB1555ToYRow(src_argb1555, dst_y, width);
-      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
-                     width);
-#else
-      ARGB1555ToARGBRow(src_argb1555, row, width);
-      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
-                        width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-      src_argb1555 += src_stride_argb1555 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
-      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
-      ARGB1555ToYRow(src_argb1555, dst_y, width);
-#else
-      ARGB1555ToARGBRow(src_argb1555, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
-    free_aligned_buffer_64(row);
-  }
-#endif
-  return 0;
-}
-
-// Convert ARGB4444 to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
-  int y;
-#if defined(HAS_ARGB4444TOYROW_NEON)
-  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
-      ARGB4444ToYRow_C;
-#else
-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      ARGB4444ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-#endif
-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
-    src_stride_argb4444 = -src_stride_argb4444;
-  }
-
-// Neon version does direct ARGB4444 to YUV.
-#if defined(HAS_ARGB4444TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
-    ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB4444ToYRow = ARGB4444ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
-      }
-    }
-  }
-// Other platforms do intermediate conversion from ARGB4444 to ARGB.
-#else
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-#endif
-
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
-      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
-      ARGB4444ToYRow(src_argb4444, dst_y, width);
-      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
-                     width);
-#else
-      ARGB4444ToARGBRow(src_argb4444, row, width);
-      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
-                        width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-      src_argb4444 += src_stride_argb4444 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
-      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
-      ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
-      ARGB4444ToARGBRow(src_argb4444, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_ARGB4444TOYROW_NEON)
-    free_aligned_buffer_64(row);
-  }
-#endif
-  return 0;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert_argb.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert_argb.cc
deleted file mode 100755
index e586f70..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert_argb.cc
+++ /dev/null
@@ -1,1455 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_argb.h"
-
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB with optional flipping
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height) {
-  if (!src_argb || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-
-  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-            width * 4, height);
-  return 0;
-}
-
-// Convert I422 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width, int height) {
-  int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuI601Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
-}
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuJPEGConstants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
-}
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuH709Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width, int height) {
-  int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuI601Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
-}
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuJPEGConstants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
-}
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuH709Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width, int height) {
-  int y;
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I444ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u == width &&
-      src_stride_v == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I444TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I444ToARGBRow = I444ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I444TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I444ToARGBRow = I444ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
-}
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuI601Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
-}
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I411ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 4 == width &&
-      src_stride_v * 4 == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I411TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I411TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I411ToARGBRow = I411ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I411TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I411ToARGBRow = I411ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
-                                 const uint8* src_u, int src_stride_u,
-                                 const uint8* src_v, int src_stride_v,
-                                 const uint8* src_a, int src_stride_a,
-                                 uint8* dst_argb, int dst_stride_argb,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width, int height, int attenuate) {
-  int y;
-  void (*I422AlphaToARGBRow)(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) = I422AlphaToARGBRow_C;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                           int width) = ARGBAttenuateRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
-                       width);
-    if (attenuate) {
-      ARGBAttenuateRow(dst_argb, dst_argb, width);
-    }
-    dst_argb += dst_stride_argb;
-    src_a += src_stride_a;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 with Alpha to ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_u, src_stride_u,
-                               src_v, src_stride_v,
-                               src_a, src_stride_a,
-                               dst_argb, dst_stride_argb,
-                               &kYuvI601Constants,
-                               width, height, attenuate);
-}
-
-// Convert I420 with Alpha to ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_v, src_stride_v,  // Swap U and V
-                               src_u, src_stride_u,
-                               src_a, src_stride_a,
-                               dst_abgr, dst_stride_abgr,
-                               &kYvuI601Constants,  // Use Yvu matrix
-                               width, height, attenuate);
-}
-
-// Convert I400 to ARGB.
-LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*I400ToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = I400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_argb = 0;
-  }
-#if defined(HAS_I400TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      I400ToARGBRow = I400ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I400ToARGBRow = I400ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I400ToARGBRow = I400ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I400TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I400ToARGBRow = I400ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I400ToARGBRow = I400ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I400ToARGBRow(src_y, dst_argb, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-  }
-  return 0;
-}
-
-// Convert J400 to ARGB.
-LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      J400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_argb = 0;
-  }
-#if defined(HAS_J400TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      J400ToARGBRow = J400ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_J400TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    J400ToARGBRow = J400ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      J400ToARGBRow = J400ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_J400TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    J400ToARGBRow = J400ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      J400ToARGBRow = J400ToARGBRow_NEON;
-    }
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    J400ToARGBRow(src_y, dst_argb, width);
-    src_y += src_stride_y;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-// Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-
-// Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-
-// Convert BGRA to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
-}
-
-// Convert ARGB to BGRA (same as BGRAToARGB).
-LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
-}
-
-// Convert ABGR to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
-}
-
-// Convert ARGB to ABGR to (same as ABGRToARGB).
-LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
-}
-
-// Convert RGBA to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_rgba, src_stride_rgba,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskRGBAToARGB),
-                     width, height);
-}
-
-// Convert RGB24 to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
-  int y;
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RGB24ToARGBRow_C;
-  if (!src_rgb24 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
-  }
-  // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_rgb24 = dst_stride_argb = 0;
-  }
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    RGB24ToARGBRow(src_rgb24, dst_argb, width);
-    src_rgb24 += src_stride_rgb24;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert RAW to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  int y;
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RAWToARGBRow_C;
-  if (!src_raw || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-  // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_raw = dst_stride_argb = 0;
-  }
-#if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RAWToARGBRow = RAWToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_RAWTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RAWToARGBRow = RAWToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RAWToARGBRow = RAWToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    RAWToARGBRow(src_raw, dst_argb, width);
-    src_raw += src_stride_raw;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert RGB565 to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
-  int y;
-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
-      RGB565ToARGBRow_C;
-  if (!src_rgb565 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
-    src_stride_rgb565 = -src_stride_rgb565;
-  }
-  // Coalesce rows.
-  if (src_stride_rgb565 == width * 2 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_rgb565 = dst_stride_argb = 0;
-  }
-#if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_RGB565TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    RGB565ToARGBRow(src_rgb565, dst_argb, width);
-    src_rgb565 += src_stride_rgb565;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert ARGB1555 to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
-  int y;
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-      int width) = ARGB1555ToARGBRow_C;
-  if (!src_argb1555 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
-    src_stride_argb1555 = -src_stride_argb1555;
-  }
-  // Coalesce rows.
-  if (src_stride_argb1555 == width * 2 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb1555 = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
-    src_argb1555 += src_stride_argb1555;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert ARGB4444 to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
-  int y;
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-      int width) = ARGB4444ToARGBRow_C;
-  if (!src_argb4444 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
-    src_stride_argb4444 = -src_stride_argb4444;
-  }
-  // Coalesce rows.
-  if (src_stride_argb4444 == width * 2 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb4444 = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
-    src_argb4444 += src_stride_argb4444;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      NV12ToARGBRow = NV12ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_uv += src_stride_uv;
-    }
-  }
-  return 0;
-}
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV21ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-#if defined(HAS_NV21TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_NV21TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      NV21ToARGBRow = NV21ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_NV21TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_uv += src_stride_uv;
-    }
-  }
-  return 0;
-}
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_m420 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      NV12ToARGBRow = NV12ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
-                  &kYuvI601Constants, width);
-    NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
-                  dst_argb + dst_stride_argb, &kYuvI601Constants, width);
-    dst_argb += dst_stride_argb * 2;
-    src_m420 += src_stride_m420 * 3;
-  }
-  if (height & 1) {
-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
-                  &kYuvI601Constants, width);
-  }
-  return 0;
-}
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*YUY2ToARGBRow)(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
-      YUY2ToARGBRow_C;
-  if (!src_yuy2 || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
-    src_stride_yuy2 = -src_stride_yuy2;
-  }
-  // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_yuy2 = dst_stride_argb = 0;
-  }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      YUY2ToARGBRow = YUY2ToARGBRow_NEON;
-    }
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
-    src_yuy2 += src_stride_yuy2;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*UYVYToARGBRow)(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
-      UYVYToARGBRow_C;
-  if (!src_uyvy || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
-    src_stride_uyvy = -src_stride_uyvy;
-  }
-  // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_uyvy = dst_stride_argb = 0;
-  }
-#if defined(HAS_UYVYTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_UYVYTOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      UYVYToARGBRow = UYVYToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_UYVYTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      UYVYToARGBRow = UYVYToARGBRow_NEON;
-    }
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
-    src_uyvy += src_stride_uyvy;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert_from.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert_from.cc
deleted file mode 100755
index 3bc9eb1..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert_from.cc
+++ /dev/null
@@ -1,1167 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_from.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/convert.h"  // For I420Copy
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/scale.h"  // For ScalePlane()
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-static __inline int Abs(int v) {
-  return v >= 0 ? v : -v;
-}
-
-// I420 To any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int dst_uv_width, int dst_uv_height) {
-  const int dst_y_width = Abs(src_y_width);
-  const int dst_y_height = Abs(src_y_height);
-  const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
-  const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      dst_uv_width <= 0 || dst_uv_height <= 0) {
-    return -1;
-  }
-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
-             kFilterBilinear);
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  return 0;
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 422 chroma is 1/2 width, 1x height
-LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = (Abs(width) + 1) >> 1;
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 444 chroma is 1x width, 1x height
-LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = Abs(width);
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = (Abs(width) + 3) >> 2;
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
-// Copy to I400. Source can be I420,422,444,400,NV12,NV21
-LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  return 0;
-}
-
-LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
-  int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
-      I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
-    dst_stride_yuy2 = -dst_stride_yuy2;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_yuy2 == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
-  }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToYUY2Row = I422ToYUY2Row_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToYUY2Row = I422ToYUY2Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_yuy2 += dst_stride_yuy2;
-  }
-  return 0;
-}
-
-LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
-  int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
-      I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
-    dst_stride_yuy2 = -dst_stride_yuy2;
-  }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToYUY2Row = I422ToYUY2Row_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToYUY2Row = I422ToYUY2Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
-    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
-                  dst_yuy2 + dst_stride_yuy2, width);
-    src_y += src_stride_y * 2;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_yuy2 += dst_stride_yuy2 * 2;
-  }
-  if (height & 1) {
-    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
-  }
-  return 0;
-}
-
-LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
-  int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
-      I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
-    dst_stride_uyvy = -dst_stride_uyvy;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_uyvy == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
-  }
-#if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToUYVYRow = I422ToUYVYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToUYVYRow = I422ToUYVYRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_uyvy += dst_stride_uyvy;
-  }
-  return 0;
-}
-
-LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
-  int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
-      I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
-    dst_stride_uyvy = -dst_stride_uyvy;
-  }
-#if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToUYVYRow = I422ToUYVYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToUYVYRow = I422ToUYVYRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
-    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
-                  dst_uyvy + dst_stride_uyvy, width);
-    src_y += src_stride_y * 2;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_uyvy += dst_stride_uyvy * 2;
-  }
-  if (height & 1) {
-    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
-  }
-  return 0;
-}
-
-LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-      int width) = MergeUVRow_C;
-  // Coalesce rows.
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_uv = -dst_stride_uv;
-  }
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_y = 0;
-  }
-  // Coalesce rows.
-  if (src_stride_u == halfwidth &&
-      src_stride_v == halfwidth &&
-      dst_stride_uv == halfwidth * 2) {
-    halfwidth *= halfheight;
-    halfheight = 1;
-    src_stride_u = src_stride_v = dst_stride_uv = 0;
-  }
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow_ = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow_ = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_NEON;
-    }
-  }
-#endif
-
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  for (y = 0; y < halfheight; ++y) {
-    // Merge a row of U and V into a row of UV.
-    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_uv += dst_stride_uv;
-  }
-  return 0;
-}
-
-LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height) {
-  return I420ToNV12(src_y, src_stride_y,
-                    src_v, src_stride_v,
-                    src_u, src_stride_u,
-                    dst_y, dst_stride_y,
-                    dst_vu, dst_stride_vu,
-                    width, height);
-}
-
-// Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
-                            const struct YuvConstants* yuvconstants,
-                            int width, int height) {
-  int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
-    dst_stride_rgba = -dst_stride_rgba;
-  }
-#if defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGBARow = I422ToRGBARow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
-    I422ToRGBARow = I422ToRGBARow_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
-    dst_rgba += dst_stride_rgba;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
-                          &kYvuI601Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
-                             const uint8* src_u, int src_stride_u,
-                             const uint8* src_v, int src_stride_v,
-                             uint8* dst_rgb24, int dst_stride_rgb24,
-                             const struct YuvConstants* yuvconstants,
-                             int width, int height) {
-  int y;
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) = I422ToRGB24Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
-    dst_stride_rgb24 = -dst_stride_rgb24;
-  }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGB24Row = I422ToRGB24Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB24Row = I422ToRGB24Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
-    dst_rgb24 += dst_stride_rgb24;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_u, src_stride_u,
-                           src_v, src_stride_v,
-                           dst_rgb24, dst_stride_rgb24,
-                           &kYuvI601Constants,
-                           width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_v, src_stride_v,  // Swap U and V
-                           src_u, src_stride_u,
-                           dst_raw, dst_stride_raw,
-                           &kYvuI601Constants,  // Use Yvu matrix
-                           width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
-  int y;
-  void (*I422ToARGB1555Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            const struct YuvConstants* yuvconstants,
-                            int width) = I422ToARGB1555Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
-    dst_stride_argb1555 = -dst_stride_argb1555;
-  }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
-                      width);
-    dst_argb1555 += dst_stride_argb1555;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
-  int y;
-  void (*I422ToARGB4444Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            const struct YuvConstants* yuvconstants,
-                            int width) = I422ToARGB4444Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
-    dst_stride_argb4444 = -dst_stride_argb4444;
-  }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
-                      width);
-    dst_argb4444 += dst_stride_argb4444;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*I422ToRGB565Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = I422ToRGB565Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGB565Row = I422ToRGB565Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB565Row = I422ToRGB565Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
-  int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-  if (!dither4x4) {
-    dither4x4 = kDither565_4x4;
-  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
-    }
-  }
-#endif
-  {
-    // Allocate a row of argb.
-    align_buffer_64(row_argb, width * 4);
-    for (y = 0; y < height; ++y) {
-      I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
-      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
-                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
-      dst_rgb565 += dst_stride_rgb565;
-      src_y += src_stride_y;
-      if (y & 1) {
-        src_u += src_stride_u;
-        src_v += src_stride_v;
-      }
-    }
-    free_aligned_buffer_64(row_argb);
-  }
-  return 0;
-}
-
-// Convert I420 to specified format
-LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
-  int r = 0;
-  if (!y || !u|| !v || !dst_sample ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  switch (format) {
-    // Single plane formats
-    case FOURCC_YUY2:
-      r = I420ToYUY2(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
-      break;
-    case FOURCC_UYVY:
-      r = I420ToUYVY(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
-      break;
-    case FOURCC_RGBP:
-      r = I420ToRGB565(y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       dst_sample,
-                       dst_sample_stride ? dst_sample_stride : width * 2,
-                       width, height);
-      break;
-    case FOURCC_RGBO:
-      r = I420ToARGB1555(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
-                         dst_sample_stride ? dst_sample_stride : width * 2,
-                         width, height);
-      break;
-    case FOURCC_R444:
-      r = I420ToARGB4444(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
-                         dst_sample_stride ? dst_sample_stride : width * 2,
-                         width, height);
-      break;
-    case FOURCC_24BG:
-      r = I420ToRGB24(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width * 3,
-                      width, height);
-      break;
-    case FOURCC_RAW:
-      r = I420ToRAW(y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_sample,
-                    dst_sample_stride ? dst_sample_stride : width * 3,
-                    width, height);
-      break;
-    case FOURCC_ARGB:
-      r = I420ToARGB(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
-      break;
-    case FOURCC_BGRA:
-      r = I420ToBGRA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
-      break;
-    case FOURCC_ABGR:
-      r = I420ToABGR(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
-      break;
-    case FOURCC_RGBA:
-      r = I420ToRGBA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
-      break;
-    case FOURCC_I400:
-      r = I400Copy(y, y_stride,
-                   dst_sample,
-                   dst_sample_stride ? dst_sample_stride : width,
-                   width, height);
-      break;
-    case FOURCC_NV12: {
-      uint8* dst_uv = dst_sample + width * height;
-      r = I420ToNV12(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_uv,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
-      break;
-    }
-    case FOURCC_NV21: {
-      uint8* dst_vu = dst_sample + width * height;
-      r = I420ToNV21(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_vu,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
-      break;
-    }
-    // TODO(fbarchard): Add M420.
-    // Triplanar formats
-    // TODO(fbarchard): halfstride instead of halfwidth
-    case FOURCC_I420:
-    case FOURCC_YU12:
-    case FOURCC_YV12: {
-      int halfwidth = (width + 1) / 2;
-      int halfheight = (height + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
-      if (format == FOURCC_YV12) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * halfheight;
-      } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * halfheight;
-      }
-      r = I420Copy(y, y_stride,
-                   u, u_stride,
-                   v, v_stride,
-                   dst_sample, width,
-                   dst_u, halfwidth,
-                   dst_v, halfwidth,
-                   width, height);
-      break;
-    }
-    case FOURCC_I422:
-    case FOURCC_YV16: {
-      int halfwidth = (width + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
-      if (format == FOURCC_YV16) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * height;
-      } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * height;
-      }
-      r = I420ToI422(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, halfwidth,
-                     dst_v, halfwidth,
-                     width, height);
-      break;
-    }
-    case FOURCC_I444:
-    case FOURCC_YV24: {
-      uint8* dst_u;
-      uint8* dst_v;
-      if (format == FOURCC_YV24) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + width * height;
-      } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + width * height;
-      }
-      r = I420ToI444(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, width,
-                     dst_v, width,
-                     width, height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (width + 3) / 4;
-      uint8* dst_u = dst_sample + width * height;
-      uint8* dst_v = dst_u + quarterwidth * height;
-      r = I420ToI411(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, quarterwidth,
-                     dst_v, quarterwidth,
-                     width, height);
-      break;
-    }
-
-    // Formats not supported - MJPG, biplanar, some rgb formats.
-    default:
-      return -1;  // unknown fourcc - return failure code.
-  }
-  return r;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert_from_argb.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert_from_argb.cc
deleted file mode 100755
index 2a8682b..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert_from_argb.cc
+++ /dev/null
@@ -1,1286 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_from_argb.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// ARGB little endian (bgra in memory) to I444
-LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV444Row_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u == width &&
-      dst_stride_v == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOUV444ROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3)) {
-      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUV444Row = ARGBToUV444Row_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-// ARGB little endian (bgra in memory) to I422
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-// ARGB little endian (bgra in memory) to I411
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV411Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 4 == width &&
-      dst_stride_v * 4 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUV411Row = ARGBToUV411Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow_ = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow_ = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_NEON;
-    }
-  }
-#endif
-  {
-    // Allocate a rows of uv.
-    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
-
-    for (y = 0; y < height - 1; y += 2) {
-      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-      ARGBToYRow(src_argb, dst_y, width);
-      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-      src_argb += src_stride_argb * 2;
-      dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
-    }
-    if (height & 1) {
-      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-      ARGBToYRow(src_argb, dst_y, width);
-    }
-    free_aligned_buffer_64(row_u);
-  }
-  return 0;
-}
-
-// Same as NV12 but U and V swapped.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow_ = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow_ = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_NEON;
-    }
-  }
-#endif
-  {
-    // Allocate a rows of uv.
-    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
-
-    for (y = 0; y < height - 1; y += 2) {
-      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-      ARGBToYRow(src_argb, dst_y, width);
-      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-      src_argb += src_stride_argb * 2;
-      dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
-    }
-    if (height & 1) {
-      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-      ARGBToYRow(src_argb, dst_y, width);
-    }
-    free_aligned_buffer_64(row_u);
-  }
-  return 0;
-}
-
-// Convert ARGB to YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
-  int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
-
-  if (!src_argb || !dst_yuy2 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
-    dst_stride_yuy2 = -dst_stride_yuy2;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yuy2 == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_yuy2 = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToYUY2Row = I422ToYUY2Row_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToYUY2Row = I422ToYUY2Row_NEON;
-    }
-  }
-#endif
-
-  {
-    // Allocate a rows of yuv.
-    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-
-    for (y = 0; y < height; ++y) {
-      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      ARGBToYRow(src_argb, row_y, width);
-      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
-      src_argb += src_stride_argb;
-      dst_yuy2 += dst_stride_yuy2;
-    }
-
-    free_aligned_buffer_64(row_y);
-  }
-  return 0;
-}
-
-// Convert ARGB to UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
-  int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
-
-  if (!src_argb || !dst_uyvy ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
-    dst_stride_uyvy = -dst_stride_uyvy;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_uyvy == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_uyvy = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToUYVYRow = I422ToUYVYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToUYVYRow = I422ToUYVYRow_NEON;
-    }
-  }
-#endif
-
-  {
-    // Allocate a rows of yuv.
-    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-
-    for (y = 0; y < height; ++y) {
-      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      ARGBToYRow(src_argb, row_y, width);
-      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
-      src_argb += src_stride_argb;
-      dst_uyvy += dst_stride_uyvy;
-    }
-
-    free_aligned_buffer_64(row_y);
-  }
-  return 0;
-}
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb || !dst_y || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-  }
-  return 0;
-}
-
-// Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
-
-// Convert ARGB to RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return ARGBShuffle(src_argb, src_stride_argb,
-                     dst_rgba, dst_stride_rgba,
-                     (const uint8*)(&kShuffleMaskARGBToRGBA),
-                     width, height);
-}
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  int y;
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToRGB24Row_C;
-  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb24 == width * 3) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_rgb24 = 0;
-  }
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToRGB24Row(src_argb, dst_rgb24, width);
-    src_argb += src_stride_argb;
-    dst_rgb24 += dst_stride_rgb24;
-  }
-  return 0;
-}
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  int y;
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToRAWRow_C;
-  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_raw == width * 3) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_raw = 0;
-  }
-#if defined(HAS_ARGBTORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRAWRow = ARGBToRAWRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToRAWRow(src_argb, dst_raw, width);
-    src_argb += src_stride_argb;
-    dst_raw += dst_stride_raw;
-  }
-  return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
-};
-
-// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
-LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
-  int y;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
-  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  if (!dither4x4) {
-    dither4x4 = kDither565_4x4;
-  }
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
-    }
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    ARGBToRGB565DitherRow(src_argb, dst_rgb565,
-                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
-    src_argb += src_stride_argb;
-    dst_rgb565 += dst_stride_rgb565;
-  }
-  return 0;
-}
-
-// Convert ARGB To RGB565.
-// TODO(fbarchard): Consider using dither function low level with zeros.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToRGB565Row_C;
-  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb565 == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_rgb565 = 0;
-  }
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToRGB565Row(src_argb, dst_rgb565, width);
-    src_argb += src_stride_argb;
-    dst_rgb565 += dst_stride_rgb565;
-  }
-  return 0;
-}
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
-  int y;
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToARGB1555Row_C;
-  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb1555 == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb1555 = 0;
-  }
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOARGB1555ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
-    src_argb += src_stride_argb;
-    dst_argb1555 += dst_stride_argb1555;
-  }
-  return 0;
-}
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
-  int y;
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToARGB4444Row_C;
-  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb4444 == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb4444 = 0;
-  }
-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOARGB4444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
-    src_argb += src_stride_argb;
-    dst_argb4444 += dst_stride_argb4444;
-  }
-  return 0;
-}
-
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
-      ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYJRow = ARGBToYJRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
-    ARGBToYJRow(src_argb, dst_yj, width);
-    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
-    src_argb += src_stride_argb * 2;
-    dst_yj += dst_stride_yj * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYJRow(src_argb, dst_yj, width);
-  }
-  return 0;
-}
-
-// Convert ARGB to J422. (JPeg full range I422).
-LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
-      ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYJRow = ARGBToYJRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYJRow(src_argb, dst_yj, width);
-    src_argb += src_stride_argb;
-    dst_yj += dst_stride_yj;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-// Convert ARGB to J400.
-LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height) {
-  int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
-      ARGBToYJRow_C;
-  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_yj = 0;
-  }
-#if defined(HAS_ARGBTOYJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYJRow = ARGBToYJRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToYJRow(src_argb, dst_yj, width);
-    src_argb += src_stride_argb;
-    dst_yj += dst_stride_yj;
-  }
-  return 0;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert_jpeg.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert_jpeg.cc
deleted file mode 100755
index bcb980f..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert_jpeg.cc
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert.h"
-
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#ifdef HAVE_JPEG
-struct I420Buffers {
-  uint8* y;
-  int y_stride;
-  uint8* u;
-  int u_stride;
-  uint8* v;
-  int v_stride;
-  int w;
-  int h;
-};
-
-static void JpegCopyI420(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I420Copy(data[0], strides[0],
-           data[1], strides[1],
-           data[2], strides[2],
-           dest->y, dest->y_stride,
-           dest->u, dest->u_stride,
-           dest->v, dest->v_stride,
-           dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI422ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I422ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI444ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I444ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I411ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI400ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I400ToI420(data[0], strides[0],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-// Query size of MJPG in pixels.
-LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height) {
-  MJpegDecoder mjpeg_decoder;
-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret) {
-    *width = mjpeg_decoder.GetWidth();
-    *height = mjpeg_decoder.GetHeight();
-  }
-  mjpeg_decoder.UnloadFrame();
-  return ret ? 0 : -1;  // -1 for runtime failure.
-}
-
-// MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToI420(const uint8* sample,
-               size_t sample_size,
-               uint8* y, int y_stride,
-               uint8* u, int u_stride,
-               uint8* v, int v_stride,
-               int w, int h,
-               int dw, int dh) {
-  if (sample_size == kUnknownDataSize) {
-    // ERROR: MJPEG frame size unknown
-    return -1;
-  }
-
-  // TODO(fbarchard): Port MJpeg to C.
-  MJpegDecoder mjpeg_decoder;
-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
-    // ERROR: MJPEG frame has unexpected dimensions
-    mjpeg_decoder.UnloadFrame();
-    return 1;  // runtime failure
-  }
-  if (ret) {
-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
-    // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
-        mjpeg_decoder.GetNumComponents() == 3 &&
-        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
-        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
-    // YUV422
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
-    // YUV444
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
-    // YUV400
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceGrayscale &&
-               mjpeg_decoder.GetNumComponents() == 1 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
-    } else {
-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
-      // ERROR: Unable to convert MJPEG frame because format is not supported
-      mjpeg_decoder.UnloadFrame();
-      return 1;
-    }
-  }
-  return ret ? 0 : 1;
-}
-
-#ifdef HAVE_JPEG
-struct ARGBBuffers {
-  uint8* argb;
-  int argb_stride;
-  int w;
-  int h;
-};
-
-static void JpegI420ToARGB(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I420ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI422ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I422ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI444ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I444ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I411ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI400ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I400ToARGB(data[0], strides[0],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-// MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample,
-               size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h,
-               int dw, int dh) {
-  if (sample_size == kUnknownDataSize) {
-    // ERROR: MJPEG frame size unknown
-    return -1;
-  }
-
-  // TODO(fbarchard): Port MJpeg to C.
-  MJpegDecoder mjpeg_decoder;
-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
-    // ERROR: MJPEG frame has unexpected dimensions
-    mjpeg_decoder.UnloadFrame();
-    return 1;  // runtime failure
-  }
-  if (ret) {
-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
-    // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
-        mjpeg_decoder.GetNumComponents() == 3 &&
-        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
-        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
-    // YUV422
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
-    // YUV444
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
-    // YUV400
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceGrayscale &&
-               mjpeg_decoder.GetNumComponents() == 1 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
-    } else {
-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
-      // ERROR: Unable to convert MJPEG frame because format is not supported
-      mjpeg_decoder.UnloadFrame();
-      return 1;
-    }
-  }
-  return ret ? 0 : 1;
-}
-#endif
-
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert_to_argb.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert_to_argb.cc
deleted file mode 100755
index af829fb..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert_to_argb.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_argb.h"
-
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-//   With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
-                  uint8* crop_argb, int argb_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
-                  enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
-  int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-  int r = 0;
-
-  // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination crop_argb is same as source sample,
-  // also enable temporary buffer.
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
-      crop_argb == sample;
-  uint8* tmp_argb = crop_argb;
-  int tmp_argb_stride = argb_stride;
-  uint8* rotate_buffer = NULL;
-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
-  if (crop_argb == NULL || sample == NULL ||
-      src_width <= 0 || crop_width <= 0 ||
-      src_height == 0 || crop_height == 0) {
-    return -1;
-  }
-  if (src_height < 0) {
-    inv_crop_height = -inv_crop_height;
-  }
-
-  if (need_buf) {
-    int argb_size = crop_width * abs_crop_height * 4;
-    rotate_buffer = (uint8*)malloc(argb_size);
-    if (!rotate_buffer) {
-      return 1;  // Out of memory runtime error.
-    }
-    crop_argb = rotate_buffer;
-    argb_stride = crop_width;
-  }
-
-  switch (format) {
-    // Single plane formats
-    case FOURCC_YUY2:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_UYVY:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_24BG:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToARGB(src, src_width * 3,
-                      crop_argb, argb_stride,
-                      crop_width, inv_crop_height);
-      break;
-    case FOURCC_RAW:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToARGB(src, src_width * 3,
-                    crop_argb, argb_stride,
-                    crop_width, inv_crop_height);
-      break;
-    case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_BGRA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_ABGR:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGBA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGBP:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToARGB(src, src_width * 2,
-                       crop_argb, argb_stride,
-                       crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGBO:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
-                         crop_width, inv_crop_height);
-      break;
-    case FOURCC_R444:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
-                         crop_width, inv_crop_height);
-      break;
-    case FOURCC_I400:
-      src = sample + src_width * crop_y + crop_x;
-      r = I400ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-
-    // Biplanar formats
-    case FOURCC_NV12:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_NV21:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      // Call NV12 but with u and v parameters swapped.
-      r = NV21ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_M420:
-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    // Triplanar formats
-    case FOURCC_I420:
-    case FOURCC_YU12:
-    case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      int halfheight = (abs_src_height + 1) / 2;
-      if (format == FOURCC_YV12) {
-        src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      }
-      r = I420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-
-    case FOURCC_J420: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      int halfheight = (abs_src_height + 1) / 2;
-      src_u = sample + src_width * abs_src_height +
-          (halfwidth * crop_y + crop_x) / 2;
-      src_v = sample + src_width * abs_src_height +
-          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      r = J420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-
-    case FOURCC_I422:
-    case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      }
-      r = I422ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I444:
-    case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      if (format == FOURCC_YV24) {
-        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      } else {
-        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      }
-      r = I444ToARGB(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToARGB(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-#ifdef HAVE_JPEG
-    case FOURCC_MJPG:
-      r = MJPGToARGB(sample, sample_size,
-                     crop_argb, argb_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
-      break;
-#endif
-    default:
-      r = -1;  // unknown fourcc - return failure code.
-  }
-
-  if (need_buf) {
-    if (!r) {
-      r = ARGBRotate(crop_argb, argb_stride,
-                     tmp_argb, tmp_argb_stride,
-                     crop_width, abs_crop_height, rotation);
-    }
-    free(rotate_buffer);
-  }
-
-  return r;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/convert_to_i420.cc b/android/src/main/libenc/jni/libyuv/jni/source/convert_to_i420.cc
deleted file mode 100755
index 5e75369..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/convert_to_i420.cc
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "libyuv/convert.h"
-
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-//   With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToI420(const uint8* sample,
-                  size_t sample_size,
-                  uint8* y, int y_stride,
-                  uint8* u, int u_stride,
-                  uint8* v, int v_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
-                  enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
-  int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-  int r = 0;
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
-  uint8* tmp_y = y;
-  uint8* tmp_u = u;
-  uint8* tmp_v = v;
-  int tmp_y_stride = y_stride;
-  int tmp_u_stride = u_stride;
-  int tmp_v_stride = v_stride;
-  uint8* rotate_buffer = NULL;
-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
-  if (!y || !u || !v || !sample ||
-      src_width <= 0 || crop_width <= 0  ||
-      src_height == 0 || crop_height == 0) {
-    return -1;
-  }
-  if (src_height < 0) {
-    inv_crop_height = -inv_crop_height;
-  }
-
-  // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination y is same as source sample,
-  // also enable temporary buffer.
-  if (need_buf) {
-    int y_size = crop_width * abs_crop_height;
-    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
-    if (!rotate_buffer) {
-      return 1;  // Out of memory runtime error.
-    }
-    y = rotate_buffer;
-    u = y + y_size;
-    v = u + uv_size;
-    y_stride = crop_width;
-    u_stride = v_stride = ((crop_width + 1) / 2);
-  }
-
-  switch (format) {
-    // Single plane formats
-    case FOURCC_YUY2:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_UYVY:
-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGBP:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToI420(src, src_width * 2,
-                       y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGBO:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
-      break;
-    case FOURCC_R444:
-      src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
-      break;
-    case FOURCC_24BG:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToI420(src, src_width * 3,
-                      y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      crop_width, inv_crop_height);
-      break;
-    case FOURCC_RAW:
-      src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToI420(src, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    crop_width, inv_crop_height);
-      break;
-    case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_BGRA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_ABGR:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGBA:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_I400:
-      src = sample + src_width * crop_y + crop_x;
-      r = I400ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    // Biplanar formats
-    case FOURCC_NV12:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           u, u_stride,
-                           v, v_stride,
-                           crop_width, inv_crop_height, rotation);
-      break;
-    case FOURCC_NV21:
-      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      // Call NV12 but with u and v parameters swapped.
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           v, v_stride,
-                           u, u_stride,
-                           crop_width, inv_crop_height, rotation);
-      break;
-    case FOURCC_M420:
-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    // Triplanar formats
-    case FOURCC_I420:
-    case FOURCC_YU12:
-    case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      int halfheight = (abs_src_height + 1) / 2;
-      if (format == FOURCC_YV12) {
-        src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      }
-      r = I420Rotate(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height, rotation);
-      break;
-    }
-    case FOURCC_I422:
-    case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      int halfwidth = (src_width + 1) / 2;
-      if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
-      }
-      r = I422ToI420(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I444:
-    case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
-      if (format == FOURCC_YV24) {
-        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      } else {
-        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
-      }
-      r = I444ToI420(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToI420(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-#ifdef HAVE_JPEG
-    case FOURCC_MJPG:
-      r = MJPGToI420(sample, sample_size,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
-      break;
-#endif
-    default:
-      r = -1;  // unknown fourcc - return failure code.
-  }
-
-  if (need_buf) {
-    if (!r) {
-      r = I420Rotate(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     tmp_y, tmp_y_stride,
-                     tmp_u, tmp_u_stride,
-                     tmp_v, tmp_v_stride,
-                     crop_width, abs_crop_height, rotation);
-    }
-    free(rotate_buffer);
-  }
-
-  return r;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/cpu_id.cc b/android/src/main/libenc/jni/libyuv/jni/source/cpu_id.cc
deleted file mode 100755
index d64d9d5..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/cpu_id.cc
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/cpu_id.h"
-
-#if defined(_MSC_VER)
-#include <intrin.h>  // For __cpuidex()
-#endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
-#include <immintrin.h>  // For _xgetbv()
-#endif
-
-#if !defined(__native_client__)
-#include <stdlib.h>  // For getenv()
-#endif
-
-// For ArmCpuCaps() but unittested on all platforms
-#include <stdio.h>
-#include <string.h>
-
-#include "libyuv/basic_types.h"  // For CPU_X86
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// For functions that use the stack and have runtime checks for overflow,
-// use SAFEBUFFERS to avoid additional check.
-#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
-#define SAFEBUFFERS __declspec(safebuffers)
-#else
-#define SAFEBUFFERS
-#endif
-
-// Low level cpuid for X86.
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
-    !defined(__pnacl__) && !defined(__CLR_VER)
-LIBYUV_API
-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
-// Visual C version uses intrinsic or inline x86 assembly.
-#if (_MSC_FULL_VER >= 160040219)
-  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
-#elif defined(_M_IX86)
-  __asm {
-    mov        eax, info_eax
-    mov        ecx, info_ecx
-    mov        edi, cpu_info
-    cpuid
-    mov        [edi], eax
-    mov        [edi + 4], ebx
-    mov        [edi + 8], ecx
-    mov        [edi + 12], edx
-  }
-#else  // Visual C but not x86
-  if (info_ecx == 0) {
-    __cpuid((int*)(cpu_info), info_eax);
-  } else {
-    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
-  }
-#endif
-// GCC version uses inline x86 assembly.
-#else  // defined(_MSC_VER)
-  uint32 info_ebx, info_edx;
-  asm volatile (
-#if defined( __i386__) && defined(__PIC__)
-    // Preserve ebx for fpic 32 bit.
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=D" (info_ebx),
-#else
-    "cpuid                                     \n"
-    : "=b" (info_ebx),
-#endif  //  defined( __i386__) && defined(__PIC__)
-      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
-  cpu_info[0] = info_eax;
-  cpu_info[1] = info_ebx;
-  cpu_info[2] = info_ecx;
-  cpu_info[3] = info_edx;
-#endif  // defined(_MSC_VER)
-}
-#else  // (defined(_M_IX86) || defined(_M_X64) ...
-LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
-  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
-}
-#endif
-
-// For VS2010 and earlier emit can be used:
-//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
-//  __asm {
-//    xor        ecx, ecx    // xcr 0
-//    xgetbv
-//    mov        xcr0, eax
-//  }
-// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
-// https://code.google.com/p/libyuv/issues/detail?id=529
-#if defined(_M_IX86) && (_MSC_VER < 1900)
-#pragma optimize("g", off)
-#endif
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
-    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
-#define HAS_XGETBV
-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int GetXCR0() {
-  uint32 xcr0 = 0u;
-#if (_MSC_FULL_VER >= 160040219)
-  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(__i386__) || defined(__x86_64__)
-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
-#endif  // defined(__i386__) || defined(__x86_64__)
-  return xcr0;
-}
-#endif  // defined(_M_IX86) || defined(_M_X64) ..
-// Return optimization to previous setting.
-#if defined(_M_IX86) && (_MSC_VER < 1900)
-#pragma optimize("g", on)
-#endif
-
-// based on libvpx arm_cpudetect.c
-// For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
-  char cpuinfo_line[512];
-  FILE* f = fopen(cpuinfo_name, "r");
-  if (!f) {
-    // Assume Neon if /proc/cpuinfo is unavailable.
-    // This will occur for Chrome sandbox for Pepper or Render process.
-    return kCpuHasNEON;
-  }
-  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
-    if (memcmp(cpuinfo_line, "Features", 8) == 0) {
-      char* p = strstr(cpuinfo_line, " neon");
-      if (p && (p[5] == ' ' || p[5] == '\n')) {
-        fclose(f);
-        return kCpuHasNEON;
-      }
-      // aarch64 uses asimd for Neon.
-      p = strstr(cpuinfo_line, " asimd");
-      if (p && (p[6] == ' ' || p[6] == '\n')) {
-        fclose(f);
-        return kCpuHasNEON;
-      }
-    }
-  }
-  fclose(f);
-  return 0;
-}
-
-// CPU detect function for SIMD instruction sets.
-LIBYUV_API
-int cpu_info_ = 0;  // cpu_info is not initialized yet.
-
-// Test environment variable for disabling CPU features. Any non-zero value
-// to disable. Zero ignored to make it easy to set the variable on/off.
-#if !defined(__native_client__) && !defined(_M_ARM)
-
-static LIBYUV_BOOL TestEnv(const char* name) {
-  const char* var = getenv(name);
-  if (var) {
-    if (var[0] != '0') {
-      return LIBYUV_TRUE;
-    }
-  }
-  return LIBYUV_FALSE;
-}
-#else  // nacl does not support getenv().
-static LIBYUV_BOOL TestEnv(const char*) {
-  return LIBYUV_FALSE;
-}
-#endif
-
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
-  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
-  int cpu_info = 0;
-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
-  CpuId(0, 0, cpu_info0);
-  CpuId(1, 0, cpu_info1);
-  if (cpu_info0[0] >= 7) {
-    CpuId(7, 0, cpu_info7);
-  }
-  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
-             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
-             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
-             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
-             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
-             kCpuHasX86;
-
-#ifdef HAS_XGETBV
-  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
-  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
-      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
-    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
-
-    // Detect AVX512bw
-    if ((GetXCR0() & 0xe0) == 0xe0) {
-      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
-    }
-  }
-#endif
-
-  // Environment variable overrides for testing.
-  if (TestEnv("LIBYUV_DISABLE_X86")) {
-    cpu_info &= ~kCpuHasX86;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
-    cpu_info &= ~kCpuHasSSE2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
-    cpu_info &= ~kCpuHasSSSE3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
-    cpu_info &= ~kCpuHasSSE41;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
-    cpu_info &= ~kCpuHasSSE42;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX")) {
-    cpu_info &= ~kCpuHasAVX;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
-    cpu_info &= ~kCpuHasAVX2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
-    cpu_info &= ~kCpuHasERMS;
-  }
-  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
-    cpu_info &= ~kCpuHasFMA3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX3")) {
-    cpu_info &= ~kCpuHasAVX3;
-  }
-#endif
-#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_dspr2)
-  cpu_info |= kCpuHasDSPR2;
-#endif
-  cpu_info |= kCpuHasMIPS;
-  if (getenv("LIBYUV_DISABLE_DSPR2")) {
-    cpu_info &= ~kCpuHasDSPR2;
-  }
-#endif
-#if defined(__arm__) || defined(__aarch64__)
-// gcc -mfpu=neon defines __ARM_NEON__
-// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
-// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
-#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
-  cpu_info = kCpuHasNEON;
-// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
-// flag in it.
-// So for aarch64, neon enabling is hard coded here.
-#endif
-#if defined(__aarch64__)
-  cpu_info = kCpuHasNEON;
-#else
-  // Linux arm parse text file for neon detect.
-  cpu_info = ArmCpuCaps("/proc/cpuinfo");
-#endif
-  cpu_info |= kCpuHasARM;
-  if (TestEnv("LIBYUV_DISABLE_NEON")) {
-    cpu_info &= ~kCpuHasNEON;
-  }
-#endif  // __arm__
-  if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info = 0;
-  }
-  cpu_info  |= kCpuInitialized;
-  cpu_info_ = cpu_info;
-  return cpu_info;
-}
-
-// Note that use of this function is not thread safe.
-LIBYUV_API
-void MaskCpuFlags(int enable_flags) {
-  cpu_info_ = InitCpuFlags() & enable_flags;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/mjpeg_decoder.cc b/android/src/main/libenc/jni/libyuv/jni/source/mjpeg_decoder.cc
deleted file mode 100755
index 5081841..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/mjpeg_decoder.cc
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/mjpeg_decoder.h"
-
-#ifdef HAVE_JPEG
-#include <assert.h>
-
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-// Must be included before jpeglib.
-#include <setjmp.h>
-#define HAVE_SETJMP
-
-#if defined(_MSC_VER)
-// disable warning 4324: structure was padded due to __declspec(align())
-#pragma warning(disable:4324)
-#endif
-
-#endif
-struct FILE;  // For jpeglib.h.
-
-// C++ build requires extern C for jpeg internals.
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <jpeglib.h>
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#include "libyuv/planar_functions.h"  // For CopyPlane().
-
-namespace libyuv {
-
-#ifdef HAVE_SETJMP
-struct SetJmpErrorMgr {
-  jpeg_error_mgr base;  // Must be at the top
-  jmp_buf setjmp_buffer;
-};
-#endif
-
-const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
-const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
-const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
-const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
-const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
-const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
-
-// Methods that are passed to jpeglib.
-boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
-void init_source(jpeg_decompress_struct* cinfo);
-void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
-void term_source(jpeg_decompress_struct* cinfo);
-void ErrorHandler(jpeg_common_struct* cinfo);
-
-MJpegDecoder::MJpegDecoder()
-    : has_scanline_padding_(LIBYUV_FALSE),
-      num_outbufs_(0),
-      scanlines_(NULL),
-      scanlines_sizes_(NULL),
-      databuf_(NULL),
-      databuf_strides_(NULL) {
-  decompress_struct_ = new jpeg_decompress_struct;
-  source_mgr_ = new jpeg_source_mgr;
-#ifdef HAVE_SETJMP
-  error_mgr_ = new SetJmpErrorMgr;
-  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
-  // Override standard exit()-based error handler.
-  error_mgr_->base.error_exit = &ErrorHandler;
-#endif
-  decompress_struct_->client_data = NULL;
-  source_mgr_->init_source = &init_source;
-  source_mgr_->fill_input_buffer = &fill_input_buffer;
-  source_mgr_->skip_input_data = &skip_input_data;
-  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
-  source_mgr_->term_source = &term_source;
-  jpeg_create_decompress(decompress_struct_);
-  decompress_struct_->src = source_mgr_;
-  buf_vec_.buffers = &buf_;
-  buf_vec_.len = 1;
-}
-
-MJpegDecoder::~MJpegDecoder() {
-  jpeg_destroy_decompress(decompress_struct_);
-  delete decompress_struct_;
-  delete source_mgr_;
-#ifdef HAVE_SETJMP
-  delete error_mgr_;
-#endif
-  DestroyOutputBuffers();
-}
-
-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
-  if (!ValidateJpeg(src, src_len)) {
-    return LIBYUV_FALSE;
-  }
-
-  buf_.data = src;
-  buf_.len = static_cast<int>(src_len);
-  buf_vec_.pos = 0;
-  decompress_struct_->client_data = &buf_vec_;
-#ifdef HAVE_SETJMP
-  if (setjmp(error_mgr_->setjmp_buffer)) {
-    // We called jpeg_read_header, it experienced an error, and we called
-    // longjmp() and rewound the stack to here. Return error.
-    return LIBYUV_FALSE;
-  }
-#endif
-  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
-    // ERROR: Bad MJPEG header
-    return LIBYUV_FALSE;
-  }
-  AllocOutputBuffers(GetNumComponents());
-  for (int i = 0; i < num_outbufs_; ++i) {
-    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
-    if (scanlines_sizes_[i] != scanlines_size) {
-      if (scanlines_[i]) {
-        delete scanlines_[i];
-      }
-      scanlines_[i] = new uint8* [scanlines_size];
-      scanlines_sizes_[i] = scanlines_size;
-    }
-
-    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
-    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
-    // the preceding scanlines, the padding is not needed/wanted because the
-    // following addresses will already be valid (they are the initial bytes of
-    // the next scanline) and will be overwritten when jpeglib writes out that
-    // next scanline.
-    int databuf_stride = GetComponentStride(i);
-    int databuf_size = scanlines_size * databuf_stride;
-    if (databuf_strides_[i] != databuf_stride) {
-      if (databuf_[i]) {
-        delete databuf_[i];
-      }
-      databuf_[i] = new uint8[databuf_size];
-      databuf_strides_[i] = databuf_stride;
-    }
-
-    if (GetComponentStride(i) != GetComponentWidth(i)) {
-      has_scanline_padding_ = LIBYUV_TRUE;
-    }
-  }
-  return LIBYUV_TRUE;
-}
-
-static int DivideAndRoundUp(int numerator, int denominator) {
-  return (numerator + denominator - 1) / denominator;
-}
-
-static int DivideAndRoundDown(int numerator, int denominator) {
-  return numerator / denominator;
-}
-
-// Returns width of the last loaded frame.
-int MJpegDecoder::GetWidth() {
-  return decompress_struct_->image_width;
-}
-
-// Returns height of the last loaded frame.
-int MJpegDecoder::GetHeight() {
-  return decompress_struct_->image_height;
-}
-
-// Returns format of the last loaded frame. The return value is one of the
-// kColorSpace* constants.
-int MJpegDecoder::GetColorSpace() {
-  return decompress_struct_->jpeg_color_space;
-}
-
-// Number of color components in the color space.
-int MJpegDecoder::GetNumComponents() {
-  return decompress_struct_->num_components;
-}
-
-// Sample factors of the n-th component.
-int MJpegDecoder::GetHorizSampFactor(int component) {
-  return decompress_struct_->comp_info[component].h_samp_factor;
-}
-
-int MJpegDecoder::GetVertSampFactor(int component) {
-  return decompress_struct_->comp_info[component].v_samp_factor;
-}
-
-int MJpegDecoder::GetHorizSubSampFactor(int component) {
-  return decompress_struct_->max_h_samp_factor /
-      GetHorizSampFactor(component);
-}
-
-int MJpegDecoder::GetVertSubSampFactor(int component) {
-  return decompress_struct_->max_v_samp_factor /
-      GetVertSampFactor(component);
-}
-
-int MJpegDecoder::GetImageScanlinesPerImcuRow() {
-  return decompress_struct_->max_v_samp_factor * DCTSIZE;
-}
-
-int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
-  int vs = GetVertSubSampFactor(component);
-  return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
-}
-
-int MJpegDecoder::GetComponentWidth(int component) {
-  int hs = GetHorizSubSampFactor(component);
-  return DivideAndRoundUp(GetWidth(), hs);
-}
-
-int MJpegDecoder::GetComponentHeight(int component) {
-  int vs = GetVertSubSampFactor(component);
-  return DivideAndRoundUp(GetHeight(), vs);
-}
-
-// Get width in bytes padded out to a multiple of DCTSIZE
-int MJpegDecoder::GetComponentStride(int component) {
-  return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
-}
-
-int MJpegDecoder::GetComponentSize(int component) {
-  return GetComponentWidth(component) * GetComponentHeight(component);
-}
-
-LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
-#ifdef HAVE_SETJMP
-  if (setjmp(error_mgr_->setjmp_buffer)) {
-    // We called jpeg_abort_decompress, it experienced an error, and we called
-    // longjmp() and rewound the stack to here. Return error.
-    return LIBYUV_FALSE;
-  }
-#endif
-  jpeg_abort_decompress(decompress_struct_);
-  return LIBYUV_TRUE;
-}
-
-// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
-    uint8** planes, int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
-    // ERROR: Bad dimensions
-    return LIBYUV_FALSE;
-  }
-#ifdef HAVE_SETJMP
-  if (setjmp(error_mgr_->setjmp_buffer)) {
-    // We called into jpeglib, it experienced an error sometime during this
-    // function call, and we called longjmp() and rewound the stack to here.
-    // Return error.
-    return LIBYUV_FALSE;
-  }
-#endif
-  if (!StartDecode()) {
-    return LIBYUV_FALSE;
-  }
-  SetScanlinePointers(databuf_);
-  int lines_left = dst_height;
-  // Compute amount of lines to skip to implement vertical crop.
-  // TODO(fbarchard): Ensure skip is a multiple of maximum component
-  // subsample. ie 2
-  int skip = (GetHeight() - dst_height) / 2;
-  if (skip > 0) {
-    // There is no API to skip lines in the output data, so we read them
-    // into the temp buffer.
-    while (skip >= GetImageScanlinesPerImcuRow()) {
-      if (!DecodeImcuRow()) {
-        FinishDecode();
-        return LIBYUV_FALSE;
-      }
-      skip -= GetImageScanlinesPerImcuRow();
-    }
-    if (skip > 0) {
-      // Have a partial iMCU row left over to skip. Must read it and then
-      // copy the parts we want into the destination.
-      if (!DecodeImcuRow()) {
-        FinishDecode();
-        return LIBYUV_FALSE;
-      }
-      for (int i = 0; i < num_outbufs_; ++i) {
-        // TODO(fbarchard): Compute skip to avoid this
-        assert(skip % GetVertSubSampFactor(i) == 0);
-        int rows_to_skip =
-            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
-                                rows_to_skip;
-        int data_to_skip = rows_to_skip * GetComponentStride(i);
-        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
-                  planes[i], GetComponentWidth(i),
-                  GetComponentWidth(i), scanlines_to_copy);
-        planes[i] += scanlines_to_copy * GetComponentWidth(i);
-      }
-      lines_left -= (GetImageScanlinesPerImcuRow() - skip);
-    }
-  }
-
-  // Read full MCUs but cropped horizontally
-  for (; lines_left > GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
-    if (!DecodeImcuRow()) {
-      FinishDecode();
-      return LIBYUV_FALSE;
-    }
-    for (int i = 0; i < num_outbufs_; ++i) {
-      int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
-      planes[i] += scanlines_to_copy * GetComponentWidth(i);
-    }
-  }
-
-  if (lines_left > 0) {
-    // Have a partial iMCU row left over to decode.
-    if (!DecodeImcuRow()) {
-      FinishDecode();
-      return LIBYUV_FALSE;
-    }
-    for (int i = 0; i < num_outbufs_; ++i) {
-      int scanlines_to_copy =
-          DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
-      planes[i] += scanlines_to_copy * GetComponentWidth(i);
-    }
-  }
-  return FinishDecode();
-}
-
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
-    int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
-    // ERROR: Bad dimensions
-    return LIBYUV_FALSE;
-  }
-#ifdef HAVE_SETJMP
-  if (setjmp(error_mgr_->setjmp_buffer)) {
-    // We called into jpeglib, it experienced an error sometime during this
-    // function call, and we called longjmp() and rewound the stack to here.
-    // Return error.
-    return LIBYUV_FALSE;
-  }
-#endif
-  if (!StartDecode()) {
-    return LIBYUV_FALSE;
-  }
-  SetScanlinePointers(databuf_);
-  int lines_left = dst_height;
-  // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
-  int skip = (GetHeight() - dst_height) / 2;
-  if (skip > 0) {
-    while (skip >= GetImageScanlinesPerImcuRow()) {
-      if (!DecodeImcuRow()) {
-        FinishDecode();
-        return LIBYUV_FALSE;
-      }
-      skip -= GetImageScanlinesPerImcuRow();
-    }
-    if (skip > 0) {
-      // Have a partial iMCU row left over to skip.
-      if (!DecodeImcuRow()) {
-        FinishDecode();
-        return LIBYUV_FALSE;
-      }
-      for (int i = 0; i < num_outbufs_; ++i) {
-        // TODO(fbarchard): Compute skip to avoid this
-        assert(skip % GetVertSubSampFactor(i) == 0);
-        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int data_to_skip = rows_to_skip * GetComponentStride(i);
-        // Change our own data buffer pointers so we can pass them to the
-        // callback.
-        databuf_[i] += data_to_skip;
-      }
-      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
-      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
-      // Now change them back.
-      for (int i = 0; i < num_outbufs_; ++i) {
-        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int data_to_skip = rows_to_skip * GetComponentStride(i);
-        databuf_[i] -= data_to_skip;
-      }
-      lines_left -= scanlines_to_copy;
-    }
-  }
-  // Read full MCUs until we get to the crop point.
-  for (; lines_left >= GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
-    if (!DecodeImcuRow()) {
-      FinishDecode();
-      return LIBYUV_FALSE;
-    }
-    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
-  }
-  if (lines_left > 0) {
-    // Have a partial iMCU row left over to decode.
-    if (!DecodeImcuRow()) {
-      FinishDecode();
-      return LIBYUV_FALSE;
-    }
-    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
-  }
-  return FinishDecode();
-}
-
-void init_source(j_decompress_ptr cinfo) {
-  fill_input_buffer(cinfo);
-}
-
-boolean fill_input_buffer(j_decompress_ptr cinfo) {
-  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
-  if (buf_vec->pos >= buf_vec->len) {
-    assert(0 && "No more data");
-    // ERROR: No more data
-    return FALSE;
-  }
-  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
-  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
-  ++buf_vec->pos;
-  return TRUE;
-}
-
-void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
-  cinfo->src->next_input_byte += num_bytes;
-}
-
-void term_source(j_decompress_ptr cinfo) {
-  // Nothing to do.
-}
-
-#ifdef HAVE_SETJMP
-void ErrorHandler(j_common_ptr cinfo) {
-  // This is called when a jpeglib command experiences an error. Unfortunately
-  // jpeglib's error handling model is not very flexible, because it expects the
-  // error handler to not return--i.e., it wants the program to terminate. To
-  // recover from errors we use setjmp() as shown in their example. setjmp() is
-  // C's implementation for the "call with current continuation" functionality
-  // seen in some functional programming languages.
-  // A formatted message can be output, but is unsafe for release.
-#ifdef DEBUG
-  char buf[JMSG_LENGTH_MAX];
-  (*cinfo->err->format_message)(cinfo, buf);
-  // ERROR: Error in jpeglib: buf
-#endif
-
-  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
-  // This rewinds the call stack to the point of the corresponding setjmp()
-  // and causes it to return (for a second time) with value 1.
-  longjmp(mgr->setjmp_buffer, 1);
-}
-#endif
-
-void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
-  if (num_outbufs != num_outbufs_) {
-    // We could perhaps optimize this case to resize the output buffers without
-    // necessarily having to delete and recreate each one, but it's not worth
-    // it.
-    DestroyOutputBuffers();
-
-    scanlines_ = new uint8** [num_outbufs];
-    scanlines_sizes_ = new int[num_outbufs];
-    databuf_ = new uint8* [num_outbufs];
-    databuf_strides_ = new int[num_outbufs];
-
-    for (int i = 0; i < num_outbufs; ++i) {
-      scanlines_[i] = NULL;
-      scanlines_sizes_[i] = 0;
-      databuf_[i] = NULL;
-      databuf_strides_[i] = 0;
-    }
-
-    num_outbufs_ = num_outbufs;
-  }
-}
-
-void MJpegDecoder::DestroyOutputBuffers() {
-  for (int i = 0; i < num_outbufs_; ++i) {
-    delete [] scanlines_[i];
-    delete [] databuf_[i];
-  }
-  delete [] scanlines_;
-  delete [] databuf_;
-  delete [] scanlines_sizes_;
-  delete [] databuf_strides_;
-  scanlines_ = NULL;
-  databuf_ = NULL;
-  scanlines_sizes_ = NULL;
-  databuf_strides_ = NULL;
-  num_outbufs_ = 0;
-}
-
-// JDCT_IFAST and do_block_smoothing improve performance substantially.
-LIBYUV_BOOL MJpegDecoder::StartDecode() {
-  decompress_struct_->raw_data_out = TRUE;
-  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
-  decompress_struct_->dither_mode = JDITHER_NONE;
-  // Not applicable to 'raw':
-  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
-  // Only for buffered mode:
-  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
-  // Blocky but fast:
-  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
-
-  if (!jpeg_start_decompress(decompress_struct_)) {
-    // ERROR: Couldn't start JPEG decompressor";
-    return LIBYUV_FALSE;
-  }
-  return LIBYUV_TRUE;
-}
-
-LIBYUV_BOOL MJpegDecoder::FinishDecode() {
-  // jpeglib considers it an error if we finish without decoding the whole
-  // image, so we call "abort" rather than "finish".
-  jpeg_abort_decompress(decompress_struct_);
-  return LIBYUV_TRUE;
-}
-
-void MJpegDecoder::SetScanlinePointers(uint8** data) {
-  for (int i = 0; i < num_outbufs_; ++i) {
-    uint8* data_i = data[i];
-    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
-      scanlines_[i][j] = data_i;
-      data_i += GetComponentStride(i);
-    }
-  }
-}
-
-inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
-  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
-      jpeg_read_raw_data(decompress_struct_,
-                         scanlines_,
-                         GetImageScanlinesPerImcuRow());
-}
-
-// The helper function which recognizes the jpeg sub-sampling type.
-JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
-    int* subsample_x, int* subsample_y, int number_of_components) {
-  if (number_of_components == 3) {  // Color images.
-    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 2 &&
-        subsample_x[2] == 2 && subsample_y[2] == 2) {
-      return kJpegYuv420;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 1 &&
-        subsample_x[2] == 2 && subsample_y[2] == 1) {
-      return kJpegYuv422;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 1 && subsample_y[1] == 1 &&
-        subsample_x[2] == 1 && subsample_y[2] == 1) {
-      return kJpegYuv444;
-    }
-  } else if (number_of_components == 1) {  // Grey-scale images.
-    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
-      return kJpegYuv400;
-    }
-  }
-  return kJpegUnknown;
-}
-
-}  // namespace libyuv
-#endif  // HAVE_JPEG
-
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/mjpeg_validate.cc b/android/src/main/libenc/jni/libyuv/jni/source/mjpeg_validate.cc
deleted file mode 100755
index 9c48832..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/mjpeg_validate.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/mjpeg_decoder.h"
-
-#include <string.h>  // For memchr.
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Helper function to scan for EOI marker (0xff 0xd9).
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
-  if (sample_size >= 2) {
-    const uint8* end = sample + sample_size - 1;
-    const uint8* it = sample;
-    while (it < end) {
-      // TODO(fbarchard): scan for 0xd9 instead.
-      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
-      if (it == NULL) {
-        break;
-      }
-      if (it[1] == 0xd9) {
-        return LIBYUV_TRUE;  // Success: Valid jpeg.
-      }
-      ++it;  // Skip over current 0xff.
-    }
-  }
-  // ERROR: Invalid jpeg end code not found. Size sample_size
-  return LIBYUV_FALSE;
-}
-
-// Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
-  // Maximum size that ValidateJpeg will consider valid.
-  const size_t kMaxJpegSize = 0x7fffffffull;
-  const size_t kBackSearchSize = 1024;
-  if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
-    // ERROR: Invalid jpeg size: sample_size
-    return LIBYUV_FALSE;
-  }
-  if (sample[0] != 0xff || sample[1] != 0xd8) {  // SOI marker
-    // ERROR: Invalid jpeg initial start code
-    return LIBYUV_FALSE;
-  }
-
-  // Look for the End Of Image (EOI) marker near the end of the buffer.
-  if (sample_size > kBackSearchSize) {
-    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
-      return LIBYUV_TRUE;  // Success: Valid jpeg.
-    }
-    // Reduce search size for forward search.
-    sample_size = sample_size - kBackSearchSize + 1;
-  }
-  // Step over SOI marker and scan for EOI.
-  return ScanEOI(sample + 2, sample_size - 2);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/planar_functions.cc b/android/src/main/libenc/jni/libyuv/jni/source/planar_functions.cc
deleted file mode 100755
index 851c0fe..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/planar_functions.cc
+++ /dev/null
@@ -1,2629 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/planar_functions.h"
-
-#include <string.h>  // for memset()
-
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"  // for ScaleRowDown2
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy a plane of data
-LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_y = 0;
-  }
-  // Nothing to do.
-  if (src_y == dst_y && src_stride_y == dst_stride_y) {
-    return;
-  }
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
-  }
-#endif
-#if defined(HAS_COPYROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
-
-  // Copy plane
-  for (y = 0; y < height; ++y) {
-    CopyRow(src_y, dst_y, width);
-    src_y += src_stride_y;
-    dst_y += dst_stride_y;
-  }
-}
-
-LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height) {
-  int y;
-  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_y = 0;
-  }
-#if defined(HAS_COPYROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_16_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_16_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_16_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_16_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_16_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_16_MIPS;
-  }
-#endif
-
-  // Copy plane
-  for (y = 0; y < height; ++y) {
-    CopyRow(src_y, dst_y, width);
-    src_y += src_stride_y;
-    dst_y += dst_stride_y;
-  }
-}
-
-// Copy I422.
-LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  int halfwidth = (width + 1) >> 1;
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
-  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
-  return 0;
-}
-
-// Copy I444.
-LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
-  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
-  return 0;
-}
-
-// Copy I400.
-LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  return 0;
-}
-
-// Convert I420 to I400.
-LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  return 0;
-}
-
-// Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
-                 uint8* dst_y, int dst_stride_y,
-                 int width, int height) {
-  int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-#if defined(HAS_MIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MirrorRow = MirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    MirrorRow = MirrorRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MirrorRow = MirrorRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      MirrorRow = MirrorRow_AVX2;
-    }
-  }
-#endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
-    MirrorRow = MirrorRow_DSPR2;
-  }
-#endif
-
-  // Mirror plane
-  for (y = 0; y < height; ++y) {
-    MirrorRow(src_y, dst_y, width);
-    src_y += src_stride_y;
-    dst_y += dst_stride_y;
-  }
-}
-
-// Convert YUY2 to I422.
-LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
-      YUY2ToYRow_C;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
-    src_stride_yuy2 = -src_stride_yuy2;
-  }
-  // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
-    width *= height;
-    height = 1;
-    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-      YUY2ToYRow = YUY2ToYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
-    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
-      YUY2ToYRow = YUY2ToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToYRow = YUY2ToYRow_NEON;
-      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
-    YUY2ToYRow(src_yuy2, dst_y, width);
-    src_yuy2 += src_stride_yuy2;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-// Convert UYVY to I422.
-LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      UYVYToUV422Row_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int width) = UYVYToYRow_C;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
-    src_stride_uyvy = -src_stride_uyvy;
-  }
-  // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
-    width *= height;
-    height = 1;
-    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
-    UYVYToYRow = UYVYToYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToUV422Row = UYVYToUV422Row_SSE2;
-      UYVYToYRow = UYVYToYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
-    UYVYToYRow = UYVYToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      UYVYToUV422Row = UYVYToUV422Row_AVX2;
-      UYVYToYRow = UYVYToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    UYVYToYRow = UYVYToYRow_Any_NEON;
-    if (width >= 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 16)) {
-      UYVYToYRow = UYVYToYRow_NEON;
-      UYVYToUV422Row = UYVYToUV422Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
-    UYVYToYRow(src_uyvy, dst_y, width);
-    src_uyvy += src_stride_uyvy;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-// Mirror I400 with optional flipping
-LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-
-  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  return 0;
-}
-
-// Mirror I420 with optional flipping
-LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-
-  if (dst_y) {
-    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  }
-  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
-  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
-  return 0;
-}
-
-// ARGB mirror.
-LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
-      ARGBMirrorRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBMIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBMirrorRow = ARGBMirrorRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBMirrorRow = ARGBMirrorRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBMirrorRow = ARGBMirrorRow_AVX2;
-    }
-  }
-#endif
-
-  // Mirror plane
-  for (y = 0; y < height; ++y) {
-    ARGBMirrorRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Get a blender that optimized for the CPU and pixel count.
-// As there are 6 blenders to choose from, the caller should try to use
-// the same blend function for all pixels if possible.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend() {
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow = ARGBBlendRow_SSSE3;
-    return ARGBBlendRow;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBBlendRow = ARGBBlendRow_NEON;
-  }
-#endif
-  return ARGBBlendRow;
-}
-
-// Alpha Blend 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  int y;
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = GetARGBBlend();
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
-  }
-
-  for (y = 0; y < height; ++y) {
-    ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Alpha Blend plane and store to destination.
-LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  int y;
-  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
-  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_stride_y = -dst_stride_y;
-  }
-
-  // Coalesce rows for Y plane.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
-      alpha_stride == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
-  }
-
-#if defined(HAS_BLENDPLANEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-  BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      BlendPlaneRow = BlendPlaneRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_BLENDPLANEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      BlendPlaneRow = BlendPlaneRow_AVX2;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
-    src_y0 += src_stride_y0;
-    src_y1 += src_stride_y1;
-    alpha += alpha_stride;
-    dst_y += dst_stride_y;
-  }
-  return 0;
-}
-
-#define MAXTWIDTH 2048
-// Alpha Blend YUV images and store to destination.
-LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
-  int y;
-  // Half width/height for UV.
-  int halfwidth = (width + 1) >> 1;
-  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
-  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
-      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_stride_y = -dst_stride_y;
-  }
-
-  // Blend Y plane.
-  BlendPlane(src_y0, src_stride_y0,
-             src_y1, src_stride_y1,
-             alpha, alpha_stride,
-             dst_y, dst_stride_y,
-             width, height);
-
-#if defined(HAS_BLENDPLANEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
-    if (IS_ALIGNED(halfwidth, 8)) {
-      BlendPlaneRow = BlendPlaneRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_BLENDPLANEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      BlendPlaneRow = BlendPlaneRow_AVX2;
-    }
-  }
-#endif
-  if (!IS_ALIGNED(width, 2)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
-  }
-#if defined(HAS_SCALEROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
-    if (IS_ALIGNED(width, 2)) {
-      ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
-      if (IS_ALIGNED(halfwidth, 16)) {
-        ScaleRowDown2 = ScaleRowDown2Box_NEON;
-      }
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
-    if (IS_ALIGNED(width, 2)) {
-      ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
-      if (IS_ALIGNED(halfwidth, 16)) {
-        ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
-      }
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
-    if (IS_ALIGNED(width, 2)) {
-      ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
-      if (IS_ALIGNED(halfwidth, 32)) {
-        ScaleRowDown2 = ScaleRowDown2Box_AVX2;
-      }
-    }
-  }
-#endif
-
-  // Row buffer for intermediate alpha pixels.
-  align_buffer_64(halfalpha, halfwidth);
-  for (y = 0; y < height; y += 2) {
-    // last row of odd height image use 1 row of alpha instead of 2.
-    if (y == (height - 1)) {
-      alpha_stride = 0;
-    }
-    // Subsample 2 rows of UV to half width and half height.
-    ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
-    alpha += alpha_stride * 2;
-    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
-    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
-    src_u0 += src_stride_u0;
-    src_u1 += src_stride_u1;
-    dst_u += dst_stride_u;
-    src_v0 += src_stride_v0;
-    src_v1 += src_stride_v1;
-    dst_v += dst_stride_v;
-  }
-  free_aligned_buffer_64(halfalpha);
-  return 0;
-}
-
-// Multiply 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
-  int y;
-  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBMultiplyRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBMULTIPLYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBMULTIPLYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBMultiplyRow = ARGBMultiplyRow_NEON;
-    }
-  }
-#endif
-
-  // Multiply plane
-  for (y = 0; y < height; ++y) {
-    ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Add 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height) {
-  int y;
-  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                     int width) = ARGBAddRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBAddRow = ARGBAddRow_SSE2;
-  }
-#endif
-#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBAddRow = ARGBAddRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAddRow = ARGBAddRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBADDROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBAddRow = ARGBAddRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAddRow = ARGBAddRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBADDROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBAddRow = ARGBAddRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAddRow = ARGBAddRow_NEON;
-    }
-  }
-#endif
-
-  // Add plane
-  for (y = 0; y < height; ++y) {
-    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Subtract 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
-  int y;
-  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBSubtractRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBSUBTRACTROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBSubtractRow = ARGBSubtractRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBSubtractRow = ARGBSubtractRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBSUBTRACTROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBSubtractRow = ARGBSubtractRow_NEON;
-    }
-  }
-#endif
-
-  // Subtract plane
-  for (y = 0; y < height; ++y) {
-    ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
-                            const struct YuvConstants* yuvconstants,
-                            int width, int height) {
-  int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
-    dst_stride_rgba = -dst_stride_rgba;
-  }
-#if defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGBARow = I422ToRGBARow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
-    I422ToRGBARow = I422ToRGBARow_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
-    dst_rgba += dst_stride_rgba;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
-                          &kYvuI601Constants,  // Use Yvu matrix
-                          width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*NV12ToRGB565Row)(const uint8* y_buf,
-                          const uint8* uv_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = NV12ToRGB565Row_C;
-  if (!src_y || !src_uv || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_uv += src_stride_uv;
-    }
-  }
-  return 0;
-}
-
-// Convert RAW to RGB24.
-LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height) {
-  int y;
-  void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
-      RAWToRGB24Row_C;
-  if (!src_raw || !dst_rgb24 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-  // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_rgb24 == width * 3) {
-    width *= height;
-    height = 1;
-    src_stride_raw = dst_stride_rgb24 = 0;
-  }
-#if defined(HAS_RAWTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      RAWToRGB24Row = RAWToRGB24Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_RAWTORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RAWToRGB24Row = RAWToRGB24Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RAWToRGB24Row = RAWToRGB24Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    RAWToRGB24Row(src_raw, dst_rgb24, width);
-    src_raw += src_stride_raw;
-    dst_rgb24 += dst_stride_rgb24;
-  }
-  return 0;
-}
-
-LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value) {
-  int y;
-  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
-  if (height < 0) {
-    height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_stride_y = -dst_stride_y;
-  }
-  // Coalesce rows.
-  if (dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    dst_stride_y = 0;
-  }
-#if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SetRow = SetRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      SetRow = SetRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SETROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    SetRow = SetRow_Any_X86;
-    if (IS_ALIGNED(width, 4)) {
-      SetRow = SetRow_X86;
-    }
-  }
-#endif
-#if defined(HAS_SETROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    SetRow = SetRow_ERMS;
-  }
-#endif
-
-  // Set plane
-  for (y = 0; y < height; ++y) {
-    SetRow(dst_y, value, width);
-    dst_y += dst_stride_y;
-  }
-}
-
-// Draw a rectangle into I420
-LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y,
-             int width, int height,
-             int value_y, int value_u, int value_v) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  uint8* start_y = dst_y + y * dst_stride_y + x;
-  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
-  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-  if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0 ||
-      x < 0 || y < 0 ||
-      value_y < 0 || value_y > 255 ||
-      value_u < 0 || value_u > 255 ||
-      value_v < 0 || value_v > 255) {
-    return -1;
-  }
-
-  SetPlane(start_y, dst_stride_y, width, height, value_y);
-  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
-  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
-  return 0;
-}
-
-// Draw a rectangle into ARGB
-LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height,
-             uint32 value) {
-  int y;
-  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
-  if (!dst_argb ||
-      width <= 0 || height == 0 ||
-      dst_x < 0 || dst_y < 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
-  // Coalesce rows.
-  if (dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    dst_stride_argb = 0;
-  }
-
-#if defined(HAS_ARGBSETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBSetRow = ARGBSetRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBSetRow = ARGBSetRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBSETROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    ARGBSetRow = ARGBSetRow_X86;
-  }
-#endif
-
-  // Set plane
-  for (y = 0; y < height; ++y) {
-    ARGBSetRow(dst_argb, value, width);
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert unattentuated ARGB to preattenuated ARGB.
-// An unattenutated ARGB alpha blend uses the formula
-// p = a * f + (1 - a) * b
-// where
-//   p is output pixel
-//   f is foreground pixel
-//   b is background pixel
-//   a is alpha value from foreground pixel
-// An preattenutated ARGB alpha blend uses the formula
-// p = f + (1 - a) * b
-// where
-//   f is foreground pixel premultiplied by alpha
-
-LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
-  int y;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                           int width) = ARGBAttenuateRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBAttenuateRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert preattentuated ARGB to unattenuated ARGB.
-LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height) {
-  int y;
-  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                             int width) = ARGBUnattenuateRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
-    }
-  }
-#endif
-// TODO(fbarchard): Neon version.
-
-  for (y = 0; y < height; ++y) {
-    ARGBUnattenuateRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert ARGB to Grayed ARGB.
-LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
-    ARGBGrayRow = ARGBGrayRow_SSSE3;
-  }
-#endif
-#if defined(HAS_ARGBGRAYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBGrayRow = ARGBGrayRow_NEON;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBGrayRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Make a rectangle of ARGB gray scale.
-LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height) {
-  int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
-    return -1;
-  }
-  // Coalesce rows.
-  if (dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
-    ARGBGrayRow = ARGBGrayRow_SSSE3;
-  }
-#endif
-#if defined(HAS_ARGBGRAYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBGrayRow = ARGBGrayRow_NEON;
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    ARGBGrayRow(dst, dst, width);
-    dst += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Make a rectangle of ARGB Sepia tone.
-LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int dst_x, int dst_y, int width, int height) {
-  int y;
-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
-    return -1;
-  }
-  // Coalesce rows.
-  if (dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBSEPIAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
-    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
-  }
-#endif
-#if defined(HAS_ARGBSEPIAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBSepiaRow = ARGBSepiaRow_NEON;
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    ARGBSepiaRow(dst, width);
-    dst += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Apply a 4x4 matrix to each ARGB pixel.
-// Note: Normally for shading, but can be used to swizzle or invert.
-LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height) {
-  int y;
-  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
-      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
-  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
-    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
-  }
-#endif
-#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Apply a 4x3 matrix to each ARGB pixel.
-// Deprecated.
-LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int dst_x, int dst_y, int width, int height) {
-  SIMD_ALIGNED(int8 matrix_argb[16]);
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
-    return -1;
-  }
-
-  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
-  matrix_argb[0] = matrix_rgb[0] / 2;
-  matrix_argb[1] = matrix_rgb[1] / 2;
-  matrix_argb[2] = matrix_rgb[2] / 2;
-  matrix_argb[3] = matrix_rgb[3] / 2;
-  matrix_argb[4] = matrix_rgb[4] / 2;
-  matrix_argb[5] = matrix_rgb[5] / 2;
-  matrix_argb[6] = matrix_rgb[6] / 2;
-  matrix_argb[7] = matrix_rgb[7] / 2;
-  matrix_argb[8] = matrix_rgb[8] / 2;
-  matrix_argb[9] = matrix_rgb[9] / 2;
-  matrix_argb[10] = matrix_rgb[10] / 2;
-  matrix_argb[11] = matrix_rgb[11] / 2;
-  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
-  matrix_argb[15] = 64;  // 1.0
-
-  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
-                         dst, dst_stride_argb,
-                         &matrix_argb[0], width, height);
-}
-
-// Apply a color table each ARGB pixel.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int dst_x, int dst_y, int width, int height) {
-  int y;
-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
-                            int width) = ARGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
-    return -1;
-  }
-  // Coalesce rows.
-  if (dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBCOLORTABLEROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    ARGBColorTableRow = ARGBColorTableRow_X86;
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    ARGBColorTableRow(dst, table_argb, width);
-    dst += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Apply a color table each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int dst_x, int dst_y, int width, int height) {
-  int y;
-  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
-                           int width) = RGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
-    return -1;
-  }
-  // Coalesce rows.
-  if (dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    dst_stride_argb = 0;
-  }
-#if defined(HAS_RGBCOLORTABLEROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    RGBColorTableRow = RGBColorTableRow_X86;
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    RGBColorTableRow(dst, table_argb, width);
-    dst += dst_stride_argb;
-  }
-  return 0;
-}
-
-// ARGBQuantize is used to posterize art.
-// e.g. rgb / qvalue * qvalue + qvalue / 2
-// But the low levels implement efficiently with 3 parameters, and could be
-// used for other high level operations.
-// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
-// where scale is 1 / interval_size as a fixed point value.
-// The divide is replaces with a multiply by reciprocal fixed point multiply.
-// Caveat - although SSE2 saturates, the C function does not and should be used
-// with care if doing anything but quantization.
-LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int dst_x, int dst_y, int width, int height) {
-  int y;
-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) = ARGBQuantizeRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
-      interval_size < 1 || interval_size > 255) {
-    return -1;
-  }
-  // Coalesce rows.
-  if (dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBQUANTIZEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
-    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
-  }
-#endif
-#if defined(HAS_ARGBQUANTIZEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
-    dst += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Computes table of cumulative sum for image where the value is the sum
-// of all values above and to the left of the entry. Used by ARGBBlur.
-LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height) {
-  int y;
-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  int32* previous_cumsum = dst_cumsum;
-  if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
-    return -1;
-  }
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
-  }
-#endif
-  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
-  for (y = 0; y < height; ++y) {
-    ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
-    previous_cumsum = dst_cumsum;
-    dst_cumsum += dst_stride32_cumsum;
-    src_argb += src_stride_argb;
-  }
-  return 0;
-}
-
-// Blur ARGB image.
-// Caller should allocate CumulativeSum table of width * height * 16 bytes
-// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
-// as the buffer is treated as circular.
-LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius) {
-  int y;
-  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
-      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
-  int32* cumsum_bot_row;
-  int32* max_cumsum_bot_row;
-  int32* cumsum_top_row;
-
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  if (radius > height) {
-    radius = height;
-  }
-  if (radius > (width / 2 - 1)) {
-    radius = width / 2 - 1;
-  }
-  if (radius <= 0) {
-    return -1;
-  }
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
-    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
-  }
-#endif
-  // Compute enough CumulativeSum for first row to be blurred. After this
-  // one row of CumulativeSum is updated at a time.
-  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
-                           dst_cumsum, dst_stride32_cumsum,
-                           width, radius);
-
-  src_argb = src_argb + radius * src_stride_argb;
-  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
-
-  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
-  cumsum_top_row = &dst_cumsum[0];
-
-  for (y = 0; y < height; ++y) {
-    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
-    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
-    int area = radius * (bot_y - top_y);
-    int boxwidth = radius * 4;
-    int x;
-    int n;
-
-    // Increment cumsum_top_row pointer with circular buffer wrap around.
-    if (top_y) {
-      cumsum_top_row += dst_stride32_cumsum;
-      if (cumsum_top_row >= max_cumsum_bot_row) {
-        cumsum_top_row = dst_cumsum;
-      }
-    }
-    // Increment cumsum_bot_row pointer with circular buffer wrap around and
-    // then fill in a row of CumulativeSum.
-    if ((y + radius) < height) {
-      const int32* prev_cumsum_bot_row = cumsum_bot_row;
-      cumsum_bot_row += dst_stride32_cumsum;
-      if (cumsum_bot_row >= max_cumsum_bot_row) {
-        cumsum_bot_row = dst_cumsum;
-      }
-      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
-                              width);
-      src_argb += src_stride_argb;
-    }
-
-    // Left clipped.
-    for (x = 0; x < radius + 1; ++x) {
-      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                                boxwidth, area, &dst_argb[x * 4], 1);
-      area += (bot_y - top_y);
-      boxwidth += 4;
-    }
-
-    // Middle unclipped.
-    n = (width - 1) - radius - x + 1;
-    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                              boxwidth, area, &dst_argb[x * 4], n);
-
-    // Right clipped.
-    for (x += n; x <= width - 1; ++x) {
-      area -= (bot_y - top_y);
-      boxwidth -= 4;
-      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
-                                cumsum_bot_row + (x - radius - 1) * 4,
-                                boxwidth, area, &dst_argb[x * 4], 1);
-    }
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Multiply ARGB image by a specified ARGB value.
-LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value) {
-  int y;
-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
-                       int width, uint32 value) = ARGBShadeRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBSHADEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
-    ARGBShadeRow = ARGBShadeRow_SSE2;
-  }
-#endif
-#if defined(HAS_ARGBSHADEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBShadeRow = ARGBShadeRow_NEON;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBShadeRow(src_argb, dst_argb, width, value);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Interpolate 2 planes by specified amount (0 to 255).
-LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation) {
-  int y;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) = InterpolateRow_C;
-  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst = dst + (height - 1) * dst_stride;
-    dst_stride = -dst_stride;
-  }
-  // Coalesce rows.
-  if (src_stride0 == width &&
-      src_stride1 == width &&
-      dst_stride == width) {
-    width *= height;
-    height = 1;
-    src_stride0 = src_stride1 = dst_stride = 0;
-  }
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
-      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
-      IS_ALIGNED(width, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    InterpolateRow(dst, src0, src1 - src0,
-                   width, interpolation);
-    src0 += src_stride0;
-    src1 += src_stride1;
-    dst += dst_stride;
-  }
-  return 0;
-}
-
-// Interpolate 2 ARGB images by specified amount (0 to 255).
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation) {
-  return InterpolatePlane(src_argb0, src_stride_argb0,
-                          src_argb1, src_stride_argb1,
-                          dst_argb, dst_stride_argb,
-                          width * 4, height, interpolation);
-}
-
-// Interpolate 2 YUV images by specified amount (0 to 255).
-LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src0_y || !src0_u || !src0_v ||
-      !src1_y || !src1_u || !src1_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  InterpolatePlane(src0_y, src0_stride_y,
-                   src1_y, src1_stride_y,
-                   dst_y, dst_stride_y,
-                   width, height, interpolation);
-  InterpolatePlane(src0_u, src0_stride_u,
-                   src1_u, src1_stride_u,
-                   dst_u, dst_stride_u,
-                   halfwidth, halfheight, interpolation);
-  InterpolatePlane(src0_v, src0_stride_v,
-                   src1_v, src1_stride_v,
-                   dst_v, dst_stride_v,
-                   halfwidth, halfheight, interpolation);
-  return 0;
-}
-
-// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
-LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height) {
-  int y;
-  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
-                         const uint8* shuffler, int width) = ARGBShuffleRow_C;
-  if (!src_bgra || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
-    src_stride_bgra = -src_stride_bgra;
-  }
-  // Coalesce rows.
-  if (src_stride_bgra == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_bgra = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBShuffleRow = ARGBShuffleRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBShuffleRow = ARGBShuffleRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBShuffleRow = ARGBShuffleRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBSHUFFLEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBShuffleRow = ARGBShuffleRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
-    src_bgra += src_stride_bgra;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_argb, int dst_stride_argb,
-                        int width, int height,
-                        void (*SobelRow)(const uint8* src_sobelx,
-                                         const uint8* src_sobely,
-                                         uint8* dst, int width)) {
-  int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
-      ARGBToYJRow_C;
-  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) = SobelYRow_C;
-  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobely, int width) =
-      SobelXRow_C;
-  const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
-  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-
-#if defined(HAS_ARGBTOYJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYJRow = ARGBToYJRow_NEON;
-    }
-  }
-#endif
-
-#if defined(HAS_SOBELYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SobelYRow = SobelYRow_SSE2;
-  }
-#endif
-#if defined(HAS_SOBELYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SobelYRow = SobelYRow_NEON;
-  }
-#endif
-#if defined(HAS_SOBELXROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SobelXRow = SobelXRow_SSE2;
-  }
-#endif
-#if defined(HAS_SOBELXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SobelXRow = SobelXRow_NEON;
-  }
-#endif
-  {
-    // 3 rows with edges before/after.
-    const int kRowSize = (width + kEdge + 31) & ~31;
-    align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
-    uint8* row_sobelx = rows;
-    uint8* row_sobely = rows + kRowSize;
-    uint8* row_y = rows + kRowSize * 2;
-
-    // Convert first row.
-    uint8* row_y0 = row_y + kEdge;
-    uint8* row_y1 = row_y0 + kRowSize;
-    uint8* row_y2 = row_y1 + kRowSize;
-    ARGBToYJRow(src_argb, row_y0, width);
-    row_y0[-1] = row_y0[0];
-    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
-    ARGBToYJRow(src_argb, row_y1, width);
-    row_y1[-1] = row_y1[0];
-    memset(row_y1 + width, row_y1[width - 1], 16);
-    memset(row_y2 + width, 0, 16);
-
-    for (y = 0; y < height; ++y) {
-      // Convert next row of ARGB to G.
-      if (y < (height - 1)) {
-        src_argb += src_stride_argb;
-      }
-      ARGBToYJRow(src_argb, row_y2, width);
-      row_y2[-1] = row_y2[0];
-      row_y2[width] = row_y2[width - 1];
-
-      SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
-      SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
-      SobelRow(row_sobelx, row_sobely, dst_argb, width);
-
-      // Cycle thru circular queue of 3 row_y buffers.
-      {
-        uint8* row_yt = row_y0;
-        row_y0 = row_y1;
-        row_y1 = row_y2;
-        row_y2 = row_yt;
-      }
-
-      dst_argb += dst_stride_argb;
-    }
-    free_aligned_buffer_64(rows);
-  }
-  return 0;
-}
-
-// Sobel ARGB effect.
-LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) = SobelRow_C;
-#if defined(HAS_SOBELROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SobelRow = SobelRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      SobelRow = SobelRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SOBELROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SobelRow = SobelRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      SobelRow = SobelRow_NEON;
-    }
-  }
-#endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-                      width, height, SobelRow);
-}
-
-// Sobel ARGB effect with planar output.
-LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height) {
-  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_, int width) = SobelToPlaneRow_C;
-#if defined(HAS_SOBELTOPLANEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      SobelToPlaneRow = SobelToPlaneRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SOBELTOPLANEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      SobelToPlaneRow = SobelToPlaneRow_NEON;
-    }
-  }
-#endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                      width, height, SobelToPlaneRow);
-}
-
-// SobelXY ARGB effect.
-// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
-LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
-  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) = SobelXYRow_C;
-#if defined(HAS_SOBELXYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SobelXYRow = SobelXYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      SobelXYRow = SobelXYRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SOBELXYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SobelXYRow = SobelXYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      SobelXYRow = SobelXYRow_NEON;
-    }
-  }
-#endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-                      width, height, SobelXYRow);
-}
-
-// Apply a 4x4 polynomial to each ARGB pixel.
-LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
-                   const float* poly,
-                   int width, int height) {
-  int y;
-  void (*ARGBPolynomialRow)(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) = ARGBPolynomialRow_C;
-  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
-    ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
-  }
-#endif
-#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
-      IS_ALIGNED(width, 2)) {
-    ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBPolynomialRow(src_argb, dst_argb, poly, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Apply a lumacolortable to each ARGB pixel.
-LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma,
-                       int width, int height) {
-  int y;
-  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
-      int width, const uint8* luma, const uint32 lumacoeff) =
-      ARGBLumaColorTableRow_C;
-  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
-    ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Copy Alpha from one ARGB image to another.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
-  int y;
-  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBCopyAlphaRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBCopyAlphaRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Copy a planar Y channel to the alpha channel of a destination ARGB image.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height) {
-  int y;
-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      ARGBCopyYToAlphaRow_C;
-  if (!src_y || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_stride_y = -src_stride_y;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_argb = 0;
-  }
-#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBCopyYToAlphaRow(src_y, dst_argb, width);
-    src_y += src_stride_y;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// TODO(fbarchard): Consider if width is even Y channel can be split
-// directly. A SplitUVRow_Odd function could copy the remaining chroma.
-
-LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) = InterpolateRow_C;
-  if (!src_yuy2 ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
-    src_stride_yuy2 = -src_stride_yuy2;
-  }
-#if defined(HAS_SPLITUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SplitUVRow = SplitUVRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      SplitUVRow = SplitUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-
-  {
-    int awidth = halfwidth * 2;
-    // row of y and 2 rows of uv
-    align_buffer_64(rows, awidth * 3);
-
-    for (y = 0; y < height - 1; y += 2) {
-      // Split Y from UV.
-      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
-      memcpy(dst_y, rows, width);
-      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
-      memcpy(dst_y + dst_stride_y, rows, width);
-      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
-      src_yuy2 += src_stride_yuy2 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
-    }
-    if (height & 1) {
-      // Split Y from UV.
-      SplitUVRow(src_yuy2, rows, dst_uv, awidth);
-      memcpy(dst_y, rows, width);
-    }
-    free_aligned_buffer_64(rows);
-  }
-  return 0;
-}
-
-LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) = InterpolateRow_C;
-  if (!src_uyvy ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
-    src_stride_uyvy = -src_stride_uyvy;
-  }
-#if defined(HAS_SPLITUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SplitUVRow = SplitUVRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      SplitUVRow = SplitUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-
-  {
-    int awidth = halfwidth * 2;
-    // row of y and 2 rows of uv
-    align_buffer_64(rows, awidth * 3);
-
-    for (y = 0; y < height - 1; y += 2) {
-      // Split Y from UV.
-      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
-      memcpy(dst_y, rows, width);
-      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
-      memcpy(dst_y + dst_stride_y, rows, width);
-      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
-      src_uyvy += src_stride_uyvy * 2;
-      dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
-    }
-    if (height & 1) {
-      // Split Y from UV.
-      SplitUVRow(src_uyvy, dst_uv, rows, awidth);
-      memcpy(dst_y, rows, width);
-    }
-    free_aligned_buffer_64(rows);
-  }
-  return 0;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate.cc
deleted file mode 100755
index 01ea5c4..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate.cc
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/convert.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
-  int i = height;
-  void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
-#if defined(HAS_TRANSPOSEWX8_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    TransposeWx8 = TransposeWx8_NEON;
-  }
-#endif
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    TransposeWx8 = TransposeWx8_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      TransposeWx8 = TransposeWx8_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      TransposeWx8 = TransposeWx8_Fast_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_TRANSPOSEWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-      TransposeWx8 = TransposeWx8_Fast_DSPR2;
-    } else {
-      TransposeWx8 = TransposeWx8_DSPR2;
-    }
-  }
-#endif
-
-  // Work across the source in 8x8 tiles
-  while (i >= 8) {
-    TransposeWx8(src, src_stride, dst, dst_stride, width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst += 8;                 // Move over 8 columns.
-    i -= 8;
-  }
-
-  if (i > 0) {
-    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
-  }
-}
-
-LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
-  // Rotate by 90 is a transpose with the source read
-  // from bottom to top. So set the source pointer to the end
-  // of the buffer and flip the sign of the source stride.
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-  TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
-LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
-  // Rotate by 270 is a transpose with the destination written
-  // from bottom to top. So set the destination pointer to the end
-  // of the buffer and flip the sign of the destination stride.
-  dst += dst_stride * (width - 1);
-  dst_stride = -dst_stride;
-  TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
-LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
-  // Swap first and last row and mirror the content. Uses a temporary row.
-  align_buffer_64(row, width);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
-  int half_height = (height + 1) >> 1;
-  int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_MIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MirrorRow = MirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    MirrorRow = MirrorRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MirrorRow = MirrorRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      MirrorRow = MirrorRow_AVX2;
-    }
-  }
-#endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
-    MirrorRow = MirrorRow_DSPR2;
-  }
-#endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
-  }
-#endif
-#if defined(HAS_COPYROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
-
-  // Odd height will harmlessly mirror the middle row twice.
-  for (y = 0; y < half_height; ++y) {
-    MirrorRow(src, row, width);  // Mirror first row into a buffer
-    src += src_stride;
-    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
-    dst += dst_stride;
-    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
-    src_bot -= src_stride;
-    dst_bot -= dst_stride;
-  }
-  free_aligned_buffer_64(row);
-}
-
-LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
-  int i = height;
-  void (*TransposeUVWx8)(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int width) = TransposeUVWx8_C;
-#if defined(HAS_TRANSPOSEUVWX8_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    TransposeUVWx8 = TransposeUVWx8_NEON;
-  }
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      TransposeUVWx8 = TransposeUVWx8_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    TransposeUVWx8 = TransposeUVWx8_DSPR2;
-  }
-#endif
-
-  // Work through the source in 8x8 tiles.
-  while (i >= 8) {
-    TransposeUVWx8(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
-                   width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst_a += 8;               // Move over 8 columns.
-    dst_b += 8;               // Move over 8 columns.
-    i -= 8;
-  }
-
-  if (i > 0) {
-    TransposeUVWxH_C(src, src_stride,
-                     dst_a, dst_stride_a,
-                     dst_b, dst_stride_b,
-                     width, i);
-  }
-}
-
-LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height) {
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
-}
-
-LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
-  dst_a += dst_stride_a * (width - 1);
-  dst_b += dst_stride_b * (width - 1);
-  dst_stride_a = -dst_stride_a;
-  dst_stride_b = -dst_stride_b;
-
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
-}
-
-// Rotate 180 is a horizontal and vertical flip.
-LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
-  int i;
-  void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
-      MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    MirrorUVRow = MirrorUVRow_NEON;
-  }
-#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
-    MirrorUVRow = MirrorUVRow_SSSE3;
-  }
-#endif
-#if defined(HAS_MIRRORUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    MirrorUVRow = MirrorUVRow_DSPR2;
-  }
-#endif
-
-  dst_a += dst_stride_a * (height - 1);
-  dst_b += dst_stride_b * (height - 1);
-
-  for (i = 0; i < height; ++i) {
-    MirrorUVRow(src, dst_a, dst_b, width);
-    src += src_stride;
-    dst_a -= dst_stride_a;
-    dst_b -= dst_stride_b;
-  }
-}
-
-LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int width, int height,
-                enum RotationMode mode) {
-  if (!src || width <= 0 || height == 0 || !dst) {
-    return -1;
-  }
-
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src = src + (height - 1) * src_stride;
-    src_stride = -src_stride;
-  }
-
-  switch (mode) {
-    case kRotate0:
-      // copy frame
-      CopyPlane(src, src_stride,
-                dst, dst_stride,
-                width, height);
-      return 0;
-    case kRotate90:
-      RotatePlane90(src, src_stride,
-                    dst, dst_stride,
-                    width, height);
-      return 0;
-    case kRotate270:
-      RotatePlane270(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
-      return 0;
-    case kRotate180:
-      RotatePlane180(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
-      return 0;
-    default:
-      break;
-  }
-  return -1;
-}
-
-LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height,
-               enum RotationMode mode) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
-    return -1;
-  }
-
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-
-  switch (mode) {
-    case kRotate0:
-      // copy frame
-      return I420Copy(src_y, src_stride_y,
-                      src_u, src_stride_u,
-                      src_v, src_stride_v,
-                      dst_y, dst_stride_y,
-                      dst_u, dst_stride_u,
-                      dst_v, dst_stride_v,
-                      width, height);
-    case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotatePlane90(src_u, src_stride_u,
-                    dst_u, dst_stride_u,
-                    halfwidth, halfheight);
-      RotatePlane90(src_v, src_stride_v,
-                    dst_v, dst_stride_v,
-                    halfwidth, halfheight);
-      return 0;
-    case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane270(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane270(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
-      return 0;
-    case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane180(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane180(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
-      return 0;
-    default:
-      break;
-  }
-  return -1;
-}
-
-LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height,
-                     enum RotationMode mode) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
-    return -1;
-  }
-
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
-    src_stride_y = -src_stride_y;
-    src_stride_uv = -src_stride_uv;
-  }
-
-  switch (mode) {
-    case kRotate0:
-      // copy frame
-      return NV12ToI420(src_y, src_stride_y,
-                        src_uv, src_stride_uv,
-                        dst_y, dst_stride_y,
-                        dst_u, dst_stride_u,
-                        dst_v, dst_stride_v,
-                        width, height);
-    case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotateUV90(src_uv, src_stride_uv,
-                 dst_u, dst_stride_u,
-                 dst_v, dst_stride_v,
-                 halfwidth, halfheight);
-      return 0;
-    case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV270(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
-      return 0;
-    case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV180(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
-      return 0;
-    default:
-      break;
-  }
-  return -1;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_any.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_any.cc
deleted file mode 100755
index 31a74c3..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_any.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-#include "libyuv/rotate_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                 uint8* dst, int dst_stride, int width) {                      \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
-      }                                                                        \
-      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
-    }
-
-#ifdef HAS_TRANSPOSEWX8_NEON
-TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
-#endif
-#ifdef HAS_TRANSPOSEWX8_SSSE3
-TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
-#endif
-#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
-TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
-#endif
-#ifdef HAS_TRANSPOSEWX8_DSPR2
-TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
-#endif
-#undef TANY
-
-#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                uint8* dst_a, int dst_stride_a,                                \
-                uint8* dst_b, int dst_stride_b, int width) {                   \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
-                  n);                                                          \
-      }                                                                        \
-      TransposeUVWx8_C(src + n * 2, src_stride,                                \
-                       dst_a + n * dst_stride_a, dst_stride_a,                 \
-                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
-    }
-
-#ifdef HAS_TRANSPOSEUVWX8_NEON
-TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
-#endif
-#ifdef HAS_TRANSPOSEUVWX8_SSE2
-TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
-#endif
-#ifdef HAS_TRANSPOSEUVWX8_DSPR2
-TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
-#endif
-#undef TUVANY
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-
-
-
-
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_argb.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_argb.cc
deleted file mode 100755
index 787c0ad..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_argb.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/convert.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// ARGBScale has a function to copy pixels to a row, striding each source
-// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx, uint8* dst_ptr, int dst_width);
-
-static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride, int width, int height) {
-  int i;
-  int src_pixel_step = src_stride >> 2;
-  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
-      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
-  }
-#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
-  }
-#endif
-
-  for (i = 0; i < width; ++i) {  // column of source to row of dest.
-    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
-    dst += dst_stride;
-    src += 4;
-  }
-}
-
-void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride, int width, int height) {
-  // Rotate by 90 is a ARGBTranspose with the source read
-  // from bottom to top. So set the source pointer to the end
-  // of the buffer and flip the sign of the source stride.
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
-}
-
-void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height) {
-  // Rotate by 270 is a ARGBTranspose with the destination written
-  // from bottom to top. So set the destination pointer to the end
-  // of the buffer and flip the sign of the destination stride.
-  dst += dst_stride * (width - 1);
-  dst_stride = -dst_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
-}
-
-void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
-  // Swap first and last row and mirror the content. Uses a temporary row.
-  align_buffer_64(row, width * 4);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
-  int half_height = (height + 1) >> 1;
-  int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
-      ARGBMirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_ARGBMIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBMirrorRow = ARGBMirrorRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBMirrorRow = ARGBMirrorRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBMirrorRow = ARGBMirrorRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
-  }
-#endif
-#if defined(HAS_COPYROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
-
-  // Odd height will harmlessly mirror the middle row twice.
-  for (y = 0; y < half_height; ++y) {
-    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
-    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
-    CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
-    src += src_stride;
-    dst += dst_stride;
-    src_bot -= src_stride;
-    dst_bot -= dst_stride;
-  }
-  free_aligned_buffer_64(row);
-}
-
-LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb, int width, int height,
-               enum RotationMode mode) {
-  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
-    return -1;
-  }
-
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-
-  switch (mode) {
-    case kRotate0:
-      // copy frame
-      return ARGBCopy(src_argb, src_stride_argb,
-                      dst_argb, dst_stride_argb,
-                      width, height);
-    case kRotate90:
-      ARGBRotate90(src_argb, src_stride_argb,
-                   dst_argb, dst_stride_argb,
-                   width, height);
-      return 0;
-    case kRotate270:
-      ARGBRotate270(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
-      return 0;
-    case kRotate180:
-      ARGBRotate180(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
-      return 0;
-    default:
-      break;
-  }
-  return -1;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_common.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_common.cc
deleted file mode 100755
index b33a9a0..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_common.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst[0] = src[0 * src_stride];
-    dst[1] = src[1 * src_stride];
-    dst[2] = src[2 * src_stride];
-    dst[3] = src[3 * src_stride];
-    dst[4] = src[4 * src_stride];
-    dst[5] = src[5 * src_stride];
-    dst[6] = src[6 * src_stride];
-    dst[7] = src[7 * src_stride];
-    ++src;
-    dst += dst_stride;
-  }
-}
-
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst_a[0] = src[0 * src_stride + 0];
-    dst_b[0] = src[0 * src_stride + 1];
-    dst_a[1] = src[1 * src_stride + 0];
-    dst_b[1] = src[1 * src_stride + 1];
-    dst_a[2] = src[2 * src_stride + 0];
-    dst_b[2] = src[2 * src_stride + 1];
-    dst_a[3] = src[3 * src_stride + 0];
-    dst_b[3] = src[3 * src_stride + 1];
-    dst_a[4] = src[4 * src_stride + 0];
-    dst_b[4] = src[4 * src_stride + 1];
-    dst_a[5] = src[5 * src_stride + 0];
-    dst_b[5] = src[5 * src_stride + 1];
-    dst_a[6] = src[6 * src_stride + 0];
-    dst_b[6] = src[6 * src_stride + 1];
-    dst_a[7] = src[7 * src_stride + 0];
-    dst_b[7] = src[7 * src_stride + 1];
-    src += 2;
-    dst_a += dst_stride_a;
-    dst_b += dst_stride_b;
-  }
-}
-
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int j;
-    for (j = 0; j < height; ++j) {
-      dst[i * dst_stride + j] = src[j * src_stride + i];
-    }
-  }
-}
-
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height) {
-  int i;
-  for (i = 0; i < width * 2; i += 2) {
-    int j;
-    for (j = 0; j < height; ++j) {
-      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
-      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
-    }
-  }
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_gcc.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_gcc.cc
deleted file mode 100755
index cbe870c..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_gcc.cc
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
-
-// Transpose 16x8. 64 bit
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm9                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "palignr    $0x8,%%xmm9,%%xmm9               \n"
-    "movdqu     (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm10                   \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm10                   \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movdqa     %%xmm10,%%xmm11                  \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "movdqu     (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm12                   \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm12                   \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movdqa     %%xmm12,%%xmm13                  \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movdqu     (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm14                   \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "punpckhbw  %%xmm7,%%xmm14                   \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "movdqa     %%xmm14,%%xmm15                  \n"
-    "lea        0x10(%0,%3,8),%0                 \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "punpcklwd  %%xmm10,%%xmm8                   \n"
-    "punpcklwd  %%xmm11,%%xmm9                   \n"
-    "movdqa     %%xmm8,%%xmm10                   \n"
-    "movdqa     %%xmm9,%%xmm11                   \n"
-    "palignr    $0x8,%%xmm10,%%xmm10             \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "punpcklwd  %%xmm14,%%xmm12                  \n"
-    "punpcklwd  %%xmm15,%%xmm13                  \n"
-    "movdqa     %%xmm12,%%xmm14                  \n"
-    "movdqa     %%xmm13,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm12,%%xmm8                   \n"
-    "movq       %%xmm8,(%1)                      \n"
-    "movdqa     %%xmm8,%%xmm12                   \n"
-    "palignr    $0x8,%%xmm12,%%xmm12             \n"
-    "movq       %%xmm12,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm14,%%xmm10                  \n"
-    "movdqa     %%xmm10,%%xmm14                  \n"
-    "movq       %%xmm10,(%1)                     \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "punpckldq  %%xmm13,%%xmm9                   \n"
-    "movq       %%xmm14,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm9,%%xmm13                   \n"
-    "movq       %%xmm9,(%1)                      \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movq       %%xmm13,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm15,%%xmm11                  \n"
-    "movq       %%xmm11,(%1)                     \n"
-    "movdqa     %%xmm11,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "sub        $0x10,%2                         \n"
-    "movq       %%xmm15,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-  );
-}
-#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-
-// Transpose UV 8x8.  64 bit.
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%4),%%xmm1                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm1                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqu     (%0,%4),%%xmm3                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm3                    \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "movdqu     (%0,%4),%%xmm5                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm5                    \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "movdqu     (%0,%4),%%xmm7                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm8                    \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %4                               \n"
-    "lea        0x10(%0,%4,8),%0                 \n"
-    "punpckhbw  %%xmm7,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm7                    \n"
-    "neg        %4                               \n"
-     // Second round of bit swap.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "movdqa     %%xmm1,%%xmm9                    \n"
-    "punpckhwd  %%xmm2,%%xmm8                    \n"
-    "punpckhwd  %%xmm3,%%xmm9                    \n"
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm2                    \n"
-    "movdqa     %%xmm9,%%xmm3                    \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "movdqa     %%xmm5,%%xmm9                    \n"
-    "punpckhwd  %%xmm6,%%xmm8                    \n"
-    "punpckhwd  %%xmm7,%%xmm9                    \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm8,%%xmm6                    \n"
-    "movdqa     %%xmm9,%%xmm7                    \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-    "punpckhdq  %%xmm4,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movlpd     %%xmm2,(%1)                      \n"
-    "movhpd     %%xmm2,(%2)                      \n"
-    "punpckhdq  %%xmm6,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm1,%%xmm8                    \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movlpd     %%xmm1,(%1)                      \n"
-    "movhpd     %%xmm1,(%2)                      \n"
-    "punpckhdq  %%xmm5,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm3,%%xmm8                    \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movlpd     %%xmm3,(%1)                      \n"
-    "movhpd     %%xmm3,(%2)                      \n"
-    "punpckhdq  %%xmm7,%%xmm8                    \n"
-    "sub        $0x8,%3                          \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst_a),  // %1
-      "+r"(dst_b),  // %2
-      "+r"(width)   // %3
-    : "r"((intptr_t)(src_stride)),    // %4
-      "r"((intptr_t)(dst_stride_a)),  // %5
-      "r"((intptr_t)(dst_stride_b))   // %6
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9"
-  );
-}
-#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
-#endif  // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_mips.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_mips.cc
deleted file mode 100755
index 23e89fb..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_mips.cc
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-   __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
-      "addu             $t3, $t2, %[src_stride]          \n"
-      "addu             $t5, $t4, %[src_stride]          \n"
-      "addu             $t6, $t2, $t4                    \n"
-      "andi             $t0, %[dst], 0x3                 \n"
-      "andi             $t1, %[dst_stride], 0x3          \n"
-      "or               $t0, $t0, $t1                    \n"
-      "bnez             $t0, 11f                         \n"
-      " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-    "1:                                                  \n"
-      "lbu              $t0, 0(%[src])                   \n"
-      "lbux             $t1, %[src_stride](%[src])       \n"
-      "lbux             $t8, $t2(%[src])                 \n"
-      "lbux             $t9, $t3(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s0, $t8, $t0                    \n"
-      "lbux             $t0, $t4(%[src])                 \n"
-      "lbux             $t1, $t5(%[src])                 \n"
-      "lbux             $t8, $t6(%[src])                 \n"
-      "lbux             $t9, $t7(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s1, $t8, $t0                    \n"
-      "sw               $s0, 0(%[dst])                   \n"
-      "addiu            %[width], -1                     \n"
-      "addiu            %[src], 1                        \n"
-      "sw               $s1, 4(%[dst])                   \n"
-      "bnez             %[width], 1b                     \n"
-      " addu            %[dst], %[dst], %[dst_stride]    \n"
-      "b                2f                               \n"
-//dst + dst_stride unaligned
-   "11:                                                  \n"
-      "lbu              $t0, 0(%[src])                   \n"
-      "lbux             $t1, %[src_stride](%[src])       \n"
-      "lbux             $t8, $t2(%[src])                 \n"
-      "lbux             $t9, $t3(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s0, $t8, $t0                    \n"
-      "lbux             $t0, $t4(%[src])                 \n"
-      "lbux             $t1, $t5(%[src])                 \n"
-      "lbux             $t8, $t6(%[src])                 \n"
-      "lbux             $t9, $t7(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s1, $t8, $t0                    \n"
-      "swr              $s0, 0(%[dst])                   \n"
-      "swl              $s0, 3(%[dst])                   \n"
-      "addiu            %[width], -1                     \n"
-      "addiu            %[src], 1                        \n"
-      "swr              $s1, 4(%[dst])                   \n"
-      "swl              $s1, 7(%[dst])                   \n"
-      "bnez             %[width], 11b                    \n"
-       "addu             %[dst], %[dst], %[dst_stride]   \n"
-    "2:                                                  \n"
-      ".set pop                                          \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1",  "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1"
-  );
-}
-
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                                  uint8* dst, int dst_stride, int width) {
-  __asm__ __volatile__ (
-      ".set noat                                         \n"
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "beqz             %[width], 2f                     \n"
-      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
-      "addu             $t3, $t2, %[src_stride]          \n"
-      "addu             $t5, $t4, %[src_stride]          \n"
-      "addu             $t6, $t2, $t4                    \n"
-
-      "srl              $AT, %[width], 0x2               \n"
-      "andi             $t0, %[dst], 0x3                 \n"
-      "andi             $t1, %[dst_stride], 0x3          \n"
-      "or               $t0, $t0, $t1                    \n"
-      "bnez             $t0, 11f                         \n"
-      " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-      "1:                                                \n"
-      "lw               $t0, 0(%[src])                   \n"
-      "lwx              $t1, %[src_stride](%[src])       \n"
-      "lwx              $t8, $t2(%[src])                 \n"
-      "lwx              $t9, $t3(%[src])                 \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
-
-      "precr.qb.ph     $s4, $s1, $s0                     \n"
-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
-      "precr.qb.ph     $s6, $s3, $s2                     \n"
-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
-
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
-
-      "lwx              $t0, $t4(%[src])                 \n"
-      "lwx              $t1, $t5(%[src])                 \n"
-      "lwx              $t8, $t6(%[src])                 \n"
-      "lwx              $t9, $t7(%[src])                 \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
-
-      "precr.qb.ph     $t0, $s1, $s0                     \n"
-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
-      "precr.qb.ph     $t8, $s3, $s2                     \n"
-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
-
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
-
-      "addu            $s0, %[dst], %[dst_stride]        \n"
-      "addu            $s1, $s0, %[dst_stride]           \n"
-      "addu            $s2, $s1, %[dst_stride]           \n"
-
-      "sw              $s4, 0(%[dst])                    \n"
-      "sw              $t0, 4(%[dst])                    \n"
-      "sw              $s6, 0($s0)                       \n"
-      "sw              $t8, 4($s0)                       \n"
-      "sw              $s5, 0($s1)                       \n"
-      "sw              $t1, 4($s1)                       \n"
-      "sw              $s7, 0($s2)                       \n"
-      "sw              $t9, 4($s2)                       \n"
-
-      "addiu            $AT, -1                          \n"
-      "addiu            %[src], 4                        \n"
-
-      "bnez             $AT, 1b                          \n"
-      " addu            %[dst], $s2, %[dst_stride]       \n"
-      "b                2f                               \n"
-//dst + dst_stride unaligned
-      "11:                                               \n"
-      "lw               $t0, 0(%[src])                   \n"
-      "lwx              $t1, %[src_stride](%[src])       \n"
-      "lwx              $t8, $t2(%[src])                 \n"
-      "lwx              $t9, $t3(%[src])                 \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
-
-      "precr.qb.ph     $s4, $s1, $s0                     \n"
-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
-      "precr.qb.ph     $s6, $s3, $s2                     \n"
-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
-
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
-
-      "lwx              $t0, $t4(%[src])                 \n"
-      "lwx              $t1, $t5(%[src])                 \n"
-      "lwx              $t8, $t6(%[src])                 \n"
-      "lwx              $t9, $t7(%[src])                 \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
-
-      "precr.qb.ph     $t0, $s1, $s0                     \n"
-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
-      "precr.qb.ph     $t8, $s3, $s2                     \n"
-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
-
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
-
-      "addu            $s0, %[dst], %[dst_stride]        \n"
-      "addu            $s1, $s0, %[dst_stride]           \n"
-      "addu            $s2, $s1, %[dst_stride]           \n"
-
-      "swr              $s4, 0(%[dst])                   \n"
-      "swl              $s4, 3(%[dst])                   \n"
-      "swr              $t0, 4(%[dst])                   \n"
-      "swl              $t0, 7(%[dst])                   \n"
-      "swr              $s6, 0($s0)                      \n"
-      "swl              $s6, 3($s0)                      \n"
-      "swr              $t8, 4($s0)                      \n"
-      "swl              $t8, 7($s0)                      \n"
-      "swr              $s5, 0($s1)                      \n"
-      "swl              $s5, 3($s1)                      \n"
-      "swr              $t1, 4($s1)                      \n"
-      "swl              $t1, 7($s1)                      \n"
-      "swr              $s7, 0($s2)                      \n"
-      "swl              $s7, 3($s2)                      \n"
-      "swr              $t9, 4($s2)                      \n"
-      "swl              $t9, 7($s2)                      \n"
-
-      "addiu            $AT, -1                          \n"
-      "addiu            %[src], 4                        \n"
-
-      "bnez             $AT, 11b                         \n"
-      " addu            %[dst], $s2, %[dst_stride]       \n"
-      "2:                                                \n"
-      ".set pop                                          \n"
-      ".set at                                           \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
-  );
-}
-
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                               uint8* dst_a, int dst_stride_a,
-                               uint8* dst_b, int dst_stride_b,
-                               int width) {
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "beqz            %[width], 2f                      \n"
-      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
-      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
-      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
-      "addu            $t3, $t2, %[src_stride]           \n"
-      "addu            $t5, $t4, %[src_stride]           \n"
-      "addu            $t6, $t2, $t4                     \n"
-      "subu            $t7, $t9, %[src_stride]           \n"
-      "srl             $t1, %[width], 1                  \n"
-
-// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
-      "andi            $t0, %[dst_a], 0x3                \n"
-      "andi            $t8, %[dst_b], 0x3                \n"
-      "or              $t0, $t0, $t8                     \n"
-      "andi            $t8, %[dst_stride_a], 0x3         \n"
-      "andi            $s5, %[dst_stride_b], 0x3         \n"
-      "or              $t8, $t8, $s5                     \n"
-      "or              $t0, $t0, $t8                     \n"
-      "bnez            $t0, 11f                          \n"
-      " nop                                              \n"
-// dst + dst_stride word aligned (both, a & b dst addresses)
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
-
-      "sw              $s3, 0($s5)                       \n"
-      "sw              $s4, 0($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
-
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
-      "sw              $s3, 0(%[dst_a])                  \n"
-      "sw              $s4, 0(%[dst_b])                  \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
-      "sw              $s3, 4($s5)                       \n"
-      "sw              $s4, 4($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
-
-      "addiu           %[src], 4                         \n"
-      "addiu           $t1, -1                           \n"
-      "sll             $t0, %[dst_stride_a], 1           \n"
-      "sll             $t8, %[dst_stride_b], 1           \n"
-      "sw              $s3, 4(%[dst_a])                  \n"
-      "sw              $s4, 4(%[dst_b])                  \n"
-      "addu            %[dst_a], %[dst_a], $t0           \n"
-      "bnez            $t1, 1b                           \n"
-      " addu           %[dst_b], %[dst_b], $t8           \n"
-      "b               2f                                \n"
-      " nop                                              \n"
-
-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
-   "11:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
-
-      "swr             $s3, 0($s5)                       \n"
-      "swl             $s3, 3($s5)                       \n"
-      "swr             $s4, 0($s6)                       \n"
-      "swl             $s4, 3($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
-
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
-      "swr             $s3, 0(%[dst_a])                  \n"
-      "swl             $s3, 3(%[dst_a])                  \n"
-      "swr             $s4, 0(%[dst_b])                  \n"
-      "swl             $s4, 3(%[dst_b])                  \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
-
-      "swr             $s3, 4($s5)                       \n"
-      "swl             $s3, 7($s5)                       \n"
-      "swr             $s4, 4($s6)                       \n"
-      "swl             $s4, 7($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
-
-      "addiu           %[src], 4                         \n"
-      "addiu           $t1, -1                           \n"
-      "sll             $t0, %[dst_stride_a], 1           \n"
-      "sll             $t8, %[dst_stride_b], 1           \n"
-      "swr             $s3, 4(%[dst_a])                  \n"
-      "swl             $s3, 7(%[dst_a])                  \n"
-      "swr             $s4, 4(%[dst_b])                  \n"
-      "swl             $s4, 7(%[dst_b])                  \n"
-      "addu            %[dst_a], %[dst_a], $t0           \n"
-      "bnez            $t1, 11b                          \n"
-      " addu           %[dst_b], %[dst_b], $t8           \n"
-
-      "2:                                                \n"
-      ".set pop                                          \n"
-      : [src] "+r" (src),
-        [dst_a] "+r" (dst_a),
-        [dst_b] "+r" (dst_b),
-        [width] "+r" (width),
-        [src_stride] "+r" (src_stride)
-      : [dst_stride_a] "r" (dst_stride_a),
-        [dst_stride_b] "r" (dst_stride_b)
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3",
-        "s4", "s5", "s6"
-  );
-}
-
-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_neon.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_neon.cc
deleted file mode 100755
index 9e4ecd8..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_neon.cc
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-    !defined(__aarch64__)
-
-static uvec8 kVTbl4x4Transpose =
-  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
-
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
-                       int width) {
-  const uint8* src_temp = NULL;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %5, #8                        \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
-      "mov         %0, %1                      \n"
-
-      MEMACCESS(0)
-      "vld1.8      {d0}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d1}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d2}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d3}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d4}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d5}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d6}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d7}, [%0]                  \n"
-
-      "vtrn.8      d1, d0                      \n"
-      "vtrn.8      d3, d2                      \n"
-      "vtrn.8      d5, d4                      \n"
-      "vtrn.8      d7, d6                      \n"
-
-      "vtrn.16     d1, d3                      \n"
-      "vtrn.16     d0, d2                      \n"
-      "vtrn.16     d5, d7                      \n"
-      "vtrn.16     d4, d6                      \n"
-
-      "vtrn.32     d1, d5                      \n"
-      "vtrn.32     d0, d4                      \n"
-      "vtrn.32     d3, d7                      \n"
-      "vtrn.32     d2, d6                      \n"
-
-      "vrev16.8    q0, q0                      \n"
-      "vrev16.8    q1, q1                      \n"
-      "vrev16.8    q2, q2                      \n"
-      "vrev16.8    q3, q3                      \n"
-
-      "mov         %0, %3                      \n"
-
-    MEMACCESS(0)
-      "vst1.8      {d1}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d0}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d3}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d2}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d5}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d4}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d7}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d6}, [%0]                  \n"
-
-      "add         %1, #8                      \n"  // src += 8
-      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
-      "subs        %5,  #8                     \n"  // w   -= 8
-      "bge         1b                          \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %5, #8                        \n"
-    "beq         4f                            \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
-
-    "cmp         %5, #4                        \n"
-    "blt         2f                            \n"
-
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[1]}, [%0]                 \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(6)
-    "vld1.8      {q3}, [%6]                    \n"
-
-    "vtbl.8      d4, {d0, d1}, d6              \n"
-    "vtbl.8      d5, {d0, d1}, d7              \n"
-    "vtbl.8      d0, {d2, d3}, d6              \n"
-    "vtbl.8      d1, {d2, d3}, d7              \n"
-
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "vst1.32     {d4[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d4[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[1]}, [%0]                 \n"
-
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[1]}, [%0]                 \n"
-
-    "add         %1, #4                        \n"  // src += 4
-    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
-    "subs        %5,  #4                       \n"  // w   -= 4
-    "beq         4f                            \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
-
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[3]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[3]}, [%0]                 \n"
-
-    "vtrn.8      d0, d1                        \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0]                    \n"
-
-    "add         %1, #2                        \n"  // src += 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
-    "subs        %5,  #2                       \n"  // w   -= 2
-    "beq         4f                            \n"
-
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[0]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[1]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[2]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[3]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[4]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[5]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[6]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[7]}, [%1]                 \n"
-
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
-
-    "4:                                        \n"
-
-    : "+r"(src_temp),          // %0
-      "+r"(src),               // %1
-      "+r"(src_stride),        // %2
-      "+r"(dst),               // %3
-      "+r"(dst_stride),        // %4
-      "+r"(width)              // %5
-    : "r"(&kVTbl4x4Transpose)  // %6
-    : "memory", "cc", "q0", "q1", "q2", "q3"
-  );
-}
-
-static uvec8 kVTbl4x4TransposeDi =
-  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
-
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int width) {
-  const uint8* src_temp = NULL;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %7, #8                        \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
-      "mov         %0, %1                      \n"
-
-      MEMACCESS(0)
-      "vld2.8      {d0,  d1},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d2,  d3},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d4,  d5},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d6,  d7},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d16, d17}, [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d18, d19}, [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d20, d21}, [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d22, d23}, [%0]            \n"
-
-      "vtrn.8      q1, q0                      \n"
-      "vtrn.8      q3, q2                      \n"
-      "vtrn.8      q9, q8                      \n"
-      "vtrn.8      q11, q10                    \n"
-
-      "vtrn.16     q1, q3                      \n"
-      "vtrn.16     q0, q2                      \n"
-      "vtrn.16     q9, q11                     \n"
-      "vtrn.16     q8, q10                     \n"
-
-      "vtrn.32     q1, q9                      \n"
-      "vtrn.32     q0, q8                      \n"
-      "vtrn.32     q3, q11                     \n"
-      "vtrn.32     q2, q10                     \n"
-
-      "vrev16.8    q0, q0                      \n"
-      "vrev16.8    q1, q1                      \n"
-      "vrev16.8    q2, q2                      \n"
-      "vrev16.8    q3, q3                      \n"
-      "vrev16.8    q8, q8                      \n"
-      "vrev16.8    q9, q9                      \n"
-      "vrev16.8    q10, q10                    \n"
-      "vrev16.8    q11, q11                    \n"
-
-      "mov         %0, %3                      \n"
-
-    MEMACCESS(0)
-      "vst1.8      {d2},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d0},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d6},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d4},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d18}, [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d16}, [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d22}, [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d20}, [%0]                 \n"
-
-      "mov         %0, %5                      \n"
-
-    MEMACCESS(0)
-      "vst1.8      {d3},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d1},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d7},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d5},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d19}, [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d17}, [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d23}, [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d21}, [%0]                 \n"
-
-      "add         %1, #8*2                    \n"  // src   += 8*2
-      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
-      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
-      "subs        %7,  #8                     \n"  // w     -= 8
-      "bge         1b                          \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %7, #8                        \n"
-    "beq         4f                            \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
-
-    "cmp         %7, #4                        \n"
-    "blt         2f                            \n"
-
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.64     {d0}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d1}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d2}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d3}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d4}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d5}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d6}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d7}, [%0]                    \n"
-
-    MEMACCESS(8)
-    "vld1.8      {q15}, [%8]                   \n"
-
-    "vtrn.8      q0, q1                        \n"
-    "vtrn.8      q2, q3                        \n"
-
-    "vtbl.8      d16, {d0, d1}, d30            \n"
-    "vtbl.8      d17, {d0, d1}, d31            \n"
-    "vtbl.8      d18, {d2, d3}, d30            \n"
-    "vtbl.8      d19, {d2, d3}, d31            \n"
-    "vtbl.8      d20, {d4, d5}, d30            \n"
-    "vtbl.8      d21, {d4, d5}, d31            \n"
-    "vtbl.8      d22, {d6, d7}, d30            \n"
-    "vtbl.8      d23, {d6, d7}, d31            \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(0)
-    "vst1.32     {d16[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d16[1]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[1]},  [%0], %4           \n"
-
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[1]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[1]}, [%0]                \n"
-
-    "mov         %0, %5                        \n"
-
-    MEMACCESS(0)
-    "vst1.32     {d18[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d18[1]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[1]}, [%0], %6            \n"
-
-    "add         %0, %5, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[1]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[1]},  [%0]               \n"
-
-    "add         %1, #4*2                      \n"  // src   += 4 * 2
-    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
-    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
-    "subs        %7,  #4                       \n"  // w     -= 4
-    "beq         4f                            \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
-
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
-
-    "vtrn.8      d0, d1                        \n"
-    "vtrn.8      d2, d3                        \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d2}, [%0]                    \n"
-
-    "mov         %0, %5                        \n"
-
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0], %6                \n"
-    MEMACCESS(0)
-    "vst1.64     {d3}, [%0]                    \n"
-
-    "add         %1, #2*2                      \n"  // src   += 2 * 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
-    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
-    "subs        %7,  #2                       \n"  // w     -= 2
-    "beq         4f                            \n"
-
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
-
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
-    MEMACCESS(5)
-    "vst1.64     {d1}, [%5]                    \n"
-
-    "4:                                        \n"
-
-    : "+r"(src_temp),            // %0
-      "+r"(src),                 // %1
-      "+r"(src_stride),          // %2
-      "+r"(dst_a),               // %3
-      "+r"(dst_stride_a),        // %4
-      "+r"(dst_b),               // %5
-      "+r"(dst_stride_b),        // %6
-      "+r"(width)                // %7
-    : "r"(&kVTbl4x4TransposeDi)  // %8
-    : "memory", "cc",
-      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
-}
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_neon64.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_neon64.cc
deleted file mode 100755
index f52c082..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_neon64.cc
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
- *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-static uvec8 kVTbl4x4Transpose =
-  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) {
-  const uint8* src_temp = NULL;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %3, %3, #8                      \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                          \n"
-      "mov         %0, %1                        \n"
-
-      MEMACCESS(0)
-      "ld1        {v0.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v1.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v2.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v3.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v4.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v5.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v6.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v7.8b}, [%0]                  \n"
-
-      "trn2     v16.8b, v0.8b, v1.8b             \n"
-      "trn1     v17.8b, v0.8b, v1.8b             \n"
-      "trn2     v18.8b, v2.8b, v3.8b             \n"
-      "trn1     v19.8b, v2.8b, v3.8b             \n"
-      "trn2     v20.8b, v4.8b, v5.8b             \n"
-      "trn1     v21.8b, v4.8b, v5.8b             \n"
-      "trn2     v22.8b, v6.8b, v7.8b             \n"
-      "trn1     v23.8b, v6.8b, v7.8b             \n"
-
-      "trn2     v3.4h, v17.4h, v19.4h            \n"
-      "trn1     v1.4h, v17.4h, v19.4h            \n"
-      "trn2     v2.4h, v16.4h, v18.4h            \n"
-      "trn1     v0.4h, v16.4h, v18.4h            \n"
-      "trn2     v7.4h, v21.4h, v23.4h            \n"
-      "trn1     v5.4h, v21.4h, v23.4h            \n"
-      "trn2     v6.4h, v20.4h, v22.4h            \n"
-      "trn1     v4.4h, v20.4h, v22.4h            \n"
-
-      "trn2     v21.2s, v1.2s, v5.2s             \n"
-      "trn1     v17.2s, v1.2s, v5.2s             \n"
-      "trn2     v20.2s, v0.2s, v4.2s             \n"
-      "trn1     v16.2s, v0.2s, v4.2s             \n"
-      "trn2     v23.2s, v3.2s, v7.2s             \n"
-      "trn1     v19.2s, v3.2s, v7.2s             \n"
-      "trn2     v22.2s, v2.2s, v6.2s             \n"
-      "trn1     v18.2s, v2.2s, v6.2s             \n"
-
-      "mov         %0, %2                        \n"
-
-    MEMACCESS(0)
-      "st1      {v17.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v16.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v19.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v18.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v21.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v20.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v23.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v22.8b}, [%0]                   \n"
-
-      "add         %1, %1, #8                    \n"  // src += 8
-      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
-      "subs        %3, %3, #8                    \n"  // w   -= 8
-      "b.ge        1b                            \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %3, %3, #8                      \n"
-    "b.eq        4f                              \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
-
-    "cmp         %3, #4                          \n"
-    "b.lt        2f                              \n"
-
-    // 4x8 block
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[3], [%0]                     \n"
-
-    "mov         %0, %2                          \n"
-
-    MEMACCESS(4)
-    "ld1      {v2.16b}, [%4]                     \n"
-
-    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
-    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
-
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "st1 {v3.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[3], [%0]                         \n"
-
-    "add         %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[3], [%0]                         \n"
-
-    "add         %1, %1, #4                      \n"  // src += 4
-    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
-    "subs        %3, %3, #4                      \n"  // w   -= 4
-    "b.eq        4f                              \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
-
-    // 2x8 block
-    "2:                                          \n"
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[3], [%0]                     \n"
-
-    "trn2    v2.8b, v0.8b, v1.8b                 \n"
-    "trn1    v3.8b, v0.8b, v1.8b                 \n"
-
-    "mov         %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1     {v3.8b}, [%0], %6                   \n"
-    MEMACCESS(0)
-    "st1     {v2.8b}, [%0]                       \n"
-
-    "add         %1, %1, #2                      \n"  // src += 2
-    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
-    "subs        %3, %3,  #2                     \n"  // w   -= 2
-    "b.eq        4f                              \n"
-
-    // 1x8 block
-    "3:                                          \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[0], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[1], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[2], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[3], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[4], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[5], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[6], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[7], [%1]                 \n"
-
-    MEMACCESS(2)
-    "st1         {v0.8b}, [%2]                   \n"
-
-    "4:                                          \n"
-
-    : "+r"(src_temp),                             // %0
-      "+r"(src),                                  // %1
-      "+r"(dst),                                  // %2
-      "+r"(width64)                               // %3
-    : "r"(&kVTbl4x4Transpose),                    // %4
-      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
-    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-  );
-}
-
-static uint8 kVTbl4x4TransposeDi[32] =
-  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
-    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
-
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int width) {
-  const uint8* src_temp = NULL;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub       %4, %4, #8                      \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
-    "mov       %0, %1                          \n"
-
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v1.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v2.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v3.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v4.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v5.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v6.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v7.16b}, [%0]                  \n"
-
-    "trn1      v16.16b, v0.16b, v1.16b         \n"
-    "trn2      v17.16b, v0.16b, v1.16b         \n"
-    "trn1      v18.16b, v2.16b, v3.16b         \n"
-    "trn2      v19.16b, v2.16b, v3.16b         \n"
-    "trn1      v20.16b, v4.16b, v5.16b         \n"
-    "trn2      v21.16b, v4.16b, v5.16b         \n"
-    "trn1      v22.16b, v6.16b, v7.16b         \n"
-    "trn2      v23.16b, v6.16b, v7.16b         \n"
-
-    "trn1      v0.8h, v16.8h, v18.8h           \n"
-    "trn2      v1.8h, v16.8h, v18.8h           \n"
-    "trn1      v2.8h, v20.8h, v22.8h           \n"
-    "trn2      v3.8h, v20.8h, v22.8h           \n"
-    "trn1      v4.8h, v17.8h, v19.8h           \n"
-    "trn2      v5.8h, v17.8h, v19.8h           \n"
-    "trn1      v6.8h, v21.8h, v23.8h           \n"
-    "trn2      v7.8h, v21.8h, v23.8h           \n"
-
-    "trn1      v16.4s, v0.4s, v2.4s            \n"
-    "trn2      v17.4s, v0.4s, v2.4s            \n"
-    "trn1      v18.4s, v1.4s, v3.4s            \n"
-    "trn2      v19.4s, v1.4s, v3.4s            \n"
-    "trn1      v20.4s, v4.4s, v6.4s            \n"
-    "trn2      v21.4s, v4.4s, v6.4s            \n"
-    "trn1      v22.4s, v5.4s, v7.4s            \n"
-    "trn2      v23.4s, v5.4s, v7.4s            \n"
-
-    "mov       %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1       {v16.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v16.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[1], [%0]                \n"
-
-    "mov       %0, %3                          \n"
-
-    MEMACCESS(0)
-    "st1       {v20.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v20.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[1], [%0]                \n"
-
-    "add       %1, %1, #16                     \n"  // src   += 8*2
-    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
-    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
-    "subs      %4, %4,  #8                     \n"  // w     -= 8
-    "b.ge      1b                              \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds      %4, %4, #8                      \n"
-    "b.eq      4f                              \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
-
-    "cmp       %4, #4                          \n"
-    "b.lt      2f                              \n"
-
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1       {v0.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v1.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v2.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v3.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v4.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v5.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v6.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v7.8b}, [%0]                   \n"
-
-    MEMACCESS(8)
-    "ld1       {v30.16b}, [%8], #16            \n"
-    "ld1       {v31.16b}, [%8]                 \n"
-
-    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
-    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
-    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
-    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
-
-    "mov       %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1       {v16.s}[0],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[1],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[2],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[3],  [%0], %6           \n"
-
-    "add       %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[2], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[3], [%0]                \n"
-
-    "mov       %0, %3                          \n"
-
-    MEMACCESS(0)
-    "st1       {v17.s}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[2], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[3], [%0], %7            \n"
-
-    "add       %0, %3, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[0],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[1],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[2],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[3],  [%0]               \n"
-
-    "add       %1, %1, #8                      \n"  // src   += 4 * 2
-    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
-    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
-    "subs      %4,  %4,  #4                    \n"  // w     -= 4
-    "b.eq      4f                              \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
-
-    // 2x8 block
-    "2:                                        \n"
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[3], [%0]           \n"
-
-    "trn1      v4.8b, v0.8b, v2.8b             \n"
-    "trn2      v5.8b, v0.8b, v2.8b             \n"
-    "trn1      v6.8b, v1.8b, v3.8b             \n"
-    "trn2      v7.8b, v1.8b, v3.8b             \n"
-
-    "mov       %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1       {v4.d}[0], [%0], %6             \n"
-    MEMACCESS(0)
-    "st1       {v6.d}[0], [%0]                 \n"
-
-    "mov       %0, %3                          \n"
-
-    MEMACCESS(0)
-    "st1       {v5.d}[0], [%0], %7             \n"
-    MEMACCESS(0)
-    "st1       {v7.d}[0], [%0]                 \n"
-
-    "add       %1, %1, #4                      \n"  // src   += 2 * 2
-    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
-    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
-    "subs      %4,  %4,  #2                    \n"  // w     -= 2
-    "b.eq      4f                              \n"
-
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[7], [%1]           \n"
-
-    MEMACCESS(2)
-    "st1       {v0.d}[0], [%2]                 \n"
-    MEMACCESS(3)
-    "st1       {v1.d}[0], [%3]                 \n"
-
-    "4:                                        \n"
-
-    : "+r"(src_temp),                             // %0
-      "+r"(src),                                  // %1
-      "+r"(dst_a),                                // %2
-      "+r"(dst_b),                                // %3
-      "+r"(width64)                               // %4
-    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
-      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
-      "r"(&kVTbl4x4TransposeDi)                   // %8
-    : "memory", "cc",
-      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
-      "v30", "v31"
-  );
-}
-#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/rotate_win.cc b/android/src/main/libenc/jni/libyuv/jni/source/rotate_win.cc
deleted file mode 100755
index 1300fc0..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/rotate_win.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-__declspec(naked)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-  __asm {
-    push      edi
-    push      esi
-    push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
-    mov       edx, [esp + 12 + 12]  // dst
-    mov       esi, [esp + 12 + 16]  // dst_stride
-    mov       ecx, [esp + 12 + 20]  // width
-
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    align      4
- convertloop:
-    movq      xmm0, qword ptr [eax]
-    lea       ebp, [eax + 8]
-    movq      xmm1, qword ptr [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    punpcklbw xmm0, xmm1
-    movq      xmm2, qword ptr [eax]
-    movdqa    xmm1, xmm0
-    palignr   xmm1, xmm1, 8
-    movq      xmm3, qword ptr [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    punpcklbw xmm2, xmm3
-    movdqa    xmm3, xmm2
-    movq      xmm4, qword ptr [eax]
-    palignr   xmm3, xmm3, 8
-    movq      xmm5, qword ptr [eax + edi]
-    punpcklbw xmm4, xmm5
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm5, xmm4
-    movq      xmm6, qword ptr [eax]
-    palignr   xmm5, xmm5, 8
-    movq      xmm7, qword ptr [eax + edi]
-    punpcklbw xmm6, xmm7
-    mov       eax, ebp
-    movdqa    xmm7, xmm6
-    palignr   xmm7, xmm7, 8
-    // Second round of bit swap.
-    punpcklwd xmm0, xmm2
-    punpcklwd xmm1, xmm3
-    movdqa    xmm2, xmm0
-    movdqa    xmm3, xmm1
-    palignr   xmm2, xmm2, 8
-    palignr   xmm3, xmm3, 8
-    punpcklwd xmm4, xmm6
-    punpcklwd xmm5, xmm7
-    movdqa    xmm6, xmm4
-    movdqa    xmm7, xmm5
-    palignr   xmm6, xmm6, 8
-    palignr   xmm7, xmm7, 8
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    punpckldq xmm0, xmm4
-    movq      qword ptr [edx], xmm0
-    movdqa    xmm4, xmm0
-    palignr   xmm4, xmm4, 8
-    movq      qword ptr [edx + esi], xmm4
-    lea       edx, [edx + 2 * esi]
-    punpckldq xmm2, xmm6
-    movdqa    xmm6, xmm2
-    palignr   xmm6, xmm6, 8
-    movq      qword ptr [edx], xmm2
-    punpckldq xmm1, xmm5
-    movq      qword ptr [edx + esi], xmm6
-    lea       edx, [edx + 2 * esi]
-    movdqa    xmm5, xmm1
-    movq      qword ptr [edx], xmm1
-    palignr   xmm5, xmm5, 8
-    punpckldq xmm3, xmm7
-    movq      qword ptr [edx + esi], xmm5
-    lea       edx, [edx + 2 * esi]
-    movq      qword ptr [edx], xmm3
-    movdqa    xmm7, xmm3
-    palignr   xmm7, xmm7, 8
-    sub       ecx, 8
-    movq      qword ptr [edx + esi], xmm7
-    lea       edx, [edx + 2 * esi]
-    jg        convertloop
-
-    pop       ebp
-    pop       esi
-    pop       edi
-    ret
-  }
-}
-
-__declspec(naked)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int w) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
-    mov       edx, [esp + 16 + 12]  // dst_a
-    mov       esi, [esp + 16 + 16]  // dst_stride_a
-    mov       ebx, [esp + 16 + 20]  // dst_b
-    mov       ebp, [esp + 16 + 24]  // dst_stride_b
-    mov       ecx, esp
-    sub       esp, 4 + 16
-    and       esp, ~15
-    mov       [esp + 16], ecx
-    mov       ecx, [ecx + 16 + 28]  // w
-
-    align      4
- convertloop:
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    movdqu    xmm0, [eax]
-    movdqu    xmm1, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm0  // use xmm7 as temp register.
-    punpcklbw xmm0, xmm1
-    punpckhbw xmm7, xmm1
-    movdqa    xmm1, xmm7
-    movdqu    xmm2, [eax]
-    movdqu    xmm3, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm2
-    punpcklbw xmm2, xmm3
-    punpckhbw xmm7, xmm3
-    movdqa    xmm3, xmm7
-    movdqu    xmm4, [eax]
-    movdqu    xmm5, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm4
-    punpcklbw xmm4, xmm5
-    punpckhbw xmm7, xmm5
-    movdqa    xmm5, xmm7
-    movdqu    xmm6, [eax]
-    movdqu    xmm7, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqu    [esp], xmm5  // backup xmm5
-    neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
-    punpcklbw xmm6, xmm7
-    punpckhbw xmm5, xmm7
-    movdqa    xmm7, xmm5
-    lea       eax, [eax + 8 * edi + 16]
-    neg       edi
-    // Second round of bit swap.
-    movdqa    xmm5, xmm0
-    punpcklwd xmm0, xmm2
-    punpckhwd xmm5, xmm2
-    movdqa    xmm2, xmm5
-    movdqa    xmm5, xmm1
-    punpcklwd xmm1, xmm3
-    punpckhwd xmm5, xmm3
-    movdqa    xmm3, xmm5
-    movdqa    xmm5, xmm4
-    punpcklwd xmm4, xmm6
-    punpckhwd xmm5, xmm6
-    movdqa    xmm6, xmm5
-    movdqu    xmm5, [esp]  // restore xmm5
-    movdqu    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
-    punpcklwd xmm5, xmm7
-    punpckhwd xmm6, xmm7
-    movdqa    xmm7, xmm6
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    movdqa    xmm6, xmm0
-    punpckldq xmm0, xmm4
-    punpckhdq xmm6, xmm4
-    movdqa    xmm4, xmm6
-    movdqu    xmm6, [esp]  // restore xmm6
-    movlpd    qword ptr [edx], xmm0
-    movhpd    qword ptr [ebx], xmm0
-    movlpd    qword ptr [edx + esi], xmm4
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm4
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
-    punpckldq xmm2, xmm6
-    movlpd    qword ptr [edx], xmm2
-    movhpd    qword ptr [ebx], xmm2
-    punpckhdq xmm0, xmm6
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
-    punpckldq xmm1, xmm5
-    movlpd    qword ptr [edx], xmm1
-    movhpd    qword ptr [ebx], xmm1
-    punpckhdq xmm0, xmm5
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
-    punpckldq xmm3, xmm7
-    movlpd    qword ptr [edx], xmm3
-    movhpd    qword ptr [ebx], xmm3
-    punpckhdq xmm0, xmm7
-    sub       ecx, 8
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    jg        convertloop
-
-    mov       esp, [esp + 16]
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_any.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_any.cc
deleted file mode 100755
index 29b7a34..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_any.cc
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <string.h>  // For memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Subsampled source needs to be increase by 1 of not even.
-#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
-
-// Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 const uint8* a_buf, uint8* dst_ptr,                           \
-                 const struct YuvConstants* yuvconstants,  int width) {        \
-      SIMD_ALIGNED(uint8 temp[64 * 5]);                                        \
-      memset(temp, 0, 64 * 4);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      memcpy(temp + 192, a_buf + n, r);                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
-ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_NEON
-ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
-#endif
-#undef ANY41C
-
-// Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
-#ifdef HAS_I422TOYUY2ROW_SSE2
-ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
-ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOYUY2ROW_NEON
-ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOUYVYROW_NEON
-ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_BLENDPLANEROW_AVX2
-ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
-#endif
-#ifdef HAS_BLENDPLANEROW_SSSE3
-ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
-#endif
-#undef ANY31
-
-// Note that odd width replication includes 444 due to implementation
-// on arm that subsamples 444 to 422 internally.
-// Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      if (width & 1) {                                                         \
-        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \
-        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I411TOARGBROW_SSSE3
-ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
-#endif
-#ifdef HAS_I444TOARGBROW_SSSE3
-ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
-ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
-#endif  // HAS_I444TOARGBROW_SSSE3
-#ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
-#endif
-#ifdef HAS_I422TOARGBROW_AVX2
-ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422TORGBAROW_AVX2
-ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I444TOARGBROW_AVX2
-ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
-#endif
-#ifdef HAS_I411TOARGBROW_AVX2
-ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
-#endif
-#ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
-#endif
-#ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
-#endif
-#ifdef HAS_I422TORGB565ROW_AVX2
-ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
-#endif
-#ifdef HAS_I422TOARGBROW_NEON
-ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
-ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
-ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
-#endif
-#undef ANY31C
-
-// Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
-
-// Merge functions.
-#ifdef HAS_MERGEUVROW_SSE2
-ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_AVX2
-ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
-#endif
-#ifdef HAS_MERGEUVROW_NEON
-ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
-#endif
-
-// Math functions.
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBADDROW_SSE2
-ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_AVX2
-ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_NEON
-ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_SOBELROW_SSE2
-ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELROW_NEON
-ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_NEON
-ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
-#endif
-#ifdef HAS_SOBELXYROW_SSE2
-ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELXYROW_NEON
-ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
-#endif
-#undef ANY21
-
-// Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
-
-// Biplanar to RGB.
-#ifdef HAS_NV12TOARGBROW_SSSE3
-ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TOARGBROW_AVX2
-ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV12TOARGBROW_NEON
-ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_SSSE3
-ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_AVX2
-ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV21TOARGBROW_NEON
-ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_SSSE3
-ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_AVX2
-ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
-#endif
-#ifdef HAS_NV12TORGB565ROW_NEON
-ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
-#endif
-#undef ANY21C
-
-// Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
-
-#ifdef HAS_COPYROW_AVX
-ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
-#endif
-#ifdef HAS_COPYROW_SSE2
-ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
-#endif
-#ifdef HAS_COPYROW_NEON
-ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
-ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
-ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
-ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
-ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
-#endif
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
-#endif
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
-ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
-#endif
-#if defined(HAS_J400TOARGBROW_SSE2)
-ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_J400TOARGBROW_AVX2)
-ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
-ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
-ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
-ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
-ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_SSSE3)
-ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
-ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
-ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
-ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
-ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
-ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_NEON)
-ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
-#endif
-#ifdef HAS_ARGBTOYROW_AVX2
-ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBTOYJROW_AVX2
-ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_UYVYTOYROW_AVX2
-ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
-#endif
-#ifdef HAS_YUY2TOYROW_AVX2
-ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBTOYROW_SSSE3
-ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
-#endif
-#ifdef HAS_BGRATOYROW_SSSE3
-ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
-ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYJROW_SSSE3
-ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYROW_NEON
-ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYJROW_NEON
-ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_BGRATOYROW_NEON
-ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_NEON
-ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGBATOYROW_NEON
-ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGB24TOYROW_NEON
-ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RAWTOYROW_NEON
-ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RGB565TOYROW_NEON
-ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB1555TOYROW_NEON
-ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB4444TOYROW_NEON
-ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
-#endif
-#ifdef HAS_RGB24TOARGBROW_NEON
-ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RAWTOARGBROW_NEON
-ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RGB565TOARGBROW_NEON
-ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_NEON
-ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
-#endif
-#undef ANY11
-
-// Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 const struct YuvConstants* yuvconstants, int width) {         \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                      \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
-ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
-ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
-ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
-ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
-#endif
-#undef ANY11C
-
-// Any 1 to 1 blended.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128 * 2);  /* for YUY2 and msan */                       \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);                          \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
-#endif
-#undef ANY11B
-
-// Any 1 to 1 with parameter.
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 T shuffler, int width) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
-      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
-    }
-
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
-       const uint32, 4, 2, 3)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
-       const uint32, 4, 2, 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
-       const uint32, 4, 2, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
-#endif
-#undef ANY11P
-
-// Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 ptrdiff_t src_stride_ptr, int width,                          \
-                 int source_y_fraction) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
-      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
-#endif
-#ifdef HAS_INTERPOLATEROW_SSSE3
-ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_NEON
-ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_DSPR2
-ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
-#endif
-#undef ANY11T
-
-// Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr, r * BPP);                                          \
-      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
-      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
-    }
-
-#ifdef HAS_MIRRORROW_AVX2
-ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
-#endif
-#ifdef HAS_MIRRORROW_SSSE3
-ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
-#endif
-#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
-#endif
-#ifdef HAS_ARGBMIRRORROW_AVX2
-ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
-#endif
-#ifdef HAS_ARGBMIRRORROW_SSE2
-ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
-#endif
-#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
-#endif
-#undef ANY11M
-
-// Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
-    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
-      SIMD_ALIGNED(uint8 temp[64]);                                            \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, v32, n);                                             \
-      }                                                                        \
-      ANY_SIMD(temp, v32, MASK + 1);                                           \
-      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
-    }
-
-#ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
-#endif
-#ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
-#endif
-#ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
-#endif
-#undef ANY1
-
-// Any 1 to 2.  Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
-      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
-      memset(temp, 0, 128);  /* for msan */                                    \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      /* repeat last 4 bytes for 422 subsampler */                             \
-      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      /* repeat last 4 - 12 bytes for 411 subsampler */                        \
-      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \
-      }                                                                        \
-      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \
-      }                                                                        \
-      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
-      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
-      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
-    }
-
-#ifdef HAS_SPLITUVROW_SSE2
-ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
-#endif
-#ifdef HAS_SPLITUVROW_AVX2
-ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
-#endif
-#ifdef HAS_SPLITUVROW_NEON
-ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
-#endif
-#ifdef HAS_SPLITUVROW_DSPR2
-ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
-#endif
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_AVX2
-ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
-ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_SSE2
-ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
-ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
-ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
-ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
-#endif
-#undef ANY12
-
-// Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
-// 128 byte row allows for 32 avx ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
-      memset(temp, 0, 128 * 2);  /* for msan */                                \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
-             SS(r, UVSHIFT) * BPP);                                            \
-      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
-               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
-      }                                                                        \
-      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
-      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
-      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
-    }
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVJROW_AVX2
-ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
-ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
-ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
-ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
-ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_AVX2
-ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
-ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
-#endif
-#ifdef HAS_YUY2TOUVROW_SSE2
-ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
-ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_NEON
-ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVJROW_NEON
-ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_BGRATOUVROW_NEON
-ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_NEON
-ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_NEON
-ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_NEON
-ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_NEON
-ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_NEON
-ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_NEON
-ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVROW_NEON
-ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_NEON
-ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
-#endif
-#ifdef HAS_UYVYTOUVROW_NEON
-ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
-#endif
-#undef ANY12S
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_common.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_common.cc
deleted file mode 100755
index 2b80d07..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_common.cc
+++ /dev/null
@@ -1,2614 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <string.h>  // For memcpy and memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// llvm x86 is poor at ternary operator, so use branchless min/max.
-
-#define USE_BRANCHLESS 1
-#if USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
-  return ((-(v) >> 31) & (v));
-}
-
-static __inline int32 clamp255(int32 v) {
-  return (((255 - (v)) >> 31) | (v)) & 255;
-}
-
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
-}
-
-static __inline uint32 Abs(int32 v) {
-  int m = v >> 31;
-  return (v + m) ^ m;
-}
-#else  // USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
-  return (v < 0) ? 0 : v;
-}
-
-static __inline int32 clamp255(int32 v) {
-  return (v > 255) ? 255 : v;
-}
-
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
-}
-
-static __inline uint32 Abs(int32 v) {
-  return (v < 0) ? -v : v;
-}
-#endif  // USE_BRANCHLESS
-
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *(uint32*)(p) = v
-#else
-static inline void WRITEWORD(uint8* p, uint32 v) {
-  p[0] = (uint8)(v & 255);
-  p[1] = (uint8)((v >> 8) & 255);
-  p[2] = (uint8)((v >> 16) & 255);
-  p[3] = (uint8)((v >> 24) & 255);
-}
-#endif
-
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb24[0];
-    uint8 g = src_rgb24[1];
-    uint8 r = src_rgb24[2];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = 255u;
-    dst_argb += 4;
-    src_rgb24 += 3;
-  }
-}
-
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = 255u;
-    dst_argb += 4;
-    src_raw += 3;
-  }
-}
-
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
-    dst_rgb24[0] = b;
-    dst_rgb24[1] = g;
-    dst_rgb24[2] = r;
-    dst_rgb24 += 3;
-    src_raw += 3;
-  }
-}
-
-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
-    dst_argb[0] = (b << 3) | (b >> 2);
-    dst_argb[1] = (g << 2) | (g >> 4);
-    dst_argb[2] = (r << 3) | (r >> 2);
-    dst_argb[3] = 255u;
-    dst_argb += 4;
-    src_rgb565 += 2;
-  }
-}
-
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
-                         int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 a = src_argb1555[1] >> 7;
-    dst_argb[0] = (b << 3) | (b >> 2);
-    dst_argb[1] = (g << 3) | (g >> 2);
-    dst_argb[2] = (r << 3) | (r >> 2);
-    dst_argb[3] = -a;
-    dst_argb += 4;
-    src_argb1555 += 2;
-  }
-}
-
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
-                         int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
-    uint8 a = src_argb4444[1] >> 4;
-    dst_argb[0] = (b << 4) | b;
-    dst_argb[1] = (g << 4) | g;
-    dst_argb[2] = (r << 4) | r;
-    dst_argb[3] = (a << 4) | a;
-    dst_argb += 4;
-    src_argb4444 += 2;
-  }
-}
-
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
-    dst_rgb[0] = b;
-    dst_rgb[1] = g;
-    dst_rgb[2] = r;
-    dst_rgb += 3;
-    src_argb += 4;
-  }
-}
-
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
-    dst_rgb[0] = r;
-    dst_rgb[1] = g;
-    dst_rgb[2] = b;
-    dst_rgb += 3;
-    src_argb += 4;
-  }
-}
-
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 2;
-    uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
-    dst_rgb += 4;
-    src_argb += 8;
-  }
-  if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
-  }
-}
-
-// dither4 is a row of 4 values from 4x4 dither matrix.
-// The 4x4 matrix contains values to increase RGB.  When converting to
-// fewer bits (565) this provides an ordered dither.
-// The order in the 4x4 matrix in first byte is upper left.
-// The 4 values are passed as an int, then referenced as an array, so
-// endian will not affect order of the original matrix.  But the dither4
-// will containing the first pixel in the lower byte for little endian
-// or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
-    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
-    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
-    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
-    dst_rgb += 4;
-    src_argb += 8;
-  }
-  if (width & 1) {
-    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
-  }
-}
-
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 3;
-    uint8 r1 = src_argb[6] >> 3;
-    uint8 a1 = src_argb[7] >> 7;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
-    dst_rgb += 4;
-    src_argb += 8;
-  }
-  if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
-  }
-}
-
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    uint8 b1 = src_argb[4] >> 4;
-    uint8 g1 = src_argb[5] >> 4;
-    uint8 r1 = src_argb[6] >> 4;
-    uint8 a1 = src_argb[7] >> 4;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
-    dst_rgb += 4;
-    src_argb += 8;
-  }
-  if (width & 1) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
-  }
-}
-
-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
-}
-
-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
-  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
-}
-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
-  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
-}
-
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
-                       uint8* dst_u, uint8* dst_v, int width) {                \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
-               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
-               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
-               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
-    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
-    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-  }                                                                            \
-}
-
-MAKEROWY(ARGB, 2, 1, 0, 4)
-MAKEROWY(BGRA, 1, 2, 3, 4)
-MAKEROWY(ABGR, 0, 1, 2, 4)
-MAKEROWY(RGBA, 3, 2, 1, 4)
-MAKEROWY(RGB24, 2, 1, 0, 3)
-MAKEROWY(RAW, 0, 1, 2, 3)
-#undef MAKEROWY
-
-// JPeg uses a variation on BT.601-1 full range
-// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
-// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
-// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
-// BT.601 Mpeg range uses:
-// b 0.1016 * 255 = 25.908 = 25
-// g 0.5078 * 255 = 129.489 = 129
-// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
-// b 0.11400 * 128 = 14.592 = 15
-// g 0.58700 * 128 = 75.136 = 75
-// r 0.29900 * 128 = 38.272 = 38
-// JPeg 8 bit U:
-// b  0.50000 * 255 = 127.5 = 127
-// g -0.33126 * 255 = -84.4713 = -84
-// r -0.16874 * 255 = -43.0287 = -43
-// JPeg 8 bit V:
-// b -0.08131 * 255 = -20.73405 = -20
-// g -0.41869 * 255 = -106.76595 = -107
-// r  0.50000 * 255 = 127.5 = 127
-
-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
-  return (38 * r + 75 * g +  15 * b + 64) >> 7;
-}
-
-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
-  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
-}
-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
-  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
-}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
-
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
-                        uint8* dst_u, uint8* dst_v, int width) {               \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
-                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
-    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
-                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
-    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
-                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
-    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
-    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-  }                                                                            \
-}
-
-MAKEROWYJ(ARGB, 2, 1, 0, 4)
-#undef MAKEROWYJ
-
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
-    b = (b << 3) | (b >> 2);
-    g = (g << 2) | (g >> 4);
-    r = (r << 3) | (r >> 2);
-    dst_y[0] = RGBToY(r, g, b);
-    src_rgb565 += 2;
-    dst_y += 1;
-  }
-}
-
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
-    b = (b << 3) | (b >> 2);
-    g = (g << 3) | (g >> 2);
-    r = (r << 3) | (r >> 2);
-    dst_y[0] = RGBToY(r, g, b);
-    src_argb1555 += 2;
-    dst_y += 1;
-  }
-}
-
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
-    b = (b << 4) | b;
-    g = (g << 4) | g;
-    r = (r << 4) | r;
-    dst_y[0] = RGBToY(r, g, b);
-    src_argb4444 += 2;
-    dst_y += 1;
-  }
-}
-
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b1 = src_rgb565[2] & 0x1f;
-    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
-    uint8 r1 = src_rgb565[3] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b3 = next_rgb565[2] & 0x1f;
-    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
-    uint8 r3 = next_rgb565[3] >> 3;
-    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
-    b = (b << 1) | (b >> 6);  // 787 -> 888.
-    r = (r << 1) | (r >> 6);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-    src_rgb565 += 4;
-    next_rgb565 += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  if (width & 1) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b = (b0 + b2);  // 565 * 2 = 676.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
-    b = (b << 2) | (b >> 4);  // 676 -> 888
-    g = (g << 1) | (g >> 6);
-    r = (r << 2) | (r >> 4);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-  }
-}
-
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b1 = src_argb1555[2] & 0x1f;
-    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
-    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
-    uint8 b3 = next_argb1555[2] & 0x1f;
-    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
-    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
-    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
-    b = (b << 1) | (b >> 6);  // 777 -> 888.
-    g = (g << 1) | (g >> 6);
-    r = (r << 1) | (r >> 6);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-    src_argb1555 += 4;
-    next_argb1555 += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  if (width & 1) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = next_argb1555[1] >> 3;
-    uint8 b = (b0 + b2);  // 555 * 2 = 666.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
-    b = (b << 2) | (b >> 4);  // 666 -> 888.
-    g = (g << 2) | (g >> 4);
-    r = (r << 2) | (r >> 4);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-  }
-}
-
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b1 = src_argb4444[2] & 0x0f;
-    uint8 g1 = src_argb4444[2] >> 4;
-    uint8 r1 = src_argb4444[3] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b3 = next_argb4444[2] & 0x0f;
-    uint8 g3 = next_argb4444[2] >> 4;
-    uint8 r3 = next_argb4444[3] & 0x0f;
-    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
-    b = (b << 2) | (b >> 4);  // 666 -> 888.
-    g = (g << 2) | (g >> 4);
-    r = (r << 2) | (r >> 4);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-    src_argb4444 += 4;
-    next_argb4444 += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  if (width & 1) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b = (b0 + b2);  // 444 * 2 = 555.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
-    b = (b << 3) | (b >> 2);  // 555 -> 888.
-    g = (g << 3) | (g >> 2);
-    r = (r << 3) | (r >> 2);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-  }
-}
-
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-}
-
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width - 3; x += 4) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 16;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  // Odd width handling mimics 'any' function which replicates last pixel.
-  if ((width & 3) == 3) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  }
-}
-
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
-    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
-    dst_argb[3] = src_argb[3];
-    dst_argb += 4;
-    src_argb += 4;
-  }
-}
-
-// Convert a row of image to Sepia tone.
-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    int b = dst_argb[0];
-    int g = dst_argb[1];
-    int r = dst_argb[2];
-    int sb = (b * 17 + g * 68 + r * 35) >> 7;
-    int sg = (b * 22 + g * 88 + r * 45) >> 7;
-    int sr = (b * 24 + g * 98 + r * 50) >> 7;
-    // b does not over flow. a is preserved from original.
-    dst_argb[0] = sb;
-    dst_argb[1] = clamp255(sg);
-    dst_argb[2] = clamp255(sr);
-    dst_argb += 4;
-  }
-}
-
-// Apply color matrix to a row of image. Matrix is signed.
-// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    int b = src_argb[0];
-    int g = src_argb[1];
-    int r = src_argb[2];
-    int a = src_argb[3];
-    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
-    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
-    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
-    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
-              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
-    dst_argb[0] = Clamp(sb);
-    dst_argb[1] = Clamp(sg);
-    dst_argb[2] = Clamp(sr);
-    dst_argb[3] = Clamp(sa);
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-
-// Apply color table to a row of image.
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    int b = dst_argb[0];
-    int g = dst_argb[1];
-    int r = dst_argb[2];
-    int a = dst_argb[3];
-    dst_argb[0] = table_argb[b * 4 + 0];
-    dst_argb[1] = table_argb[g * 4 + 1];
-    dst_argb[2] = table_argb[r * 4 + 2];
-    dst_argb[3] = table_argb[a * 4 + 3];
-    dst_argb += 4;
-  }
-}
-
-// Apply color table to a row of image.
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    int b = dst_argb[0];
-    int g = dst_argb[1];
-    int r = dst_argb[2];
-    dst_argb[0] = table_argb[b * 4 + 0];
-    dst_argb[1] = table_argb[g * 4 + 1];
-    dst_argb[2] = table_argb[r * 4 + 2];
-    dst_argb += 4;
-  }
-}
-
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    int b = dst_argb[0];
-    int g = dst_argb[1];
-    int r = dst_argb[2];
-    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
-    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
-    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
-    dst_argb += 4;
-  }
-}
-
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value) {
-  const uint32 b_scale = REPEAT8(value & 0xff);
-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
-  const uint32 a_scale = REPEAT8(value >> 24);
-
-  int i;
-  for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
-    dst_argb[0] = SHADE(b, b_scale);
-    dst_argb[1] = SHADE(g, g_scale);
-    dst_argb[2] = SHADE(r, r_scale);
-    dst_argb[3] = SHADE(a, a_scale);
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-#undef REPEAT8
-#undef SHADE
-
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
-
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb0[0]);
-    const uint32 g = REPEAT8(src_argb0[1]);
-    const uint32 r = REPEAT8(src_argb0[2]);
-    const uint32 a = REPEAT8(src_argb0[3]);
-    const uint32 b_scale = src_argb1[0];
-    const uint32 g_scale = src_argb1[1];
-    const uint32 r_scale = src_argb1[2];
-    const uint32 a_scale = src_argb1[3];
-    dst_argb[0] = SHADE(b, b_scale);
-    dst_argb[1] = SHADE(g, g_scale);
-    dst_argb[2] = SHADE(r, r_scale);
-    dst_argb[3] = SHADE(a, a_scale);
-    src_argb0 += 4;
-    src_argb1 += 4;
-    dst_argb += 4;
-  }
-}
-#undef REPEAT8
-#undef SHADE
-
-#define SHADE(f, v) clamp255(v + f)
-
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                  uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    const int b = src_argb0[0];
-    const int g = src_argb0[1];
-    const int r = src_argb0[2];
-    const int a = src_argb0[3];
-    const int b_add = src_argb1[0];
-    const int g_add = src_argb1[1];
-    const int r_add = src_argb1[2];
-    const int a_add = src_argb1[3];
-    dst_argb[0] = SHADE(b, b_add);
-    dst_argb[1] = SHADE(g, g_add);
-    dst_argb[2] = SHADE(r, r_add);
-    dst_argb[3] = SHADE(a, a_add);
-    src_argb0 += 4;
-    src_argb1 += 4;
-    dst_argb += 4;
-  }
-}
-#undef SHADE
-
-#define SHADE(f, v) clamp0(f - v)
-
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    const int b = src_argb0[0];
-    const int g = src_argb0[1];
-    const int r = src_argb0[2];
-    const int a = src_argb0[3];
-    const int b_sub = src_argb1[0];
-    const int g_sub = src_argb1[1];
-    const int r_sub = src_argb1[2];
-    const int a_sub = src_argb1[3];
-    dst_argb[0] = SHADE(b, b_sub);
-    dst_argb[1] = SHADE(g, g_sub);
-    dst_argb[2] = SHADE(r, r_sub);
-    dst_argb[3] = SHADE(a, a_sub);
-    src_argb0 += 4;
-    src_argb1 += 4;
-    dst_argb += 4;
-  }
-}
-#undef SHADE
-
-// Sobel functions which mimics SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int a = src_y0[i];
-    int b = src_y1[i];
-    int c = src_y2[i];
-    int a_sub = src_y0[i + 2];
-    int b_sub = src_y1[i + 2];
-    int c_sub = src_y2[i + 2];
-    int a_diff = a - a_sub;
-    int b_diff = b - b_sub;
-    int c_diff = c - c_sub;
-    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobelx[i] = (uint8)(clamp255(sobel));
-  }
-}
-
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int a = src_y0[i + 0];
-    int b = src_y0[i + 1];
-    int c = src_y0[i + 2];
-    int a_sub = src_y1[i + 0];
-    int b_sub = src_y1[i + 1];
-    int c_sub = src_y1[i + 2];
-    int a_diff = a - a_sub;
-    int b_diff = b - b_sub;
-    int c_diff = c - c_sub;
-    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobely[i] = (uint8)(clamp255(sobel));
-  }
-}
-
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int r = src_sobelx[i];
-    int b = src_sobely[i];
-    int s = clamp255(r + b);
-    dst_argb[0] = (uint8)(s);
-    dst_argb[1] = (uint8)(s);
-    dst_argb[2] = (uint8)(s);
-    dst_argb[3] = (uint8)(255u);
-    dst_argb += 4;
-  }
-}
-
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int r = src_sobelx[i];
-    int b = src_sobely[i];
-    int s = clamp255(r + b);
-    dst_y[i] = (uint8)(s);
-  }
-}
-
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int r = src_sobelx[i];
-    int b = src_sobely[i];
-    int g = clamp255(r + b);
-    dst_argb[0] = (uint8)(b);
-    dst_argb[1] = (uint8)(g);
-    dst_argb[2] = (uint8)(r);
-    dst_argb[3] = (uint8)(255u);
-    dst_argb += 4;
-  }
-}
-
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
-  // Copy a Y to RGB.
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 y = src_y[0];
-    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
-    dst_argb[3] = 255u;
-    dst_argb += 4;
-    ++src_y;
-  }
-}
-
-// TODO(fbarchard): Unify these structures to be platform independent.
-// TODO(fbarchard): Generate SIMD structures from float matrix.
-
-// BT.601 YUV to RGB reference
-//  R = (Y - 16) * 1.164              - V * -1.596
-//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
-//  B = (Y - 16) * 1.164 - U * -2.018
-
-// Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#else
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-#endif
-
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef YG
-
-// JPEG YUV to RGB reference
-// *  R = Y                - V * -1.40200
-// *  G = Y - U *  0.34414 - V *  0.71414
-// *  B = Y - U * -1.77200
-
-// Y contribution to R,G,B.  Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
-
-// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414  * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#else
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-#endif
-
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef YG
-
-// BT.709 YUV to RGB reference
-// *  R = Y                - V * -1.28033
-// *  G = Y - U *  0.21482 - V *  0.38059
-// *  B = Y - U * -2.12798
-
-// Y contribution to R,G,B.  Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
-
-// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
-// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.12798 * 64)) */
-#define UG 14 /* round(0.21482 * 64) */
-#define VG 24 /* round(0.38059  * 64) */
-#define VR -82 /* round(-1.28033 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#else
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-#endif
-
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef YG
-
-// C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
-                              uint8* b, uint8* g, uint8* r,
-                              const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = -yuvconstants->kUVToRB[1];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
-  int ub = -yuvconstants->kUVToRB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[4];
-  int vr = -yuvconstants->kUVToRB[4];
-  int bb = yuvconstants->kUVBiasBGR[0];
-  int bg = yuvconstants->kUVBiasBGR[1];
-  int br = yuvconstants->kUVBiasBGR[2];
-  int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
-  int ub = yuvconstants->kUVToB[0];
-  int ug = yuvconstants->kUVToG[0];
-  int vg = yuvconstants->kUVToG[1];
-  int vr = yuvconstants->kUVToR[1];
-  int bb = yuvconstants->kUVBiasB[0];
-  int bg = yuvconstants->kUVBiasG[0];
-  int br = yuvconstants->kUVBiasR[0];
-  int yg = yuvconstants->kYToRgb[0];
-#endif
-
-  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32)(-(u * ub         ) + y1 + bb) >> 6);
-  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
-  *r = Clamp((int32)(-(         v * vr) + y1 + br) >> 6);
-}
-
-// Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
-// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(y1 + YGB) >> 6);
-  *g = Clamp((int32)(y1 + YGB) >> 6);
-  *r = Clamp((int32)(y1 + YGB) >> 6);
-}
-
-#undef YG
-#undef YGB
-
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
-    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
-             yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
-             yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_u += 2;
-    src_v += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-#else
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    src_y += 1;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 4;  // Advance 1 pixel.
-  }
-}
-#endif
-
-// Also used for 420
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void I422AlphaToARGBRow_C(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          const uint8* src_a,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = src_a[0];
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = src_a[1];
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    src_a += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = src_a[0];
-  }
-}
-
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* rgb_buf,
-                      const struct YuvConstants* yuvconstants,
-                      int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 6;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-  }
-}
-
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
-    b0 = b0 >> 4;
-    g0 = g0 >> 4;
-    r0 = r0 >> 4;
-    b1 = b1 >> 4;
-    g1 = g1 >> 4;
-    r1 = r1 >> 4;
-    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    dst_argb4444 += 4;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
-    b0 = b0 >> 4;
-    g0 = g0 >> 4;
-    r0 = r0 >> 4;
-    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        0xf000;
-  }
-}
-
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb1555,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
-    b0 = b0 >> 3;
-    g0 = g0 >> 3;
-    r0 = r0 >> 3;
-    b1 = b1 >> 3;
-    g1 = g1 >> 3;
-    r1 = r1 >> 3;
-    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    dst_argb1555 += 4;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
-    b0 = b0 >> 3;
-    g0 = g0 >> 3;
-    r0 = r0 >> 3;
-    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        0x8000;
-  }
-}
-
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
-                       const struct YuvConstants* yuvconstants,
-                       int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
-    b0 = b0 >> 3;
-    g0 = g0 >> 2;
-    r0 = r0 >> 3;
-    b1 = b1 >> 3;
-    g1 = g1 >> 2;
-    r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    dst_rgb565 += 4;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
-    b0 = b0 >> 3;
-    g0 = g0 >> 2;
-    r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
-  }
-}
-
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 3; x += 4) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    YuvPixel(src_y[2], src_u[0], src_v[0],
-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
-    rgb_buf[11] = 255;
-    YuvPixel(src_y[3], src_u[0], src_v[0],
-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
-    rgb_buf[15] = 255;
-    src_y += 4;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 16;  // Advance 4 pixels.
-  }
-  if (width & 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_uv[0], src_uv[1],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_uv += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_vu,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_vu[1], src_vu[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_vu += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_rgb565,
-                       const struct YuvConstants* yuvconstants,
-                       int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
-    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
-    b0 = b0 >> 3;
-    g0 = g0 >> 2;
-    r0 = r0 >> 3;
-    b1 = b1 >> 3;
-    g1 = g1 >> 2;
-    r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
-    src_y += 2;
-    src_uv += 2;
-    dst_rgb565 += 4;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
-    b0 = b0 >> 3;
-    g0 = g0 >> 2;
-    r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
-  }
-}
-
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_yuy2 += 4;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_uyvy += 4;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
-    rgb_buf[0] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
-    rgb_buf[4] = 255;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
-    rgb_buf[0] = 255;
-  }
-}
-
-void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    rgb_buf[3] = 255;
-    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    rgb_buf[3] = 255;
-  }
-}
-
-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
-  int x;
-  src += width - 1;
-  for (x = 0; x < width - 1; x += 2) {
-    dst[x] = src[0];
-    dst[x + 1] = src[-1];
-    src -= 2;
-  }
-  if (width & 1) {
-    dst[width - 1] = src[0];
-  }
-}
-
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  src_uv += (width - 1) << 1;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_u[x] = src_uv[0];
-    dst_u[x + 1] = src_uv[-2];
-    dst_v[x] = src_uv[1];
-    dst_v[x + 1] = src_uv[-2 + 1];
-    src_uv -= 4;
-  }
-  if (width & 1) {
-    dst_u[width - 1] = src_uv[0];
-    dst_v[width - 1] = src_uv[1];
-  }
-}
-
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
-  int x;
-  const uint32* src32 = (const uint32*)(src);
-  uint32* dst32 = (uint32*)(dst);
-  src32 += width - 1;
-  for (x = 0; x < width - 1; x += 2) {
-    dst32[x] = src32[0];
-    dst32[x + 1] = src32[-1];
-    src32 -= 2;
-  }
-  if (width & 1) {
-    dst32[width - 1] = src32[0];
-  }
-}
-
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_u[x] = src_uv[0];
-    dst_u[x + 1] = src_uv[2];
-    dst_v[x] = src_uv[1];
-    dst_v[x + 1] = src_uv[3];
-    src_uv += 4;
-  }
-  if (width & 1) {
-    dst_u[width - 1] = src_uv[0];
-    dst_v[width - 1] = src_uv[1];
-  }
-}
-
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                  int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_uv[0] = src_u[x];
-    dst_uv[1] = src_v[x];
-    dst_uv[2] = src_u[x + 1];
-    dst_uv[3] = src_v[x + 1];
-    dst_uv += 4;
-  }
-  if (width & 1) {
-    dst_uv[0] = src_u[width - 1];
-    dst_uv[1] = src_v[width - 1];
-  }
-}
-
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
-  memcpy(dst, src, count);
-}
-
-void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
-  memcpy(dst, src, count * 2);
-}
-
-void SetRow_C(uint8* dst, uint8 v8, int width) {
-  memset(dst, v8, width);
-}
-
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
-  uint32* d = (uint32*)(dst_argb);
-  int x;
-  for (x = 0; x < width; ++x) {
-    d[x] = v32;
-  }
-}
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width) {
-  // Output a row of UV values, filtering 2 rows of YUY2.
-  int x;
-  for (x = 0; x < width; x += 2) {
-    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
-    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
-    src_yuy2 += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-}
-
-// Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  // Output a row of UV values.
-  int x;
-  for (x = 0; x < width; x += 2) {
-    dst_u[0] = src_yuy2[1];
-    dst_v[0] = src_yuy2[3];
-    src_yuy2 += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-}
-
-// Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
-  // Output a row of Y values.
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_y[x] = src_yuy2[0];
-    dst_y[x + 1] = src_yuy2[2];
-    src_yuy2 += 4;
-  }
-  if (width & 1) {
-    dst_y[width - 1] = src_yuy2[0];
-  }
-}
-
-// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width) {
-  // Output a row of UV values.
-  int x;
-  for (x = 0; x < width; x += 2) {
-    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
-    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
-    src_uyvy += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-}
-
-// Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  // Output a row of UV values.
-  int x;
-  for (x = 0; x < width; x += 2) {
-    dst_u[0] = src_uyvy[0];
-    dst_v[0] = src_uyvy[2];
-    src_uyvy += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-}
-
-// Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
-  // Output a row of Y values.
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_y[x] = src_uyvy[1];
-    dst_y[x + 1] = src_uyvy[3];
-    src_uyvy += 4;
-  }
-  if (width & 1) {
-    dst_y[width - 1] = src_uyvy[1];
-  }
-}
-
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
-
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
-// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                    uint8* dst_argb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
-    dst_argb[0] = BLEND(fb, bb, a);
-    dst_argb[1] = BLEND(fg, bg, a);
-    dst_argb[2] = BLEND(fr, br, a);
-    dst_argb[3] = 255u;
-
-    fb = src_argb0[4 + 0];
-    fg = src_argb0[4 + 1];
-    fr = src_argb0[4 + 2];
-    a = src_argb0[4 + 3];
-    bb = src_argb1[4 + 0];
-    bg = src_argb1[4 + 1];
-    br = src_argb1[4 + 2];
-    dst_argb[4 + 0] = BLEND(fb, bb, a);
-    dst_argb[4 + 1] = BLEND(fg, bg, a);
-    dst_argb[4 + 2] = BLEND(fr, br, a);
-    dst_argb[4 + 3] = 255u;
-    src_argb0 += 8;
-    src_argb1 += 8;
-    dst_argb += 8;
-  }
-
-  if (width & 1) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
-    dst_argb[0] = BLEND(fb, bb, a);
-    dst_argb[1] = BLEND(fg, bg, a);
-    dst_argb[2] = BLEND(fr, br, a);
-    dst_argb[3] = 255u;
-  }
-}
-#undef BLEND
-
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
-    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
-    src0 += 2;
-    src1 += 2;
-    alpha += 2;
-    dst += 2;
-  }
-  if (width & 1) {
-    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
-  }
-}
-#undef UBLEND
-
-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
-
-// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width - 1; i += 2) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    uint32 a = src_argb[3];
-    dst_argb[0] = ATTENUATE(b, a);
-    dst_argb[1] = ATTENUATE(g, a);
-    dst_argb[2] = ATTENUATE(r, a);
-    dst_argb[3] = a;
-    b = src_argb[4];
-    g = src_argb[5];
-    r = src_argb[6];
-    a = src_argb[7];
-    dst_argb[4] = ATTENUATE(b, a);
-    dst_argb[5] = ATTENUATE(g, a);
-    dst_argb[6] = ATTENUATE(r, a);
-    dst_argb[7] = a;
-    src_argb += 8;
-    dst_argb += 8;
-  }
-
-  if (width & 1) {
-    const uint32 b = src_argb[0];
-    const uint32 g = src_argb[1];
-    const uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
-    dst_argb[0] = ATTENUATE(b, a);
-    dst_argb[1] = ATTENUATE(g, a);
-    dst_argb[2] = ATTENUATE(r, a);
-    dst_argb[3] = a;
-  }
-}
-#undef ATTENUATE
-
-// Divide source RGB by alpha and store to destination.
-// b = (b * 255 + (a / 2)) / a;
-// g = (g * 255 + (a / 2)) / a;
-// r = (r * 255 + (a / 2)) / a;
-// Reciprocal method is off by 1 on some values. ie 125
-// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
-#define T(a) 0x01000000 + (0x10000 / a)
-const uint32 fixed_invtbl8[256] = {
-  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
-  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
-  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
-  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
-  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
-  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
-  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
-  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
-  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
-  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
-  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
-  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
-  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
-  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
-  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
-  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
-  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
-  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
-  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
-  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
-  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
-  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
-  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
-  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
-  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
-  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
-  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
-  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
-  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
-  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
-  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
-#undef T
-
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
-    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
-    b = (b * ia) >> 8;
-    g = (g * ia) >> 8;
-    r = (r * ia) >> 8;
-    // Clamping should not be necessary but is free in assembly.
-    dst_argb[0] = clamp255(b);
-    dst_argb[1] = clamp255(g);
-    dst_argb[2] = clamp255(r);
-    dst_argb[3] = a;
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width) {
-  int32 row_sum[4] = {0, 0, 0, 0};
-  int x;
-  for (x = 0; x < width; ++x) {
-    row_sum[0] += row[x * 4 + 0];
-    row_sum[1] += row[x * 4 + 1];
-    row_sum[2] += row[x * 4 + 2];
-    row_sum[3] += row[x * 4 + 3];
-    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
-    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
-    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
-    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
-  }
-}
-
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
-                                int w, int area, uint8* dst, int count) {
-  float ooa = 1.0f / area;
-  int i;
-  for (i = 0; i < count; ++i) {
-    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
-    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
-    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
-    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
-    dst += 4;
-    tl += 4;
-    bl += 4;
-  }
-}
-
-// Copy pixels from rotated source to destination row with a slope.
-LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width) {
-  int i;
-  // Render a row of pixels from source into a buffer.
-  float uv[2];
-  uv[0] = uv_dudv[0];
-  uv[1] = uv_dudv[1];
-  for (i = 0; i < width; ++i) {
-    int x = (int)(uv[0]);
-    int y = (int)(uv[1]);
-    *(uint32*)(dst_argb) =
-        *(const uint32*)(src_argb + y * src_argb_stride +
-                                         x * 4);
-    dst_argb += 4;
-    uv[0] += uv_dudv[2];
-    uv[1] += uv_dudv[3];
-  }
-}
-
-// Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-                      uint8* dst_uv, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
-  }
-}
-
-static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
-                         uint16* dst_uv, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
-  }
-}
-
-// C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                      ptrdiff_t src_stride,
-                      int width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction ;
-  int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  int x;
-  if (y1_fraction == 0) {
-    memcpy(dst_ptr, src_ptr, width);
-    return;
-  }
-  if (y1_fraction == 128) {
-    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
-    return;
-  }
-  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] =
-        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
-    dst_ptr[1] =
-        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
-  }
-  if (width & 1) {
-    dst_ptr[0] =
-        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
-  }
-}
-
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         ptrdiff_t src_stride,
-                         int width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
-  const uint16* src_ptr1 = src_ptr + src_stride;
-  int x;
-  if (source_y_fraction == 0) {
-    memcpy(dst_ptr, src_ptr, width * 2);
-    return;
-  }
-  if (source_y_fraction == 128) {
-    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
-    return;
-  }
-  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
-  }
-  if (width & 1) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-  }
-}
-
-// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width) {
-  int index0 = shuffler[0];
-  int index1 = shuffler[1];
-  int index2 = shuffler[2];
-  int index3 = shuffler[3];
-  // Shuffle a row of ARGB.
-  int x;
-  for (x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 b = src_argb[index0];
-    uint8 g = src_argb[index1];
-    uint8 r = src_argb[index2];
-    uint8 a = src_argb[index3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_frame[0] = src_y[0];
-    dst_frame[1] = src_u[0];
-    dst_frame[2] = src_y[1];
-    dst_frame[3] = src_v[0];
-    dst_frame += 4;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-  }
-  if (width & 1) {
-    dst_frame[0] = src_y[0];
-    dst_frame[1] = src_u[0];
-    dst_frame[2] = 0;
-    dst_frame[3] = src_v[0];
-  }
-}
-
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    dst_frame[0] = src_u[0];
-    dst_frame[1] = src_y[0];
-    dst_frame[2] = src_v[0];
-    dst_frame[3] = src_y[1];
-    dst_frame += 4;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-  }
-  if (width & 1) {
-    dst_frame[0] = src_u[0];
-    dst_frame[1] = src_y[0];
-    dst_frame[2] = src_v[0];
-    dst_frame[3] = 0;
-  }
-}
-
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb,
-                         const float* poly,
-                         int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    float b = (float)(src_argb[0]);
-    float g = (float)(src_argb[1]);
-    float r = (float)(src_argb[2]);
-    float a = (float)(src_argb[3]);
-    float b2 = b * b;
-    float g2 = g * g;
-    float r2 = r * r;
-    float a2 = a * a;
-    float db = poly[0] + poly[4] * b;
-    float dg = poly[1] + poly[5] * g;
-    float dr = poly[2] + poly[6] * r;
-    float da = poly[3] + poly[7] * a;
-    float b3 = b2 * b;
-    float g3 = g2 * g;
-    float r3 = r2 * r;
-    float a3 = a2 * a;
-    db += poly[8] * b2;
-    dg += poly[9] * g2;
-    dr += poly[10] * r2;
-    da += poly[11] * a2;
-    db += poly[12] * b3;
-    dg += poly[13] * g3;
-    dr += poly[14] * r3;
-    da += poly[15] * a3;
-
-    dst_argb[0] = Clamp((int32)(db));
-    dst_argb[1] = Clamp((int32)(dg));
-    dst_argb[2] = Clamp((int32)(dr));
-    dst_argb[3] = Clamp((int32)(da));
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff) {
-  uint32 bc = lumacoeff & 0xff;
-  uint32 gc = (lumacoeff >> 8) & 0xff;
-  uint32 rc = (lumacoeff >> 16) & 0xff;
-
-  int i;
-  for (i = 0; i < width - 1; i += 2) {
-    // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
-    const uint8* luma1;
-    dst_argb[0] = luma0[src_argb[0]];
-    dst_argb[1] = luma0[src_argb[1]];
-    dst_argb[2] = luma0[src_argb[2]];
-    dst_argb[3] = src_argb[3];
-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
-              src_argb[6] * rc) & 0x7F00u) + luma;
-    dst_argb[4] = luma1[src_argb[4]];
-    dst_argb[5] = luma1[src_argb[5]];
-    dst_argb[6] = luma1[src_argb[6]];
-    dst_argb[7] = src_argb[7];
-    src_argb += 8;
-    dst_argb += 8;
-  }
-  if (width & 1) {
-    // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
-    dst_argb[0] = luma0[src_argb[0]];
-    dst_argb[1] = luma0[src_argb[1]];
-    dst_argb[2] = luma0[src_argb[2]];
-    dst_argb[3] = src_argb[3];
-  }
-}
-
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
-  int i;
-  for (i = 0; i < width - 1; i += 2) {
-    dst[3] = src[3];
-    dst[7] = src[7];
-    dst += 8;
-    src += 8;
-  }
-  if (width & 1) {
-    dst[3] = src[3];
-  }
-}
-
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
-  int i;
-  for (i = 0; i < width - 1; i += 2) {
-    dst[3] = src[0];
-    dst[7] = src[1];
-    dst += 8;
-    src += 2;
-  }
-  if (width & 1) {
-    dst[3] = src[0];
-  }
-}
-
-// Maximum temporary width for wrappers to process at a time, in pixels.
-#define MAXTWIDTH 2048
-
-#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
-    defined(HAS_I422TORGB565ROW_SSSE3)
-// row_win.cc has asm version, but GCC uses 2 step wrapper.
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_rgb565,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb565 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb1555,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_argb1555 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb4444,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_argb4444 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_rgb565,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
-    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-    src_y += twidth;
-    src_uv += twidth;
-    dst_rgb565 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TORGB565ROW_AVX2)
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
-#else
-    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-#endif
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb565 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
-    ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
-#else
-    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
-#endif
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_argb1555 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
-    ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
-#else
-    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
-#endif
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_argb4444 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_rgb24,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-    // TODO(fbarchard): ARGBToRGB24Row_AVX2
-    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb24 += twidth * 3;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_NV12TORGB565ROW_AVX2)
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
-#else
-    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-#endif
-    src_y += twidth;
-    src_uv += twidth;
-    dst_rgb565 += twidth * 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_gcc.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_gcc.cc
deleted file mode 100755
index d517451..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_gcc.cc
+++ /dev/null
@@ -1,5507 +0,0 @@
-// VERSION 2
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-// Constants for ARGB
-static vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-// JPeg full range.
-static vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
-#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-
-static vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-static vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
-
-static vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-static vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
-
-// Constants for BGRA
-static vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
-
-static vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
-
-static vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
-
-// Constants for ABGR
-static vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
-
-static vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
-
-static vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
-
-// Constants for RGBA.
-static vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
-
-static vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
-
-static vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
-
-static uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
-// 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-static uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
-#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-
-// Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
-// Shuffle table for converting RAW to RGB24.  First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24.  Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24.  Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RGB24.
-static uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
-static uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
-
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
-
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
-
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
-
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
-
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-#endif  // HAS_RGB24TOARGBROW_SSSE3
-
-#ifdef HAS_J400TOARGBROW_SSE2
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_J400TOARGBROW_SSE2
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleMaskRGB24ToARGB)  // %3
-  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleMaskRAWToARGB)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-   "movdqa     %3,%%xmm3                       \n"
-   "movdqa     %4,%%xmm4                       \n"
-   "movdqa     %5,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
-    "lea       " MEMLEA(0x18,0) ",%0           \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
-    "m"(kShuffleMaskRAWToRGB24_1),  // %4
-    "m"(kShuffleMaskRAWToRGB24_2)   // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x20802080,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xa,%%xmm4                     \n"
-    "psrlw     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x42004200,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "movdqa    %%xmm3,%%xmm4                   \n"
-    "psrlw     $0x6,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psllw     $0x1,%%xmm1                     \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0xf0f0f0f,%%eax                \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x4,%%xmm5                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "psllw     $0x4,%%xmm1                     \n"
-    "psrlw     $0x4,%%xmm3                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(kShuffleMaskARGBToRGB24)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(kShuffleMaskARGBToRAW)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psrld     $0x1b,%%xmm3                    \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1a,%%xmm4                    \n"
-    "pslld     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0xb,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pslld     $0x8,%%xmm0                     \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x5,%%xmm2                     \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "movd       %3,%%xmm6                      \n"
-    "punpcklbw  %%xmm6,%%xmm6                  \n"
-    "movdqa     %%xmm6,%%xmm7                  \n"
-    "punpcklwd  %%xmm6,%%xmm6                  \n"
-    "punpckhwd  %%xmm7,%%xmm7                  \n"
-    "pcmpeqb    %%xmm3,%%xmm3                  \n"
-    "psrld      $0x1b,%%xmm3                   \n"
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrld      $0x1a,%%xmm4                   \n"
-    "pslld      $0x5,%%xmm4                    \n"
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "pslld      $0xb,%%xmm5                    \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu     (%0),%%xmm0                    \n"
-    "paddusb    %%xmm6,%%xmm0                  \n"
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "movdqa     %%xmm0,%%xmm2                  \n"
-    "pslld      $0x8,%%xmm0                    \n"
-    "psrld      $0x3,%%xmm1                    \n"
-    "psrld      $0x5,%%xmm2                    \n"
-    "psrad      $0x10,%%xmm0                   \n"
-    "pand       %%xmm3,%%xmm1                  \n"
-    "pand       %%xmm4,%%xmm2                  \n"
-    "pand       %%xmm5,%%xmm0                  \n"
-    "por        %%xmm2,%%xmm1                  \n"
-    "por        %%xmm1,%%xmm0                  \n"
-    "packssdw   %%xmm0,%%xmm0                  \n"
-    "lea        0x10(%0),%0                    \n"
-    "movq       %%xmm0,(%1)                    \n"
-    "lea        0x8(%1),%1                     \n"
-    "sub        $0x4,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vbroadcastss %3,%%xmm6                    \n"
-    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
-    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
-    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
-    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
-    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
-    "vpslld     $0xb,%%ymm3,%%ymm5             \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%0),%%ymm0                    \n"
-    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
-    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
-    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "lea        0x20(%0),%0                    \n"
-    "vmovdqu    %%xmm0,(%1)                    \n"
-    "lea        0x10(%1),%1                    \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
-
-
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1b,%%xmm4                    \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x5,%%xmm5                     \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "pslld     $0xa,%%xmm6                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "pslld     $0xf,%%xmm7                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x6,%%xmm2                     \n"
-    "psrld     $0x9,%%xmm3                     \n"
-    "pand      %%xmm7,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm6,%%xmm3                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm4,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm3,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "psrlq     $0x4,%%xmm0                     \n"
-    "psrlq     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
-}
-#endif  // HAS_RGB24TOARGBROW_SSSE3
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_ARGBTOYJROW_SSSE3
-// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToYJ),  // %3
-    "m"(kAddYJ64)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBTOYJROW_SSSE3
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
-
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-#endif  // HAS_ARGBTOYROW_AVX2
-
-#ifdef HAS_ARGBTOYJROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
-    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-#endif  // HAS_ARGBTOYJROW_AVX2
-
-#ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToV),  // %5
-    "m"(kARGBToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBTOUVROW_SSSE3
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
-
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
-
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUV128),  // %5
-    "m"(kARGBToV),   // %6
-    "m"(kARGBToU),   // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBTOUVROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
-
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
-
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUVJ128),  // %5
-    "m"(kARGBToVJ),  // %6
-    "m"(kARGBToUJ),  // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBTOUVJROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToVJ),  // %5
-    "m"(kARGBToUJ),  // %6
-    "m"(kAddUVJ128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBTOUVJROW_SSSE3
-
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                          int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm3                       \n"
-    "movdqa    %5,%%xmm4                       \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "m"(kARGBToV),  // %4
-    "m"(kARGBToU),  // %5
-    "m"(kAddUV128)  // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6"
-  );
-}
-#endif  // HAS_ARGBTOUV444ROW_SSSE3
-
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_bgra)), // %4
-    "m"(kBGRAToV),  // %5
-    "m"(kBGRAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
-}
-
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kRGBAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_abgr)), // %4
-    "m"(kABGRToV),  // %5
-    "m"(kABGRToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
-}
-
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_rgba)), // %4
-    "m"(kRGBAToV),  // %5
-    "m"(kRGBAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
-}
-
-#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
-
-// Read 8 UV from 444
-#define READYUV444                                                             \
-    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
-// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422                                                             \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
-// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
-    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
-
-// Read 2 UV from 411, upsample to 8 UV.
-// reading 4 bytes is an msan violation.
-//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
-//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
-// pinsrw fails with drmemory
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_TEMP                                                        \
-    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
-    "movd       %[temp],%%xmm0                                  \n"            \
-    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
-    "movd       %[temp],%%xmm1                                  \n"            \
-    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
-// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12                                                               \
-    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
-// Read 4 VU from NV21, upsample to 8 UV
-#define READNV21                                                               \
-    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
-    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
-// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2                                                               \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
-    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
-    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
-    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
-
-// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY                                                               \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
-    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
-    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
-    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants)                                           \
-    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
-    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
-    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
-    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
-    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
-    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
-    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     %%xmm11,%%xmm0                                  \n"            \
-    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     %%xmm12,%%xmm1                                  \n"            \
-    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     %%xmm13,%%xmm2                                  \n"            \
-    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
-#define YUVTORGB_REGS \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-
-#else
-#define YUVTORGB_SETUP(yuvconstants)
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
-    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
-    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
-    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
-#define YUVTORGB_REGS
-#endif
-
-// Store 8 ARGB values.
-#define STOREARGB                                                              \
-    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
-    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
-    "movdqa     %%xmm0,%%xmm1                                    \n"           \
-    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
-    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
-    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
-    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
-    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
-
-// Store 8 RGBA values.
-#define STORERGBA                                                              \
-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
-    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
-    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
-    "movdqa    %%xmm5,%%xmm0                                     \n"           \
-    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
-    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
-    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
-    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* dst_rgb24,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
-    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
-    "sub       %[u_buf],%[v_buf]               \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(yuvconstants)
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
-    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)     // %[width]
-#else
-    [width]"+rm"(width)    // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
-    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
-    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                                     const uint8* u_buf,
-                                     const uint8* v_buf,
-                                     const uint8* a_buf,
-                                     uint8* dst_argb,
-                                     const struct YuvConstants* yuvconstants,
-                                     int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUVA422
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [a_buf]"+r"(a_buf),    // %[a_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)     // %[width]
-#else
-    [width]"+rm"(width)    // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I422ALPHATOARGBROW_SSSE3
-
-#ifdef HAS_I411TOARGBROW_SSSE3
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  int temp = 0;
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411_TEMP
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [temp]"+r"(temp),       // %[temp]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)     // %[width]
-#else
-    [width]"+rm"(width)    // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif
-
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* uv_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READNV12
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* vu_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READNV21
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
-    [kShuffleNV21]"m"(kShuffleNV21)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUY2
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
-    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
-    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READUYVY
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
-    [kShuffleUYVYY]"m"(kShuffleUYVYY),
-    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_rgba,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(yuvconstants)
-    STORERGBA
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-#endif  // HAS_I422TOARGBROW_SSSE3
-
-// Read 16 UV from 444
-#define READYUV444_AVX2                                                        \
-    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
-    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
-// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2                                                        \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
-// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2                                                       \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
-    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
-    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
-    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2                                                        \
-    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
-// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
-    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
-// Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
-    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
-    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
-    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
-    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2                                                          \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
-    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
-    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
-    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
-
-#if defined(__x86_64__)  // x86-64 variant: YUVTORGB_SETUP_AVX2 preloads the seven 32-byte rows of yuvconstants (offsets 0..192) into ymm8-ymm14 and YUVTORGB_AVX2 then works from those registers.
-#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
-    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
-    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
-    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
-    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
-    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
-    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
-    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
-    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
-    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
-    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
-    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
-    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
-    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
-#define YUVTORGB_REGS_AVX2 \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",  // clobber-list entries for the registers loaded by YUVTORGB_SETUP_AVX2; the trailing comma is needed because more register names follow at each use site
-#else  // Convert 16 pixels: 16 UV and 16 Y. This variant does not use ymm8-ymm14: the constants are read from [yuvconstants] memory each time and ymm3 serves as a scratch register.
-#define YUVTORGB_SETUP_AVX2(yuvconstants)  // expands to nothing: this variant has no registers to preload
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
-    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
-    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
-    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
-    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
-    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
-    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
-    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
-#define YUVTORGB_REGS_AVX2  // expands to nothing: no extra registers to add to the clobber lists
-#endif  // defined(__x86_64__)
-
-// Store 16 ARGB values: byte-interleaves ymm0 with ymm1 and ymm2 with ymm5, word-interleaves the two results, writes 64 bytes to [dst_argb] and advances [dst_argb] by 0x40. ymm5 must be set up by the code that uses this macro.
-#define STOREARGB_AVX2                                                         \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
-    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
-    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
-    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
-
-#ifdef HAS_I444TOARGBROW_AVX2
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf (now a byte offset from u_buf)
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READYUV444_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I444TOARGBROW_AVX2
-
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf (now a byte offset from u_buf)
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
-#if defined(HAS_I422TOARGBROW_AVX2)
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf (now a byte offset from u_buf)
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I422TOARGBROW_AVX2
-
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
-// 16 pixels per loop iteration. ymm5 is not initialised in this function; NOTE(review): presumably READYUVA422_AVX2 loads the bytes from a_buf into it - confirm against that macro.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               const uint8* a_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf (now a byte offset from u_buf)
-    LABELALIGN
-  "1:                                          \n"
-    READYUVA422_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "subl      $0x10,%[width]                  \n"  // explicit 'l' size suffix: width may be a memory operand (see constraints below)
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [a_buf]"+r"(a_buf),    // %[a_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)  // 32-bit PIC build: width is forced into memory; NOTE(review): presumably because no general register is left - confirm
-    [width]"+m"(width)     // %[width]
-#else
-    [width]"+rm"(width)    // %[width]
-#endif  // defined(__i386__) && defined(__pic__)
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I422ALPHATOARGBROW_AVX2
-
-#if defined(HAS_I422TORGBAROW_AVX2)
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf (now a byte offset from u_buf)
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-
-    // Step 3: Weave into RGBA. Unlike STOREARGB_AVX2, the 0xff bytes of ymm5 are interleaved ahead of ymm0, and ymm1 is paired with ymm2; 64 bytes are stored and dst_argb advances by 0x40.
-    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
-    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
-    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I422TORGBAROW_AVX2
-
-#if defined(HAS_NV12TOARGBROW_AVX2)
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* uv_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READNV12_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
-    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"  // NOTE(review): "xmm0" is listed twice in this clobber list
-  );
-}
-#endif  // HAS_NV12TOARGBROW_AVX2
-
-#if defined(HAS_NV21TOARGBROW_AVX2)
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* vu_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READNV21_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
-    [kShuffleNV21]"m"(kShuffleNV21)  // %[kShuffleNV21], memory operand; not referenced in this body, so presumably consumed by READNV21_AVX2 - confirm
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
-      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"  // NOTE(review): "xmm0" is listed twice in this clobber list
-  );
-}
-#endif  // HAS_NV21TOARGBROW_AVX2
-
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READYUY2_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
-    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),  // %[kShuffleYUY2Y], memory operand
-    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)  // %[kShuffleYUY2UV], memory operand
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
-      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"  // NOTE(review): "xmm0" is listed twice in this clobber list
-  );
-}
-#endif  // HAS_YUY2TOARGBROW_AVX2
-
-#if defined(HAS_UYVYTOARGBROW_AVX2)
-// 16 pixels per loop iteration: width is decreased by 0x10 and the loop repeats while the result is > 0, so the body always runs at least once.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff bytes, woven in by STOREARGB_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    READUYVY_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
-    [kShuffleUYVYY]"m"(kShuffleUYVYY),  // %[kShuffleUYVYY], memory operand used by READUYVY_AVX2
-    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)  // %[kShuffleUYVYUV], memory operand used by READUYVY_AVX2
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_UYVYTOARGBROW_AVX2
-
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {  // Per loop iteration: reads 8 Y bytes, writes 8 ARGB pixels (32 bytes), width -= 8; the body always runs at least once.
-  asm volatile (
-    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
-    "movd      %%eax,%%xmm2                    \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
-    "movd      %%eax,%%xmm3                    \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = all ones ...
-    "pslld     $0x18,%%xmm4                    \n"  // ... then 0xff000000 in every dword (OR-ed into each output pixel below)
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "psubusw   %%xmm3,%%xmm0                   \n"
-    "psrlw     $6, %%xmm0                      \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-
-    // Step 2: Weave into ARGB: each result byte is replicated four times and the top byte of every dword is then forced to 0xff by xmm4.
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "por       %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
-}
-#endif  // HAS_I400TOARGBROW_SSE2
-
-#ifdef HAS_I400TOARGBROW_AVX2
-// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes) per loop iteration; width is decreased by 0x10 each time.
-// note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
-  asm volatile (
-    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
-    "vmovd      %%eax,%%xmm2                   \n"
-    "vbroadcastss %%xmm2,%%ymm2                \n"
-    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
-    "vmovd      %%eax,%%xmm3                   \n"
-    "vbroadcastss %%xmm3,%%ymm3                \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = all ones ...
-    "vpslld     $0x18,%%ymm4,%%ymm4            \n"  // ... then 0xff000000 in every dword (OR-ed into each output pixel below)
-
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
-    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
-    "lea        " MEMLEA(0x10,0) ",%0          \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub        $0x10,%2                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
-}
-#endif  // HAS_I400TOARGBROW_AVX2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes: pshufb indices 15..0 reverse all 16 bytes of a register.
-static uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {  // Writes the bytes of src to dst in reverse order, 16 bytes per loop iteration; the body always runs at least once.
-  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy of width so it can be used as an index register
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"  // xmm5 = kShuffleMirror (aligned load)
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"  // the source window moves backwards while dst moves forwards
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
-}
-#endif  // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {  // Writes the bytes of src to dst in reverse order, 32 bytes per loop iteration; uses kShuffleMirror, which in this view is defined under HAS_MIRRORROW_SSSE3.
-  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy of width so it can be used as an index register
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm5                  \n"  // kShuffleMirror copied into both 128-bit lanes of ymm5
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
-    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // reverse the bytes within each 128-bit lane
-    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"  // 0x4e swaps the two 128-bit lanes, completing the 32-byte reversal
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
-}
-#endif  // HAS_MIRRORROW_AVX2
-
-#ifdef HAS_MIRRORUVROW_SSSE3
-// Shuffle table for reversing the bytes of UV channels: even source bytes land reversed in the low 8 bytes, odd source bytes land reversed in the high 8 bytes.
-static uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {  // Per loop iteration: reads 16 source bytes, writes 8 bytes to dst_u and 8 bytes to dst_v, width -= 8.
-  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy of width so it can be used as an index register
-  asm volatile (
-    "movdqa    %4,%%xmm1                       \n"  // xmm1 = kShuffleMirrorUV (aligned load)
-    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"  // NOTE(review): reads as -0x10(%0,%3,2), i.e. src + 2 * width - 16 - confirm against MEMLEA4
-    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u (offset used by the movhpd store)
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // step the source pointer back by 16 bytes
-    "pshufb    %%xmm1,%%xmm0                   \n"
-    "movlpd    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
-    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $8,%3                           \n"
-    "jg        1b                              \n"
-  : "+r"(src),      // %0
-    "+r"(dst_u),    // %1
-    "+r"(dst_v),    // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
-}
-#endif  // HAS_MIRRORUVROW_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSE2
-
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {  // Per loop iteration: reads 16 bytes from the end of src moving backwards, writes them to dst with the four 32-bit elements reversed, width -= 4.
-  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy of width so it can be used as an index register
-  asm volatile (
-    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // NOTE(review): reads as -0x10(%0,%2,4), i.e. src + 4 * width - 16 - confirm against MEMLEA4
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"  // 0x1b reverses the order of the four 32-bit elements
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
-}
-#endif  // HAS_ARGBMIRRORROW_SSE2
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-// Permutation table for vpermd: element indices 7..0 reverse the order of the eight 32-bit elements of a ymm register (whole elements, not individual bytes).
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {  // Per loop iteration: reads 32 bytes from the end of src moving backwards, writes them to dst with the eight 32-bit elements reversed, width -= 8.
-  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy of width so it can be used as an index register
-  asm volatile (
-    "vmovdqu    %3,%%ymm5                      \n"  // ymm5 = kARGBShuffleMirror_AVX2
-    LABELALIGN
-  "1:                                          \n"
-    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kARGBShuffleMirror_AVX2) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
-}
-#endif  // HAS_ARGBMIRRORROW_AVX2
-
-#ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {  // Per loop iteration: reads 64 bytes from src_uv, writes the 32 even-indexed bytes to dst_u and the 32 odd-indexed bytes to dst_v, width -= 0x20.
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"  // ymm5 = all ones ...
-    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"  // ... then 0x00ff in every 16-bit word (low-byte mask)
-    "sub        %1,%2                            \n"  // %2 = dst_v - dst_u (offset used by the second store)
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
-    "lea        " MEMLEA(0x40,0) ",%0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"  // odd bytes, shifted down into the low byte of each word
-    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"  // even bytes, kept in the low byte of each word
-    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"  // vpackuswb packs per 128-bit lane; 0xd8 puts the quadwords back in source order
-    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
-    "lea        " MEMLEA(0x20,1) ",%1            \n"
-    "sub        $0x20,%3                         \n"
-    "jg         1b                               \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SPLITUVROW_AVX2
-
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {  // Per loop iteration: reads 32 bytes from src_uv, writes the 16 even-indexed bytes to dst_u and the 16 odd-indexed bytes to dst_v, width -= 0x10.
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"  // xmm5 = all ones ...
-    "psrlw      $0x8,%%xmm5                      \n"  // ... then 0x00ff in every 16-bit word (low-byte mask)
-    "sub        %1,%2                            \n"  // %2 = dst_v - dst_u (offset used by the second store)
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
-    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    "lea        " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"  // keep copies: xmm0/xmm1 become the even bytes, xmm2/xmm3 the odd bytes
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
-    "lea        " MEMLEA(0x10,1) ",%1            \n"
-    "sub        $0x10,%3                         \n"
-    "jg         1b                               \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {  // Per loop iteration: reads 32 bytes from src_u and 32 from src_v, writes 64 byte-interleaved bytes (u first) to dst_uv, width -= 0x20.
-  asm volatile (
-    "sub       %0,%1                             \n"  // %1 = src_v - src_u (offset used by the second load)
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
-    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
-    "lea       " MEMLEA(0x20,0) ",%0             \n"
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
-    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"  // the unpacks work per 128-bit lane, so the four halves are stored in the order low(ymm2), low(ymm0), high(ymm2), high(ymm0)
-    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
-    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
-    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
-    "lea       " MEMLEA(0x40,2) ",%2             \n"
-    "sub       $0x20,%3                          \n"
-    "jg        1b                                \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
-}
-#endif  // HAS_MERGEUVROW_AVX2
-
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {  // Per loop iteration: reads 16 bytes from src_u and 16 from src_v, writes 32 byte-interleaved bytes (u first) to dst_uv, width -= 0x10.
-  asm volatile (
-    "sub       %0,%1                             \n"  // %1 = src_v - src_u (offset used by the second load)
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm2                     \n"  // copy of the u bytes for the high-half interleave
-    "punpcklbw %%xmm1,%%xmm0                     \n"
-    "punpckhbw %%xmm1,%%xmm2                     \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
-    "lea       " MEMLEA(0x20,2) ",%2             \n"
-    "sub       $0x10,%3                          \n"
-    "jg        1b                                \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
-}
-#endif  // HAS_MERGEUVROW_SSE2
-
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {  // Copies 32 bytes per loop iteration from src to dst, count -= 0x20; uses aligned moves only when both pointers are 16-byte aligned.
-  asm volatile (
-    "test       $0xf,%0                        \n"  // any of the low 4 address bits set -> src is not 16-byte aligned
-    "jne        2f                             \n"
-    "test       $0xf,%1                        \n"  // same check for dst
-    "jne        2f                             \n"
-    LABELALIGN
-  "1:                                          \n"  // aligned loop (movdqa)
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       9f                              \n"  // done: skip the unaligned loop
-    LABELALIGN
-  "2:                                          \n"  // unaligned loop (movdqu)
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        2b                              \n"
-  "9:                                          \n"  // common exit
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
-}
-#endif  // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {  // Copies 64 bytes per loop iteration from src to dst with unaligned ymm moves, count -= 0x40; the body always runs at least once.
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x40,%2                        \n"
-    "jg        1b                              \n"  // NOTE(review): no vzeroupper follows the loop, unlike the other ymm-using functions nearby - confirm this is intended
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
-}
-#endif  // HAS_COPYROW_AVX
-
-#ifdef HAS_COPYROW_ERMS
-// Multiple of 1.
-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
-  size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep movsb " MEMMOVESTRING(0,1) "          \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
-}
-#endif  // HAS_COPYROW_ERMS
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
-}
-#endif  // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpckhwd %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm2,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
-    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
-    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
-}
-#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint8 v8, int width) {
-  size_t width_tmp = (size_t)(width >> 2);
-  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
-}
-
-void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
-  size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosb " MEMSTORESTRING(al,0) "        \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v8)          // %2
-    : "memory", "cc");
-}
-
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
-  size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst_argb),  // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
-}
-#endif  // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
-}
-
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
-}
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_YUY2TOYROW_AVX2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-
-// Blend 8 pixels at a time
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  : "m"(kShuffleAlpha)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_BLENDPLANEROW_SSSE3
-// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "psllw      $0x8,%%xmm5                    \n"
-    "mov        $0x80808080,%%eax              \n"
-    "movd       %%eax,%%xmm6                   \n"
-    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "movd       %%eax,%%xmm7                   \n"
-    "pshufd     $0x0,%%xmm7,%%xmm7             \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq       (%2),%%xmm0                    \n"
-    "punpcklbw  %%xmm0,%%xmm0                  \n"
-    "pxor       %%xmm5,%%xmm0                  \n"
-    "movq       (%0,%2,1),%%xmm1               \n"
-    "movq       (%1,%2,1),%%xmm2               \n"
-    "punpcklbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm6,%%xmm1                  \n"
-    "pmaddubsw  %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm7,%%xmm0                  \n"
-    "psrlw      $0x8,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq       %%xmm0,(%3,%2,1)               \n"
-    "lea        0x8(%2),%2                     \n"
-    "sub        $0x8,%4                        \n"
-    "jg        1b                              \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+r"(width)       // %4
-  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_BLENDPLANEROW_SSSE3
-
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm6                   \n"
-    "vbroadcastss %%xmm6,%%ymm6                \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "vmovd      %%eax,%%xmm7                   \n"
-    "vbroadcastss %%xmm7,%%ymm7                \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
-
-    // 32 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%2),%%ymm0                    \n"
-    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vmovdqu    (%0,%2,1),%%ymm1               \n"
-    "vmovdqu    (%1,%2,1),%%ymm2               \n"
-    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
-    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
-    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
-    "lea        0x20(%2),%2                    \n"
-    "sub        $0x20,%4                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+r"(width)       // %4
-  :: "memory", "cc", "eax",
-     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_BLENDPLANEROW_AVX2
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// pshufb tables duplicating alpha: indices 3/7/11/15 select byte 3 of each pixel, 128u zeroes a byte.
-static uvec8 kShuffleAlpha0 = {  // alpha of pixels 0 and 1
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
-};
-static uvec8 kShuffleAlpha1 = {  // alpha of pixels 2 and 3
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
-};
-// Attenuate 4 pixels (16 bytes) per iteration: scale bytes 0-2 of each pixel by byte 3 (alpha), keep byte 3.
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = all ones
-    "pslld     $0x18,%%xmm3                    \n"  // xmm3 = 0xff000000 per pixel (alpha mask)
-    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kShuffleAlpha0
-    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kShuffleAlpha1
-
-    // 4 pixel loop. The same 16 source bytes are reloaded for each use.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"  // alpha words for pixels 0-1
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpcklbw %%xmm1,%%xmm1                   \n"  // each byte of pixels 0-1 doubled into a word
-    "pmulhuw   %%xmm1,%%xmm0                   \n"  // high 16 bits of color * alpha
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"  // alpha words for pixels 2-3
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "punpckhbw %%xmm2,%%xmm2                   \n"  // each byte of pixels 2-3 doubled into a word
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
-    "pand      %%xmm3,%%xmm2                   \n"  // keep only the original alpha bytes
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"  // back to 16 bytes; alpha lanes are zero here
-    "por       %%xmm2,%%xmm0                   \n"  // merge the original alpha back in
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16
-    "sub       $0x4,%2                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha0),  // %3
-    "m"(kShuffleAlpha1)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-// Shuffle table duplicating alpha; applied to byte-doubled pixels, where bytes 6-7 / 14-15 hold alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-// Attenuate 8 pixels (32 bytes) per iteration; byte 3 of each pixel is copied through unchanged.
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"  // ymm4 = shuffle table in both 128-bit lanes
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpslld     $0x18,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff000000 per pixel (alpha mask)
-    "sub        %0,%1                          \n"  // %1 = dst - src, so advancing %0 walks both
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // double each byte into a word
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"  // alpha words (zero in the alpha lane)
-    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of color * alpha
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"  // keep only the original alpha bytes
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"  // merge the original alpha back in
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32 (dst follows via the offset in %1)
-    "sub        $0x8,%2                        \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha_AVX2)  // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-#endif  // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels (16 bytes) per iteration, scaling by fixed_invtbl8[alpha] (4-byte entries).
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
-  uintptr_t alpha = 0;  // scratch register for the table index (%3)
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"  // alpha byte of pixel 0
-    "punpcklbw %%xmm0,%%xmm0                   \n"  // double each byte of pixels 0-1 into a word
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"  // alpha byte of pixel 1
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // words = [w0, w0, w0, w1] of the table entry
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"  // pixel 1 multipliers into the high half
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"  // alpha byte of pixel 2
-    "punpckhbw %%xmm1,%%xmm1                   \n"  // double each byte of pixels 2-3 into a word
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"  // alpha byte of pixel 3
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
-    "packuswb  %%xmm1,%%xmm0                   \n"  // saturate words back to bytes
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16
-    "sub       $0x4,%2                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
-  : "r"(fixed_invtbl8)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"  // NOTE(review): xmm4/xmm5 are not touched by this body
-  );
-}
-#endif  // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-// Shuffle table spreading each pixel's table entry: word 0 to three lanes, the entry's other word to the 4th.
-static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
-// Unattenuate 8 pixels (32 bytes) per iteration, scaling by fixed_invtbl8[alpha] (4-byte entries).
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
-  uintptr_t alpha = 0;  // scratch register for the table index (%3)
-  asm volatile (
-    "sub        %0,%1                          \n"  // %1 = dst - src, so advancing %0 walks both
-    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = shuffle table in both 128-bit lanes
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    // Scalar table loads used in place of VPGATHER: one entry per pixel alpha (offsets 3, 7, ... 0x1f).
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"  // xmm6 = entries for pixels 0-1
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"  // xmm7 = entries for pixels 2-3
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"  // xmm0 = entries for pixels 4-5
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"  // xmm2 = entries for pixels 6-7
-    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"  // xmm3 = entries for pixels 0-3
-    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"  // xmm0 = entries for pixels 4-7
-    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"  // ymm3 = all eight table entries
-    // end of VPGATHER replacement
-
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // double each source byte into a word
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
-    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"  // per-lane multipliers
-    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // saturate words back to bytes
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32 (dst follows via the offset in %1)
-    "sub        $0x8,%2                        \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
-  : "r"(fixed_invtbl8),  // %4
-    "m"(kUnattenShuffleAlpha_AVX2)  // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"  // NOTE(review): xmm4 is not touched by this body
-  );
-}
-#endif  // HAS_ARGBUNATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels; byte 3 (alpha) of each pixel is preserved.
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ weights
-    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64, added before the >> 7
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs summed into words
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 gray bytes
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
-    "psrld     $0x18,%%xmm2                    \n"  // isolate byte 3 (alpha) of each pixel
-    "psrld     $0x18,%%xmm3                    \n"
-    "packuswb  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"  // 8 alpha bytes
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"  // gray,gray pairs
-    "punpcklbw %%xmm2,%%xmm3                   \n"  // gray,alpha pairs
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"  // gray,gray,gray,alpha for pixels 0-3
-    "punpckhwd %%xmm3,%%xmm1                   \n"  // gray,gray,gray,alpha for pixels 4-7
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32
-    "sub       $0x8,%2                         \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64)     // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-//    b = (r * 35 + g * 68 + b * 17) >> 7
-//    g = (r * 45 + g * 88 + b * 22) >> 7
-//    r = (r * 50 + g * 98 + b * 24) >> 7
-// Constants for ARGB color to sepia tone; each table holds one output channel's weights as b, g, r, 0.
-static vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
-
-static vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
-
-static vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place; byte 3 (alpha) is preserved.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %2,%%xmm2                       \n"  // xmm2 = kARGBToSepiaB
-    "movdqa    %3,%%xmm3                       \n"  // xmm3 = kARGBToSepiaG
-    "movdqa    %4,%%xmm4                       \n"  // xmm4 = kARGBToSepiaR
-
-    // 8 pixel loop. The same 32 bytes are reloaded for each output channel.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "phaddw    %%xmm6,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 new B bytes (saturated)
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"  // 8 new G bytes (saturated)
-    "punpcklbw %%xmm5,%%xmm0                   \n"  // B,G pairs
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"  // 8 new R bytes (saturated)
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "psrld     $0x18,%%xmm6                    \n"  // isolate byte 3 (alpha) of each pixel
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"  // 8 alpha bytes
-    "punpcklbw %%xmm6,%%xmm5                   \n"  // R,A pairs
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"  // B,G,R,A for pixels 0-3
-    "punpckhwd %%xmm5,%%xmm1                   \n"  // B,G,R,A for pixels 4-7
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // written back over the source pixels
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst += 32
-    "sub       $0x8,%1                         \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "m"(kARGBToSepiaB),  // %2
-    "m"(kARGBToSepiaG),  // %3
-    "m"(kARGBToSepiaR)   // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-#endif  // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided: 16 signed bytes at matrix_argb, 4 per output channel.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // load the 16 matrix bytes
-    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // xmm2 = matrix bytes 0-3 broadcast
-    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // xmm3 = matrix bytes 4-7 broadcast
-    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // xmm4 = matrix bytes 8-11 broadcast
-    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // xmm5 = matrix bytes 12-15 broadcast
-
-    // 8 pixel loop. The same 32 source bytes are reloaded for each output channel.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm7                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddsw   %%xmm7,%%xmm0                   \n"  // output byte 0 sums, one word per pixel
-    "phaddsw   %%xmm1,%%xmm6                   \n"  // output byte 1 sums
-    "psraw     $0x6,%%xmm0                     \n"  // arithmetic >> 6 of the signed sums
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"  // saturate to unsigned bytes
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm0                   \n"  // interleave output bytes 0 and 1
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm1                   \n"  // output byte 2 sums
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm6                   \n"  // output byte 3 sums
-    "psraw     $0x6,%%xmm1                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm1                   \n"  // interleave output bytes 2 and 3
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "punpcklwd %%xmm1,%%xmm0                   \n"  // full pixels 0-3
-    "punpckhwd %%xmm1,%%xmm6                   \n"  // full pixels 4-7
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
-    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32
-    "sub       $0x8,%2                         \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb),      // %1
-    "+r"(width)          // %2
-  : "r"(matrix_argb)     // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes) in place: c = ((c * scale) >> 16) * interval_size + interval_offset.
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "movd      %2,%%xmm2                       \n"  // scale
-    "movd      %3,%%xmm3                       \n"  // interval_size
-    "movd      %4,%%xmm4                       \n"  // interval_offset
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // words = [lo, lo, lo, hi]: hi word lands in the alpha lane
-    "pshufd    $0x44,%%xmm2,%%xmm2             \n"  // repeat the 4-word pattern for the second pixel
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
-    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
-    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for byte -> word widening
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "pslld     $0x18,%%xmm6                    \n"  // xmm6 = 0xff000000 per pixel (alpha mask)
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"  // pixels 0-1 as words
-    "pmulhuw   %%xmm2,%%xmm0                   \n"  // (c * scale) >> 16
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"  // pixels 2-3 as words
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size
-    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm6,%%xmm7                   \n"  // keep only the original alpha bytes
-    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
-    "paddw     %%xmm4,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"  // saturate words back to bytes
-    "por       %%xmm7,%%xmm0                   \n"  // OR the original alpha back in
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // written back over the source pixels
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // dst += 16
-    "sub       $0x4,%1                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels (16 bytes) at a time: each byte is multiplied by the matching byte of the 32-bit value.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "movd      %3,%%xmm2                       \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"  // double each byte of value into a word
-    "punpcklqdq %%xmm2,%%xmm2                  \n"  // repeat the 4 words for the second pixel
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"  // pixels 0-1, each byte doubled into a word
-    "punpckhbw %%xmm1,%%xmm1                   \n"  // pixels 2-3, each byte doubled into a word
-    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of the product
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16
-    "sub       $0x4,%2                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(value)       // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
-}
-#endif  // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels (16 bytes) at a time, byte by byte.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                  \n"  // xmm5 = 0 for byte -> word widening
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_argb0 += 16
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb1 += 16
-    "movdqu    %%xmm0,%%xmm1                   \n"  // register-to-register copies
-    "movdqu    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"  // row 0: each byte doubled into a word
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"  // row 1: bytes zero-extended to words
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of the product
-    "pmulhuw   %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst += 16
-    "sub       $0x4,%3                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-// Multiply 2 rows of ARGB pixels together, 8 pixels (32 bytes) at a time, byte by byte.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0 for byte -> word widening
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_argb0 += 32
-    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"  // src_argb1 += 32
-    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // row 0: each byte doubled into a word
-    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // row 1: bytes zero-extended to words
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of the product
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea       " MEMLEA(0x20,2) ",%2           \n"  // dst += 32
-    "sub        $0x8,%3                        \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-#if defined(__AVX2__)  // NOTE(review): without __AVX2__ no xmm clobbers are declared although the asm still writes them; sibling AVX2 rows list them unconditionally
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBMULTIPLYROW_AVX2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together with unsigned byte saturation, 4 pixels (16 bytes) at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_argb0 += 16
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb1 += 16
-    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating per-byte add
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst += 16
-    "sub       $0x4,%3                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
-}
-#endif  // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBADDROW_AVX2
-// Add 2 rows of ARGB pixels together with unsigned byte saturation, 8 pixels (32 bytes) at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_argb0 += 32
-    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"  // saturating per-byte add straight from memory
-    "lea        " MEMLEA(0x20,1) ",%1          \n"  // src_argb1 += 32
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"  // dst += 32
-    "sub        $0x8,%3                        \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
-}
-#endif  // HAS_ARGBADDROW_AVX2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1, saturating at 0), 4 pixels (16 bytes) at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_argb0 += 16
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb1 += 16
-    "psubusb   %%xmm1,%%xmm0                   \n"  // saturating per-byte subtract
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst += 16
-    "sub       $0x4,%3                         \n"  // width -= 4; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
-}
-#endif  // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1, saturating at 0), 8 pixels (32 bytes) at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_argb0 += 32
-    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"  // saturating per-byte subtract straight from memory
-    "lea        " MEMLEA(0x20,1) ",%1          \n"  // src_argb1 += 32
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"  // dst += 32
-    "sub        $0x8,%3                        \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
-}
-#endif  // HAS_ARGBSUBTRACTROW_AVX2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is (applied to rows src_y0, src_y1, src_y2 at columns x and x + 2):
-// -1  0  1
-// -2  0  2
-// -1  0  1
-// Writes the absolute value of the filter response, saturated to 8 bits, 8 pixels per iteration.
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"  // %1..%3 become offsets from src_y0,
-    "sub       %0,%2                           \n"  // so advancing %0 walks all four pointers
-    "sub       %0,%3                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for byte -> word widening
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"  // row 0: y0[x] - y0[x + 2]
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"  // row 1: y1[x] - y1[x + 2]
-    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
-    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"  // row 2: y2[x] - y2[x + 2]
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"  // the middle row is added twice (weight 2)
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"  // xmm1 = -sum
-    "pmaxsw    %%xmm1,%%xmm0                   \n"  // max(sum, -sum) = absolute value
-    "packuswb  %%xmm0,%%xmm0                   \n"  // saturate to 8 bytes
-    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"  // advance 8 pixels
-    "sub       $0x8,%4                         \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is (only the two outer rows are read: src_y0 and src_y1, at columns x, x + 1, x + 2):
-// -1 -2 -1
-//  0  0  0
-//  1  2  1
-// Writes the absolute value of the filter response, saturated to 8 bits, 8 pixels per iteration.
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"  // %1 and %2 become offsets from src_y0,
-    "sub       %0,%2                           \n"  // so advancing %0 walks all three pointers
-    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for byte -> word widening
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"  // column x: y0 - y1
-    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
-    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"  // column x + 1: y0 - y1
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
-    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"  // column x + 2: y0 - y1
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"  // the middle column is added twice (weight 2)
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"  // xmm1 = -sum
-    "pmaxsw    %%xmm1,%%xmm0                   \n"  // max(sum, -sum) = absolute value
-    "packuswb  %%xmm0,%%xmm0                   \n"  // saturate to 8 bytes
-    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"  // advance 8 pixels
-    "sub       $0x8,%3                         \n"  // width -= 8; the body always runs at least once
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SOBELYROW_SSE2
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm2                   \n"
-    "punpckhbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm1                   \n"
-    "punpckhwd %%xmm2,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklwd %%xmm0,%%xmm3                   \n"
-    "punpckhwd %%xmm0,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
-}
-#endif  // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "paddusb   %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "punpckhbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "punpcklbw %%xmm2,%%xmm4                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "punpcklwd %%xmm3,%%xmm6                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "punpcklwd %%xmm0,%%xmm7                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       49f                             \n"
-
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "punpckhwd %%xmm1,%%xmm3                   \n"
-    "punpckhbw %%xmm1,%%xmm4                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "punpcklwd %%xmm1,%%xmm4                   \n"
-    "punpckhwd %%xmm1,%%xmm5                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
-    "paddd     %%xmm0,%%xmm3                   \n"
-    "paddd     %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
-    "paddd     %%xmm0,%%xmm4                   \n"
-    "paddd     %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm5                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
-    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-  : "+r"(row),  // %0
-    "+r"(cumsum),  // %1
-    "+r"(previous_cumsum),  // %2
-    "+r"(width)  // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
-                                    int count) {
-  asm volatile (
-    "movd      %5,%%xmm5                       \n"
-    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
-    "rcpss     %%xmm5,%%xmm4                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "cmpl      $0x80,%5                        \n"
-    "ja        40f                             \n"
-
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrld     $0x10,%%xmm6                    \n"
-    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
-    "addps     %%xmm6,%%xmm5                   \n"
-    "mulps     %%xmm4,%%xmm5                   \n"
-    "cvtps2dq  %%xmm5,%%xmm5                   \n"
-    "packssdw  %%xmm5,%%xmm5                   \n"
-
-  // 4 pixel small loop                        \n"
-    LABELALIGN
-  "4:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       4b                              \n"
-    "jmp       49f                             \n"
-
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm1                   \n"
-    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
-    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
-    "mulps     %%xmm4,%%xmm2                   \n"
-    "mulps     %%xmm4,%%xmm3                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "cvtps2dq  %%xmm1,%%xmm1                   \n"
-    "cvtps2dq  %%xmm2,%%xmm2                   \n"
-    "cvtps2dq  %%xmm3,%%xmm3                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(topleft),  // %0
-    "+r"(botleft),  // %1
-    "+r"(dst),      // %2
-    "+rm"(count)    // %3
-  : "r"((intptr_t)(width)),  // %4
-    "rm"(area)     // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy ARGB pixels from source image with slope to a row of destination.
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* src_dudv, int width) {
-  intptr_t src_argb_stride_temp = src_argb_stride;
-  intptr_t temp = 0;
-  asm volatile (
-    "movq      " MEMACCESS(3) ",%%xmm2         \n"
-    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
-    "shl       $0x10,%1                        \n"
-    "add       $0x4,%1                         \n"
-    "movd      %1,%%xmm5                       \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
-
-    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm0                   \n"
-    "movlhps   %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm7,%%xmm4                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
-    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
-    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
-    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm1                   \n"
-    "addps     %%xmm4,%%xmm2                   \n"
-    "movq      %%xmm1," MEMACCESS(2) "         \n"
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm0                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%4                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "pmaddwd   %%xmm5,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm2                   \n"
-    "movd      %%xmm0,%k1                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x04,2) ",%2           \n"
-    "sub       $0x1,%4                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_stride_temp),  // %1
-    "+r"(dst_argb),  // %2
-    "+r"(src_dudv),  // %3
-    "+rm"(width),    // %4
-    "+r"(temp)   // %5
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x80,%3                        \n"
-    "je        50f                             \n"
-
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x100,%3                       \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x80808080,%%eax               \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "punpcklbw  %%xmm2,%%xmm0                  \n"
-    "punpckhbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm4,%%xmm0                  \n"
-    "psubb      %%xmm4,%%xmm1                  \n"
-    "movdqa     %%xmm5,%%xmm2                  \n"
-    "movdqa     %%xmm5,%%xmm3                  \n"
-    "pmaddubsw  %%xmm0,%%xmm2                  \n"
-    "pmaddubsw  %%xmm1,%%xmm3                  \n"
-    "paddw      %%xmm4,%%xmm2                  \n"
-    "paddw      %%xmm4,%%xmm3                  \n"
-    "psrlw      $0x8,%%xmm2                    \n"
-    "psrlw      $0x8,%%xmm3                    \n"
-    "packuswb   %%xmm3,%%xmm2                  \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        100b                            \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
-  asm volatile (
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "sub       %1,%0                           \n"
-    "cmp       $0x80,%3                        \n"
-    "je        50f                             \n"
-
-    "vmovd      %3,%%xmm0                      \n"
-    "neg        %3                             \n"
-    "add        $0x100,%3                      \n"
-    "vmovd      %3,%%xmm5                      \n"
-    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
-    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
-    "vbroadcastss %%xmm5,%%ymm5                \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm4                   \n"
-    "vbroadcastss %%xmm4,%%ymm4                \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
-    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
-    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
-    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
-    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "rep movsb " MEMMOVESTRING(1,0) "          \n"
-    "jmp       999f                            \n"
-
-  "99:                                         \n"
-    "vzeroupper                                \n"
-  "999:                                        \n"
-  : "+D"(dst_ptr),    // %0
-    "+S"(src_ptr),    // %1
-    "+c"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_INTERPOLATEROW_AVX2
-
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_SSSE3
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_AVX2
-
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  uintptr_t pixel_temp = 0u;
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "mov       " MEMACCESS(4) ",%k2            \n"
-    "cmp       $0x3000102,%k2                  \n"
-    "je        3012f                           \n"
-    "cmp       $0x10203,%k2                    \n"
-    "je        123f                            \n"
-    "cmp       $0x30201,%k2                    \n"
-    "je        321f                            \n"
-    "cmp       $0x2010003,%k2                  \n"
-    "je        2103f                           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(4) ",%2             \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS(1) "            \n"
-    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
-    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
-    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "sub       $0x1,%3                         \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "123:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        123b                            \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "321:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        321b                            \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "2103:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        2103b                           \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "3012:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        3012b                           \n"
-
-  "99:                                         \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+d"(pixel_temp),  // %2
-    "+r"(width)         // %3
-  : "r"(shuffler)      // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_SSE2
-
-#ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub       %1,%2                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm1                     \n"
-    "punpcklbw %%xmm2,%%xmm0                     \n"
-    "punpckhbw %%xmm2,%%xmm1                     \n"
-    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
-}
-#endif  // HAS_I422TOYUY2ROW_SSE2
-
-#ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "movdqa    %%xmm2,%%xmm1                     \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "punpcklbw %%xmm0,%%xmm1                     \n"
-    "punpckhbw %%xmm0,%%xmm2                     \n"
-    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
-}
-#endif  // HAS_I422TOUYVYROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
-  asm volatile (
-    "pxor      %%xmm3,%%xmm3                   \n"
-
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm3,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm4                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
-    "addps     " MEMACCESS(3) ",%%xmm0         \n"
-    "addps     " MEMACCESS(3) ",%%xmm4         \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm1,%%xmm2                   \n"
-    "mulps     %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm2,%%xmm1                   \n"
-    "mulps     %%xmm6,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
-    "addps     %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm6,%%xmm4                   \n"
-    "addps     %%xmm1,%%xmm0                   \n"
-    "addps     %%xmm5,%%xmm4                   \n"
-    "cvttps2dq %%xmm0,%%xmm0                   \n"
-    "cvttps2dq %%xmm4,%%xmm4                   \n"
-    "packuswb  %%xmm4,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x2,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
-    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
-    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
-    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
-
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
-    "lea         " MEMLEA(0x8,0) ",%0          \n"
-    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
-    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
-    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
-    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
-    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
-    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
-    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
-    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
-    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
-    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
-    "lea         " MEMLEA(0x8,1) ",%1          \n"
-    "sub         $0x2,%2                       \n"
-    "jg          1b                            \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
-  uintptr_t pixel_temp = 0u;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
-    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),   // %0
-    "+d"(pixel_temp), // %1
-    "+r"(width)       // %2
-  : "r"(table_argb)   // %3
-  : "memory", "cc");
-}
-#endif  // HAS_ARGBCOLORTABLEROW_X86
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Tranform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
-  uintptr_t pixel_temp = 0u;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),   // %0
-    "+d"(pixel_temp), // %1
-    "+r"(width)       // %2
-  : "r"(table_argb)   // %3
-  : "memory", "cc");
-}
-#endif  // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Tranform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff) {
-  uintptr_t pixel_temp = 0u;
-  uintptr_t table_temp = 0u;
-  asm volatile (
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0x8,%%xmm4                     \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "phaddw    %%xmm0,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-
-    "movzb     " MEMACCESS(2) ",%0             \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS(3) "            \n"
-    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
-    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
-    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
-
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-
-    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
-    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
-    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
-    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
-
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-
-    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
-    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
-    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
-    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
-
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-
-    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
-    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
-    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
-    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "lea       " MEMLEA(0x10,3) ",%3           \n"
-    "sub       $0x4,%4                         \n"
-    "jg        1b                              \n"
-  : "+d"(pixel_temp),  // %0
-    "+a"(table_temp),  // %1
-    "+r"(src_argb),    // %2
-    "+r"(dst_argb),    // %3
-    "+rm"(width)       // %4
-  : "r"(luma),         // %5
-    "rm"(lumacoeff)    // %6
-  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-
-#endif  // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_mips.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_mips.cc
deleted file mode 100755
index 2c55b78..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_mips.cc
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
-  __asm__ __volatile__ (
-    ".set      noreorder                         \n"
-    ".set      noat                              \n"
-    "slti      $at, %[count], 8                  \n"
-    "bne       $at ,$zero, $last8                \n"
-    "xor       $t8, %[src], %[dst]               \n"
-    "andi      $t8, $t8, 0x3                     \n"
-
-    "bne       $t8, $zero, unaligned             \n"
-    "negu      $a3, %[dst]                       \n"
-    // make dst/src aligned
-    "andi      $a3, $a3, 0x3                     \n"
-    "beq       $a3, $zero, $chk16w               \n"
-    // word-aligned now count is the remining bytes count
-    "subu     %[count], %[count], $a3            \n"
-
-    "lwr       $t8, 0(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"
-    "swr       $t8, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-
-    // Now the dst/src are mutually word-aligned with word-aligned addresses
-    "$chk16w:                                    \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, chk8w              \n"
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"
-    // t0 is the "past the end" address
-
-    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
-    // the "t0-32" address
-    // This means: for x=128 the last "safe" a1 address is "t0-160"
-    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
-    // we will use "pref 30,128(a1)", so "t0-160" is the limit
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line of src
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $loop16w                     \n"
-    "nop                                         \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$loop16w:                                    \n"
-    "pref      0, 96(%[src])                     \n"
-    "lw        $t0, 0(%[src])                    \n"
-    "bgtz      $v1, $skip_pref30_96              \n"  // skip
-    "lw        $t1, 4(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"  // continue
-    "$skip_pref30_96:                            \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    //  bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lw        $t0, 32(%[src])                   \n"
-    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
-    "lw        $t1, 36(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
-    "$skip_pref30_128:                           \n"
-    "lw        $t2, 40(%[src])                   \n"
-    "lw        $t3, 44(%[src])                   \n"
-    "lw        $t4, 48(%[src])                   \n"
-    "lw        $t5, 52(%[src])                   \n"
-    "lw        $t6, 56(%[src])                   \n"
-    "lw        $t7, 60(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bne       %[dst], $a3, $loop16w             \n"
-    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
-    "move      %[count], $t8                     \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "chk8w:                                      \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // the t8 is the reminder count past 32-bytes
-    "beq       %[count], $t8, chk1w              \n"
-    // count=t8,no 32-byte chunk
-    " nop                                        \n"
-
-    "lw        $t0, 0(%[src])                    \n"
-    "lw        $t1, 4(%[src])                    \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "chk1w:                                      \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the reminder past 1w chunks
-    "beq       %[count], $t8, $last8             \n"
-    " subu     $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-    // copying in words (4-byte chunks)
-    "$wordCopy_loop:                             \n"
-    "lw        $t3, 0(%[src])                    \n"
-    // the first t3 may be equal t0 ... optimize?
-    "addiu     %[src], %[src],4                  \n"
-    "addiu     %[dst], %[dst],4                  \n"
-    "bne       %[dst], $a3,$wordCopy_loop        \n"
-    " sw       $t3, -4(%[dst])                   \n"
-
-    // For the last (<8) bytes
-    "$last8:                                     \n"
-    "blez      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
-    "$last8loop:                                 \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst], $a3, $last8loop           \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "leave:                                      \n"
-    "  j       $ra                               \n"
-    "  nop                                       \n"
-
-    //
-    // UNALIGNED case
-    //
-
-    "unaligned:                                  \n"
-    // got here with a3="negu a1"
-    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
-    "beqz      $a3, $ua_chk16w                   \n"
-    " subu     %[count], %[count], $a3           \n"
-    // bytes left after initial a3 bytes
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
-    "swr       $v1, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-    // below the dst will be word aligned (NOTE1)
-    "$ua_chk16w:                                 \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, ua_chk8w           \n"
-    // if a2==t8, no 64-byte chunks
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line  addr 32
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // safe, as we have at least 64 bytes ahead
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $ua_loop16w                  \n"
-    // skip "pref 30,64(a1)" for too short arrays
-    " nop                                        \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$ua_loop16w:                                \n"
-    "pref      0, 96(%[src])                     \n"
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "bgtz      $v1, $ua_skip_pref30_96           \n"
-    " lwl      $t1, 7(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"
-    // continue setting up the dest, addr 96
-    "$ua_skip_pref30_96:                         \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    // bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lwr       $t0, 32(%[src])                   \n"
-    "lwl       $t0, 35(%[src])                   \n"
-    "lwr       $t1, 36(%[src])                   \n"
-    "bgtz      $v1, ua_skip_pref30_128           \n"
-    " lwl      $t1, 39(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"
-    // continue setting up the dest, addr 128
-    "ua_skip_pref30_128:                         \n"
-
-    "lwr       $t2, 40(%[src])                   \n"
-    "lwl       $t2, 43(%[src])                   \n"
-    "lwr       $t3, 44(%[src])                   \n"
-    "lwl       $t3, 47(%[src])                   \n"
-    "lwr       $t4, 48(%[src])                   \n"
-    "lwl       $t4, 51(%[src])                   \n"
-    "lwr       $t5, 52(%[src])                   \n"
-    "lwl       $t5, 55(%[src])                   \n"
-    "lwr       $t6, 56(%[src])                   \n"
-    "lwl       $t6, 59(%[src])                   \n"
-    "lwr       $t7, 60(%[src])                   \n"
-    "lwl       $t7, 63(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
-    "sgtu      $v1,%[dst],$t9                    \n"
-    "bne       %[dst],$a3,$ua_loop16w            \n"
-    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
-    "move      %[count],$t8                      \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "ua_chk8w:                                   \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // the t8 is the reminder count
-    "beq       %[count], $t8, $ua_chk1w          \n"
-    // when count==t8, no 32-byte chunk
-
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "lwl       $t1, 7(%[src])                    \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "$ua_chk1w:                                  \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the reminder past 1w chunks
-    "beq       %[count], $t8, ua_smallCopy       \n"
-    "subu      $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-
-    // copying in words (4-byte chunks)
-    "$ua_wordCopy_loop:                          \n"
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addiu     %[src], %[src], 4                 \n"
-    "addiu     %[dst], %[dst], 4                 \n"
-    // note: dst=a1 is word aligned here, see NOTE1
-    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
-    " sw       $v1,-4(%[dst])                    \n"
-
-    // Now less than 4 bytes (value in count) left to copy
-    "ua_smallCopy:                               \n"
-    "beqz      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
-    "$ua_smallCopy_loop:                         \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "j         $ra                               \n"
-    " nop                                        \n"
-    ".set      at                                \n"
-    ".set      reorder                           \n"
-       : [dst] "+r" (dst), [src] "+r" (src)
-       : [count] "r" (count)
-       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
-       "t8", "t9", "a3", "v1", "at"
-  );
-}
-#endif  // HAS_COPYROW_MIPS
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                           int width) {
-  __asm__ __volatile__ (
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-    "blez            $t4, 2f                       \n"
-    " andi           %[width], %[width], 0xf       \n"  // residual
-
-  "1:                                              \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
-    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-    "sw              $t9, 0(%[dst_v])              \n"
-    "sw              $t0, 0(%[dst_u])              \n"
-    "sw              $t1, 4(%[dst_v])              \n"
-    "sw              $t2, 4(%[dst_u])              \n"
-    "sw              $t3, 8(%[dst_v])              \n"
-    "sw              $t5, 8(%[dst_u])              \n"
-    "sw              $t6, 12(%[dst_v])             \n"
-    "sw              $t7, 12(%[dst_u])             \n"
-    "addiu           %[dst_v], %[dst_v], 16        \n"
-    "bgtz            $t4, 1b                       \n"
-    " addiu          %[dst_u], %[dst_u], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-  "2:                                              \n"
-    "lbu             $t0, 0(%[src_uv])             \n"
-    "lbu             $t1, 1(%[src_uv])             \n"
-    "addiu           %[src_uv], %[src_uv], 2       \n"
-    "addiu           %[width], %[width], -1        \n"
-    "sb              $t0, 0(%[dst_u])              \n"
-    "sb              $t1, 0(%[dst_v])              \n"
-    "addiu           %[dst_u], %[dst_u], 1         \n"
-    "bgtz            %[width], 2b                  \n"
-    " addiu          %[dst_v], %[dst_v], 1         \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-     : [src_uv] "+r" (src_uv),
-       [width] "+r" (width),
-       [dst_u] "+r" (dst_u),
-       [dst_v] "+r" (dst_v)
-     :
-     : "t0", "t1", "t2", "t3",
-     "t4", "t5", "t6", "t7", "t8", "t9"
-  );
-}
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
-  __asm__ __volatile__ (
-    ".set push                             \n"
-    ".set noreorder                        \n"
-
-    "srl       $t4, %[width], 4            \n"  // multiplies of 16
-    "andi      $t5, %[width], 0xf          \n"
-    "blez      $t4, 2f                     \n"
-    " addu     %[src], %[src], %[width]    \n"  // src += width
-
-   "1:                                     \n"
-    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
-    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
-    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
-    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
-    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
-    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
-    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
-    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
-    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
-    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
-    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
-    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
-    "addiu     %[src], %[src], -16         \n"
-    "addiu     $t4, $t4, -1                \n"
-    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
-    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
-    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
-    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
-    "bgtz      $t4, 1b                     \n"
-    " addiu    %[dst], %[dst], 16          \n"
-    "beqz      $t5, 3f                     \n"
-    " nop                                  \n"
-
-   "2:                                     \n"
-    "lbu       $t0, -1(%[src])             \n"
-    "addiu     $t5, $t5, -1                \n"
-    "addiu     %[src], %[src], -1          \n"
-    "sb        $t0, 0(%[dst])              \n"
-    "bgez      $t5, 2b                     \n"
-    " addiu    %[dst], %[dst], 1           \n"
-
-   "3:                                     \n"
-    ".set pop                              \n"
-      : [src] "+r" (src), [dst] "+r" (dst)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4", "t5"
-  );
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                            int width) {
-  int x = 0;
-  int y = 0;
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "addu            $t4, %[width], %[width]      \n"
-    "srl             %[x], %[width], 4            \n"
-    "andi            %[y], %[width], 0xf          \n"
-    "blez            %[x], 2f                     \n"
-    " addu           %[src_uv], %[src_uv], $t4    \n"
-
-   "1:                                            \n"
-    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
-    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
-    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
-    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
-    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
-    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
-    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
-    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
-
-    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
-    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
-    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
-    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
-    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
-    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
-    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
-    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
-    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
-    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
-    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
-    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
-    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
-    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
-    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
-    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
-    "addiu           %[src_uv], %[src_uv], -32    \n"
-    "addiu           %[x], %[x], -1               \n"
-    "swr             $t4, 0(%[dst_u])             \n"
-    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
-    "swr             $t6, 0(%[dst_v])             \n"
-    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
-    "swr             $t2, 4(%[dst_u])             \n"
-    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
-    "swr             $t3, 4(%[dst_v])             \n"
-    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
-    "swr             $t0, 8(%[dst_u])             \n"
-    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
-    "swr             $t1, 8(%[dst_v])             \n"
-    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
-    "swr             $t9, 12(%[dst_u])            \n"
-    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
-    "swr             $t5, 12(%[dst_v])            \n"
-    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
-    "addiu           %[dst_v], %[dst_v], 16       \n"
-    "bgtz            %[x], 1b                     \n"
-    " addiu          %[dst_u], %[dst_u], 16       \n"
-    "beqz            %[y], 3f                     \n"
-    " nop                                         \n"
-    "b               2f                           \n"
-    " nop                                         \n"
-
-   "2:                                            \n"
-    "lbu             $t0, -2(%[src_uv])           \n"
-    "lbu             $t1, -1(%[src_uv])           \n"
-    "addiu           %[src_uv], %[src_uv], -2     \n"
-    "addiu           %[y], %[y], -1               \n"
-    "sb              $t0, 0(%[dst_u])             \n"
-    "sb              $t1, 0(%[dst_v])             \n"
-    "addiu           %[dst_u], %[dst_u], 1        \n"
-    "bgtz            %[y], 2b                     \n"
-    " addiu          %[dst_v], %[dst_v], 1        \n"
-
-   "3:                                            \n"
-    ".set pop                                     \n"
-      : [src_uv] "+r" (src_uv),
-        [dst_u] "+r" (dst_u),
-        [dst_v] "+r" (dst_v),
-        [x] "=&r" (x),
-        [y] "+r" (y)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4",
-      "t5", "t7", "t8", "t9"
-  );
-}
-
-// Convert (4 Y and 2 VU) I422 and arrange RGB values into
-// t5 = | 0 | B0 | 0 | b0 |
-// t4 = | 0 | B1 | 0 | b1 |
-// t9 = | 0 | G0 | 0 | g0 |
-// t8 = | 0 | G1 | 0 | g1 |
-// t2 = | 0 | R0 | 0 | r0 |
-// t1 = | 0 | R1 | 0 | r1 |
-#define YUVTORGB                                                               \
-      "lw                $t0, 0(%[y_buf])       \n"                            \
-      "lhu               $t1, 0(%[u_buf])       \n"                            \
-      "lhu               $t2, 0(%[v_buf])       \n"                            \
-      "preceu.ph.qbr     $t1, $t1               \n"                            \
-      "preceu.ph.qbr     $t2, $t2               \n"                            \
-      "preceu.ph.qbra    $t3, $t0               \n"                            \
-      "preceu.ph.qbla    $t0, $t0               \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t3, $t3, $s4          \n"                            \
-      "subu.ph           $t0, $t0, $s4          \n"                            \
-      "mul.ph            $t3, $t3, $s0          \n"                            \
-      "mul.ph            $t0, $t0, $s0          \n"                            \
-      "shll.ph           $t4, $t1, 0x7          \n"                            \
-      "subu.ph           $t4, $t4, $t1          \n"                            \
-      "mul.ph            $t6, $t1, $s1          \n"                            \
-      "mul.ph            $t1, $t2, $s2          \n"                            \
-      "addq_s.ph         $t5, $t4, $t3          \n"                            \
-      "addq_s.ph         $t4, $t4, $t0          \n"                            \
-      "shra.ph           $t5, $t5, 6            \n"                            \
-      "shra.ph           $t4, $t4, 6            \n"                            \
-      "addiu             %[u_buf], 2            \n"                            \
-      "addiu             %[v_buf], 2            \n"                            \
-      "addu.ph           $t6, $t6, $t1          \n"                            \
-      "mul.ph            $t1, $t2, $s3          \n"                            \
-      "addu.ph           $t9, $t6, $t3          \n"                            \
-      "addu.ph           $t8, $t6, $t0          \n"                            \
-      "shra.ph           $t9, $t9, 6            \n"                            \
-      "shra.ph           $t8, $t8, 6            \n"                            \
-      "addu.ph           $t2, $t1, $t3          \n"                            \
-      "addu.ph           $t1, $t1, $t0          \n"                            \
-      "shra.ph           $t2, $t2, 6            \n"                            \
-      "shra.ph           $t1, $t1, 6            \n"                            \
-      "subu.ph           $t5, $t5, $s5          \n"                            \
-      "subu.ph           $t4, $t4, $s5          \n"                            \
-      "subu.ph           $t9, $t9, $s5          \n"                            \
-      "subu.ph           $t8, $t8, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "shll_s.ph         $t5, $t5, 8            \n"                            \
-      "shll_s.ph         $t4, $t4, 8            \n"                            \
-      "shll_s.ph         $t9, $t9, 8            \n"                            \
-      "shll_s.ph         $t8, $t8, 8            \n"                            \
-      "shll_s.ph         $t2, $t2, 8            \n"                            \
-      "shll_s.ph         $t1, $t1, 8            \n"                            \
-      "shra.ph           $t5, $t5, 8            \n"                            \
-      "shra.ph           $t4, $t4, 8            \n"                            \
-      "shra.ph           $t9, $t9, 8            \n"                            \
-      "shra.ph           $t8, $t8, 8            \n"                            \
-      "shra.ph           $t2, $t2, 8            \n"                            \
-      "shra.ph           $t1, $t1, 8            \n"                            \
-      "addu.ph           $t5, $t5, $s5          \n"                            \
-      "addu.ph           $t4, $t4, $s5          \n"                            \
-      "addu.ph           $t9, $t9, $s5          \n"                            \
-      "addu.ph           $t8, $t8, $s5          \n"                            \
-      "addu.ph           $t2, $t2, $s5          \n"                            \
-      "addu.ph           $t1, $t1, $s5          \n"
-
-// TODO(fbarchard): accept yuv conversion constants.
-void I422ToARGBRow_DSPR2(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
-  __asm__ __volatile__ (
-    ".set push                                \n"
-    ".set noreorder                           \n"
-    "beqz              %[width], 2f           \n"
-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
-    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
-    "lui               $s6, 0xff00            \n"
-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
-
-   "1:                                        \n"
-      YUVTORGB
-// Arranging into argb format
-    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
-    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
-    "addiu             %[width], -4           \n"
-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
-
-    "addiu             %[y_buf], 4            \n"
-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
-    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
-    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
-    "sll               $t9, $t9, 16           \n"
-    "sll               $t8, $t8, 16           \n"
-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
-// Store results.
-    "sw                $t2, 0(%[rgb_buf])     \n"
-    "sw                $t0, 4(%[rgb_buf])     \n"
-    "sw                $t1, 8(%[rgb_buf])     \n"
-    "sw                $t3, 12(%[rgb_buf])    \n"
-    "bnez              %[width], 1b           \n"
-    " addiu            %[rgb_buf], 16         \n"
-   "2:                                        \n"
-    ".set pop                                 \n"
-      :[y_buf] "+r" (y_buf),
-       [u_buf] "+r" (u_buf),
-       [v_buf] "+r" (v_buf),
-       [width] "+r" (width),
-       [rgb_buf] "+r" (rgb_buf)
-      :
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-      "t6", "t7", "t8", "t9",
-      "s0", "s1", "s2", "s3",
-      "s4", "s5", "s6"
-  );
-}
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                               ptrdiff_t src_stride, int dst_width,
-                               int source_y_fraction) {
-    int y0_fraction = 256 - source_y_fraction;
-    const uint8* src_ptr1 = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-     ".set push                                           \n"
-     ".set noreorder                                      \n"
-
-     "replv.ph          $t0, %[y0_fraction]               \n"
-     "replv.ph          $t1, %[source_y_fraction]         \n"
-
-   "1:                                                    \n"
-     "lw                $t2, 0(%[src_ptr])                \n"
-     "lw                $t3, 0(%[src_ptr1])               \n"
-     "lw                $t4, 4(%[src_ptr])                \n"
-     "lw                $t5, 4(%[src_ptr1])               \n"
-     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
-     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
-     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
-     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
-     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
-     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
-     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
-     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
-     "addq.ph           $t6, $t6, $t8                     \n"
-     "addq.ph           $t7, $t7, $t9                     \n"
-     "addq.ph           $t2, $t2, $t4                     \n"
-     "addq.ph           $t3, $t3, $t5                     \n"
-     "shra.ph           $t6, $t6, 8                       \n"
-     "shra.ph           $t7, $t7, 8                       \n"
-     "shra.ph           $t2, $t2, 8                       \n"
-     "shra.ph           $t3, $t3, 8                       \n"
-     "precr.qb.ph       $t6, $t6, $t7                     \n"
-     "precr.qb.ph       $t2, $t2, $t3                     \n"
-     "addiu             %[src_ptr], %[src_ptr], 8         \n"
-     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
-     "addiu             %[dst_width], %[dst_width], -8    \n"
-     "sw                $t6, 0(%[dst_ptr])                \n"
-     "sw                $t2, 4(%[dst_ptr])                \n"
-     "bgtz              %[dst_width], 1b                  \n"
-     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
-
-     ".set pop                                            \n"
-  : [dst_ptr] "+r" (dst_ptr),
-    [src_ptr1] "+r" (src_ptr1),
-    [src_ptr] "+r" (src_ptr),
-    [dst_width] "+r" (dst_width)
-  : [source_y_fraction] "r" (source_y_fraction),
-    [y0_fraction] "r" (y0_fraction),
-    [src_stride] "r" (src_stride)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-#endif  // __mips_dsp_rev >= 2
-
-#endif  // defined(__mips__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_neon.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_neon.cc
deleted file mode 100755
index 91d6aa8..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_neon.cc
+++ /dev/null
@@ -1,2828 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-    !defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.32    {d2[1]}, [%2]!                 \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vzip.u8    d2, d3                         \n"
-
-// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.8     {d3}, [%2]!                    \n"                             \
-    "vpaddl.u8  q1, q1                         \n"                             \
-    "vrshrn.u16 d2, q1, #1                     \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    "vmov.u8    d2, #128                       \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d3, d2                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
-
-// Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
-
-// Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
-    "vmov.u8    d0, d3                         \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
-
-#define YUVTORGB_SETUP                                                         \
-    MEMACCESS([kUVToRB])                                                       \
-    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
-    MEMACCESS([kUVToG])                                                        \
-    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
-    MEMACCESS([kYToRgb])                                                       \
-    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
-
-#define YUVTORGB                                                               \
-    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
-    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
-    "vmovl.u8   q0, d0                         \n" /* Y                      */\
-    "vmovl.s16  q10, d1                        \n"                             \
-    "vmovl.s16  q0, d0                         \n"                             \
-    "vmul.s32   q10, q10, q15                  \n"                             \
-    "vmul.s32   q0, q0, q15                    \n"                             \
-    "vqshrun.s32 d0, q0, #16                   \n"                             \
-    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
-    "vadd.s16   d18, d19                       \n"                             \
-    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
-    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
-    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
-    "vaddw.u16  q1, q1, d16                    \n"                             \
-    "vaddw.u16  q10, q10, d17                  \n"                             \
-    "vaddw.u16  q3, q3, d18                    \n"                             \
-    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
-    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
-    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
-    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
-    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
-    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
-    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
-    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
-    "vqshrun.s16 d21, q0, #6                   \n" /* G */
-
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             const uint8* src_a,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %5, %5, #8                     \n"
-    MEMACCESS(3)
-    "vld1.8     {d23}, [%3]!                   \n"
-    MEMACCESS(4)
-    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(src_a),     // %3
-      "+r"(dst_argb),  // %4
-      "+r"(width)      // %5
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
-    MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_rgba),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),      // %0
-      "+r"(src_u),      // %1
-      "+r"(src_v),      // %2
-      "+r"(dst_rgb24),  // %3
-      "+r"(width)       // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#define ARGBTORGB565                                                           \
-    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \
-    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \
-    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */
-
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#define ARGBTOARGB1555                                                         \
-    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \
-    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \
-    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \
-    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \
-    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */
-
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#define ARGBTOARGB4444                                                         \
-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
-
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB4444
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb4444),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV400
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
-      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
-      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
-      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d20}, [%0]!                   \n"
-    "vmov       d21, d20                       \n"
-    "vmov       d22, d20                       \n"
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "d20", "d21", "d22", "d23"
-  );
-}
-
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READNV21
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_vu),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUY2
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_yuy2),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READUYVY
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_uyvy),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
-    MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
-    "bgt        1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
-    "bgt        1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
-    "subs       %2, %2, #32                    \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "q0"
-  );
-}
-
-// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "q0"
-  );
-}
-
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
-}
-
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r12, #-16                      \n"
-    "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
-  );
-}
-
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
-}
-
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
-  );
-}
-
-#define RGB565TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
-
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-
-#define ARGB1555TOARGB                                                         \
-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
-
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-
-#define ARGB4444TOARGB                                                         \
-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
-
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
-}
-
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
-}
-
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
-}
-
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(stride_yuy2),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
-}
-
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(stride_uyvy),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
-}
-
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
-}
-
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
-}
-
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vdup.32    d2, %2                         \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d20, d20, d2                   \n"
-    "vqadd.u8   d21, d21, d2                   \n"
-    "vqadd.u8   d22, d22, d2                   \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
-                            int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
-                            int width) {
-  asm volatile (
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
-}
-
-// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
-    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
-    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
-    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
-    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlsl.u8   q2, d1, d25                    \n"  // G
-    "vmlsl.u8   q2, d2, d26                    \n"  // R
-    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
-
-    "vmull.u8   q3, d2, d24                    \n"  // R
-    "vmlsl.u8   q3, d1, d28                    \n"  // G
-    "vmlsl.u8   q3, d0, d27                    \n"  // B
-    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
-
-    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
-
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
-    "vpadd.u16  d1, d8, d9                     \n"  // B
-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
-    "vpadd.u16  d3, d10, d11                   \n"  // G
-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
-    "vpadd.u16  d5, d12, d13                   \n"  // R
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
-
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_stride_argb),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
-    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
-    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
-    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
-    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_stride_argb),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
-    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
-    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
-    "vrshr.u16  q2, q2, #1                     \n"
-    "vrshr.u16  q3, q3, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q3, q2, q1)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(src_stride_bgra),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(src_stride_abgr),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
-    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
-    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(src_stride_rgba),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
-    MEMACCESS(0)
-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
-    MEMACCESS(1)
-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(src_stride_rgb24),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_raw
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
-    MEMACCESS(0)
-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
-    MEMACCESS(1)
-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(src_stride_raw),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_stride_rgb565),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_stride_argb1555),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_stride_argb4444),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
-}
-
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
-}
-
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
-}
-
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // R
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // R
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // B
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  asm volatile (
-    "cmp        %4, #0                         \n"
-    "beq        100f                           \n"
-    "add        %2, %1                         \n"
-    "cmp        %4, #128                       \n"
-    "beq        50f                            \n"
-
-    "vdup.8     d5, %4                         \n"
-    "rsb        %4, #256                       \n"
-    "vdup.8     d4, %4                         \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vmull.u8   q13, d0, d4                    \n"
-    "vmull.u8   q14, d1, d4                    \n"
-    "vmlal.u8   q13, d2, d5                    \n"
-    "vmlal.u8   q14, d3, d5                    \n"
-    "vrshrn.u16 d0, q13, #8                    \n"
-    "vrshrn.u16 d1, q14, #8                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        1b                             \n"
-    "b          99f                            \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        50b                            \n"
-    "b          99f                            \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        100b                           \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction)       // %4
-  :
-  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
-  );
-}
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %3, #8                         \n"
-    "blt        89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
-    "bge        8b                             \n"
-
-  "89:                                         \n"
-    "adds       %3, #8-1                       \n"
-    "blt        99f                            \n"
-
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
-    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
-    "bge        1b                             \n"
-
-  "99:                                         \n"
-
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
-  );
-}
-
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d0, d3                    \n"  // b * a
-    "vmull.u8   q11, d1, d3                    \n"  // g * a
-    "vmull.u8   q12, d2, d3                    \n"  // r * a
-    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
-    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
-    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
-  );
-}
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "vdup.u16   q8, %2                         \n"
-    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
-    "vdup.u16   q9, %3                         \n"  // interval multiply.
-    "vdup.u16   q10, %4                        \n"  // interval add
-
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
-    "vmovl.u8   q1, d2                         \n"
-    "vmovl.u8   q2, d4                         \n"
-    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
-    "vqdmulh.s16 q1, q1, q8                    \n"  // g
-    "vqdmulh.s16 q2, q2, q8                    \n"  // r
-    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
-    "vmul.u16   q1, q1, q9                     \n"  // g
-    "vmul.u16   q2, q2, q9                     \n"  // r
-    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
-    "vadd.u16   q1, q1, q10                    \n"  // g
-    "vadd.u16   q2, q2, q10                    \n"  // r
-    "vqmovn.u16 d0, q0                         \n"
-    "vqmovn.u16 d2, q1                         \n"
-    "vqmovn.u16 d4, q2                         \n"
-    MEMACCESS(0)
-    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
-  );
-}
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
-    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
-    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
-
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
-    "vmovl.u8   q11, d22                       \n"
-    "vmovl.u8   q12, d24                       \n"
-    "vmovl.u8   q13, d26                       \n"
-    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
-    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
-    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
-    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
-    "vqmovn.u16 d20, q10                       \n"
-    "vqmovn.u16 d22, q11                       \n"
-    "vqmovn.u16 d24, q12                       \n"
-    "vqmovn.u16 d26, q13                       \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
-  );
-}
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
-    "vmov       d1, d0                         \n"  // G
-    "vmov       d2, d0                         \n"  // R
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
-}
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-//    b = (r * 35 + g * 68 + b * 17) >> 7
-//    g = (r * 45 + g * 88 + b * 22) >> 7
-//    r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d20, #17                       \n"  // BB coefficient
-    "vmov.u8    d21, #68                       \n"  // BG coefficient
-    "vmov.u8    d22, #35                       \n"  // BR coefficient
-    "vmov.u8    d24, #22                       \n"  // GB coefficient
-    "vmov.u8    d25, #88                       \n"  // GG coefficient
-    "vmov.u8    d26, #45                       \n"  // GR coefficient
-    "vmov.u8    d28, #24                       \n"  // BB coefficient
-    "vmov.u8    d29, #98                       \n"  // BG coefficient
-    "vmov.u8    d30, #50                       \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
-    "vmlal.u8   q2, d1, d21                    \n"  // G
-    "vmlal.u8   q2, d2, d22                    \n"  // R
-    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
-    "vmlal.u8   q3, d1, d25                    \n"  // G
-    "vmlal.u8   q3, d2, d26                    \n"  // R
-    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
-    "vmlal.u8   q8, d1, d29                    \n"  // G
-    "vmlal.u8   q8, d2, d30                    \n"  // R
-    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
-    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
-    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
-// needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
-    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
-    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
-    "vmovl.u8   q9, d18                        \n"  // g
-    "vmovl.u8   q10, d20                       \n"  // r
-    "vmovl.u8   q11, d22                       \n"  // a
-    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
-    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
-    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
-    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
-    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
-    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
-    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
-    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
-    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
-    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
-    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
-    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
-    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
-    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
-    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
-    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
-    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
-}
-#endif  // HAS_ARGBMULTIPLYROW_NEON
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
-}
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
-}
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
-}
-
-// SobelX as a matrix is
-// -1  0  1
-// -2  0  2
-// -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
-    MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
-    "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2),            // %5
-    "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// SobelY as a matrix is
-// -1 -2 -1
-//  0  0  0
-//  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
-    "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1),            // %4
-    "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_neon64.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_neon64.cc
deleted file mode 100755
index ee42af1..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_neon64.cc
+++ /dev/null
@@ -1,2961 +0,0 @@
-/*
- *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.s}[0], [%1], #4            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.s}[1], [%2], #4            \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.h}[0], [%1], #2            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v2.h}[1], [%2], #2            \n"                             \
-    "zip1       v1.8b, v2.8b, v2.8b            \n"
-
-// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.d}[0], [%1], #8            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.d}[1], [%2], #8            \n"                             \
-    "uaddlp     v1.8h, v1.16b                  \n"                             \
-    "rshrn      v1.8b, v1.8h, #1               \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    "movi       v1.8b , #128                   \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
-
-// Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
-    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
-    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
-
-// Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
-    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
-
-#define YUVTORGB_SETUP                                                         \
-    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
-    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
-    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
-    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
-
-#define YUVTORGB(vR, vG, vB)                                                   \
-    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
-    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
-    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
-    "ushll      v0.4s, v0.4h, #0               \n"                             \
-    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
-    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
-    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
-    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
-    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
-    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
-    "uxtl       v2.8h, v2.8b                   \n"                             \
-    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
-    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
-    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
-    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
-    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
-    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
-    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
-    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
-    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
-    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
-    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
-    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
-    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
-    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
-    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
-
-#ifdef HAS_I444TOARGBROW_NEON
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I444TOARGBROW_NEON
-
-#ifdef HAS_I422TOARGBROW_NEON
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TOARGBROW_NEON
-
-#ifdef HAS_I422ALPHATOARGBROW_NEON
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             const uint8* src_a,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    MEMACCESS(3)
-    "ld1        {v23.8b}, [%3], #8             \n"
-    "subs       %w5, %w5, #8                   \n"
-    MEMACCESS(4)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(src_a),     // %3
-      "+r"(dst_argb),  // %4
-      "+r"(width)      // %5
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422ALPHATOARGBROW_NEON
-
-#ifdef HAS_I411TOARGBROW_NEON
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I411TOARGBROW_NEON
-
-#ifdef HAS_I422TORGBAROW_NEON
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v20.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v23, v22, v21)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_rgba),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TORGBAROW_NEON
-
-#ifdef HAS_I422TORGB24ROW_NEON
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_rgb24), // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TORGB24ROW_NEON
-
-#define ARGBTORGB565                                                           \
-    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
-
-#ifdef HAS_I422TORGB565ROW_NEON
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TORGB565ROW_NEON
-
-#define ARGBTOARGB1555                                                         \
-    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
-    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
-    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
-
-#ifdef HAS_I422TOARGB1555ROW_NEON
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TOARGB1555ROW_NEON
-
-#define ARGBTOARGB4444                                                         \
-    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
-    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
-    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
-    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
-    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
-    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
-    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
-    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
-
-#ifdef HAS_I422TOARGB4444ROW_NEON
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v23.8b, #255                   \n"
-    ARGBTOARGB4444
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb4444),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TOARGB4444ROW_NEON
-
-#ifdef HAS_I400TOARGBROW_NEON
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  int64 width64 = (int64)(width);
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READYUV400
-    YUVTORGB(v22, v21, v20)
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width64)    // %2
-    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
-      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
-      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
-      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I400TOARGBROW_NEON
-
-#ifdef HAS_J400TOARGBROW_NEON
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v20.8b}, [%0], #8             \n"
-    "orr        v21.8b, v20.8b, v20.8b         \n"
-    "orr        v22.8b, v20.8b, v20.8b         \n"
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_J400TOARGBROW_NEON
-
-#ifdef HAS_NV12TOARGBROW_NEON
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READNV12
-    YUVTORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    MEMACCESS(2)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_NV12TOARGBROW_NEON
-
-#ifdef HAS_NV12TOARGBROW_NEON
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READNV21
-    YUVTORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    MEMACCESS(2)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_vu),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_NV12TOARGBROW_NEON
-
-#ifdef HAS_NV12TORGB565ROW_NEON
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READNV12
-    YUVTORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_NV12TORGB565ROW_NEON
-
-#ifdef HAS_YUY2TOARGBROW_NEON
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  int64 width64 = (int64)(width);
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READYUY2
-    YUVTORGB(v22, v21, v20)
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_yuy2),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width64)    // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_YUY2TOARGBROW_NEON
-
-#ifdef HAS_UYVYTOARGBROW_NEON
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  int64 width64 = (int64)(width);
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READUYVY
-    YUVTORGB(v22, v21, v20)
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_uyvy),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width64)    // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_UYVYTOARGBROW_NEON
-
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-#ifdef HAS_SPLITUVROW_NEON
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store U
-    MEMACCESS(2)
-    "st1        {v1.16b}, [%2], #16            \n"  // store V
-    "b.gt       1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_SPLITUVROW_NEON
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-#ifdef HAS_MERGEUVROW_NEON
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load U
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load V
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
-    "b.gt       1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_MERGEUVROW_NEON
-
-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-#ifdef HAS_COPYROW_NEON
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
-    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_COPYROW_NEON
-
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt      1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "v0"
-  );
-}
-
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt      1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "v0"
-  );
-}
-
-#ifdef HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  int64 width64 = (int64) width;
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %2                     \n"
-    "sub        %0, %0, #16                    \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
-    "rev64      v0.16b, v0.16b                 \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width64)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
-}
-#endif  // HAS_MIRRORROW_NEON
-
-#ifdef HAS_MIRRORUVROW_NEON
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width) {
-  int64 width64 = (int64) width;
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, %0, #16                    \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
-    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
-    "rev64      v0.8b, v0.8b                   \n"
-    "rev64      v1.8b, v1.8b                   \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width64)    // %3
-  : "r"((ptrdiff_t)-16)      // %4
-  : "cc", "memory", "v0", "v1"
-  );
-}
-#endif  // HAS_MIRRORUVROW_NEON
-
-#ifdef HAS_ARGBMIRRORROW_NEON
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  int64 width64 = (int64) width;
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, %0, #16                    \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
-    "rev64      v0.4s, v0.4s                   \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width64)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
-}
-#endif  // HAS_ARGBMIRRORROW_NEON
-
-#ifdef HAS_RGB24TOARGBROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v4.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-#endif  // HAS_RGB24TOARGBROW_NEON
-
-#ifdef HAS_RAWTOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v5.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
-}
-#endif  // HAS_RAWTOARGBROW_NEON
-
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-
-#define RGB565TOARGB                                                           \
-    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
-    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
-    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
-    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
-    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
-    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
-    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
-    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
-
-#ifdef HAS_RGB565TOARGBROW_NEON
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
-  );
-}
-#endif  // HAS_RGB565TOARGBROW_NEON
-
-#define ARGB1555TOARGB                                                         \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
-                                                                               \
-    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
-    "xtn2       v3.16b, v2.8h                  \n"                             \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
-    "dup        v1.2D, v0.D[1]                 \n"                             \
-    "dup        v3.2D, v2.D[1]                 \n"
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
-    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
-
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_ARGB1555TOARGBROW_NEON
-
-#define ARGB4444TOARGB                                                         \
-    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
-    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
-    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
-    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
-    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
-    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
-    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
-    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
-    "dup        v0.2D, v2.D[1]                 \n"                             \
-    "dup        v1.2D, v3.D[1]                 \n"
-
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-#endif  // HAS_ARGB4444TOARGBROW_NEON
-
-#ifdef HAS_ARGBTORGB24ROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBTORGB24ROW_NEON
-
-#ifdef HAS_ARGBTORAWROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
-    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
-    MEMACCESS(1)
-    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBTORAWROW_NEON
-
-#ifdef HAS_YUY2TOYROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_YUY2TOYROW_NEON
-
-#ifdef HAS_UYVYTOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_UYVYTOYROW_NEON
-
-#ifdef HAS_YUY2TOUV422ROW_NEON
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_YUY2TOUV422ROW_NEON
-
-#ifdef HAS_UYVYTOUV422ROW_NEON
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_UYVYTOUV422ROW_NEON
-
-#ifdef HAS_YUY2TOUVROW_NEON
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
-    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(src_yuy2b),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
-}
-#endif  // HAS_YUY2TOUVROW_NEON
-
-#ifdef HAS_UYVYTOUVROW_NEON
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
-    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(src_uyvyb),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
-}
-#endif  // HAS_UYVYTOUVROW_NEON
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
-    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
-    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_NEON
-
-#ifdef HAS_I422TOYUY2ROW_NEON
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
-    "orr        v2.8b, v1.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
-}
-#endif  // HAS_I422TOYUY2ROW_NEON
-
-#ifdef HAS_I422TOUYVYROW_NEON
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
-    "orr        v3.8b, v2.8b, v2.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
-}
-#endif  // HAS_I422TOUYVYROW_NEON
-
-#ifdef HAS_ARGBTORGB565ROW_NEON
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTORGB565ROW_NEON
-
-#ifdef HAS_ARGBTORGB565DITHERROW_NEON
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "dup        v1.4s, %w2                     \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v20.8b, v20.8b, v1.8b          \n"
-    "uqadd      v21.8b, v21.8b, v1.8b          \n"
-    "uqadd      v22.8b, v22.8b, v1.8b          \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTORGB565ROW_NEON
-
-#ifdef HAS_ARGBTOARGB1555ROW_NEON
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
-                            int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTOARGB1555ROW_NEON
-
-#ifdef HAS_ARGBTOARGB4444ROW_NEON
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
-                            int width) {
-  asm volatile (
-    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTOARGB4444ROW_NEON
-
-#ifdef HAS_ARGBTOYROW_NEON
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGBTOYROW_NEON
-
-#ifdef HAS_ARGBTOYJROW_NEON
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
-    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
-    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
-}
-#endif  // HAS_ARGBTOYJROW_NEON
-
-// 8x1 pixels.
-#ifdef HAS_ARGBTOUV444ROW_NEON
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
-    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
-    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
-    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
-    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
-    "movi       v29.16b,#0x80                  \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
-    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
-
-    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
-    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
-    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
-    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
-
-    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v24", "v25", "v26", "v27", "v28", "v29"
-  );
-}
-#endif  // HAS_ARGBTOUV444ROW_NEON
-
-#define RGBTOUV_SETUP_REG                                                      \
-    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
-    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
-    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
-    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
-    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
-    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
-
-// 32x1 ARGB pixels -> 8x1 U and 8x1 V.  width is in argb pixels, e.g. 32.
-#ifdef HAS_ARGBTOUV411ROW_NEON
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
-    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
-    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
-    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // sum of 4 / 2 = 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"  // (rounded), same for G
-    "urshr      v2.8h, v2.8h, #1               \n"  // and for R.
-
-    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
-    "mul        v3.8h, v0.8h, v20.8h           \n"  // U  = B * UB
-    "mls        v3.8h, v1.8h, v21.8h           \n"  // U -= G * UG
-    "mls        v3.8h, v2.8h, v22.8h           \n"  // U -= R * UR
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v4.8h, v2.8h, v20.8h           \n"  // V  = R * VR
-    "mls        v4.8h, v1.8h, v24.8h           \n"  // V -= G * VG
-    "mls        v4.8h, v0.8h, v23.8h           \n"  // V -= B * VB
-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ARGBTOUV411ROW_NEON
-
-// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) /* uses v20-v25; U -> v0.8b, V -> v1.8b */ \
-    "mul        v3.8h, " #QB ",v20.8h          \n"  /* U  = B * UB          */ \
-    "mul        v4.8h, " #QR ",v20.8h          \n"  /* V  = R * VR          */ \
-    "mls        v3.8h, " #QG ",v21.8h          \n"  /* U -= G * UG          */ \
-    "mls        v4.8h, " #QG ",v24.8h          \n"  /* V -= G * VG          */ \
-    "mls        v3.8h, " #QR ",v22.8h          \n"  /* U -= R * UR          */ \
-    "mls        v4.8h, " #QB ",v23.8h          \n"  /* V -= B * VB          */ \
-    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
-    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
-
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-// TODO(fbarchard): consider ptrdiff_t for all strides.
-
-#ifdef HAS_ARGBTOUVROW_NEON
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // 16 of 2nd row.
-    "uadalp     v0.8h, v4.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v1.8h, v5.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v2.8h, v6.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum / 2 = 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ARGBTOUVROW_NEON
-
-// TODO(fbarchard): Subsample match C code.
-#ifdef HAS_ARGBTOUVJROW_NEON
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row.
-  asm volatile (
-    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
-    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
-    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
-    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
-    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
-    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // 16 of 2nd row.
-    "uadalp     v0.8h, v4.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v1.8h, v5.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v2.8h, v6.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum / 2 = 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ARGBTOUVJROW_NEON
-
-#ifdef HAS_BGRATOUVROW_NEON
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;  // second source row.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // 16 of 2nd row.
-    "uadalp     v0.8h, v7.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v3.8h, v6.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v2.8h, v5.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum / 2 = 2x average
-    "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1.
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_bgra),  // %0
-    "+r"(src_bgra_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_BGRATOUVROW_NEON
-
-#ifdef HAS_ABGRTOUVROW_NEON
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second source row.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // 16 of 2nd row.
-    "uadalp     v3.8h, v6.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v2.8h, v5.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v1.8h, v4.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average; B -> v0.
-    "urshr      v2.8h, v2.8h, #1               \n"
-    "urshr      v1.8h, v1.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v0.8h, v2.8h, v1.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_abgr),  // %0
-    "+r"(src_abgr_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ABGRTOUVROW_NEON
-
-#ifdef HAS_RGBATOUVROW_NEON
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second source row.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // 16 of 2nd row.
-    "uadalp     v0.8h, v5.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v1.8h, v6.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v2.8h, v7.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum / 2 = 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_rgba),  // %0
-    "+r"(src_rgba_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_RGBATOUVROW_NEON
-
-#ifdef HAS_RGB24TOUVROW_NEON
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second row.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // 16 of 2nd row.
-    "uadalp     v0.8h, v4.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v1.8h, v5.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v2.8h, v6.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum / 2 = 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_rgb24),  // %0
-    "+r"(src_rgb24_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_RGB24TOUVROW_NEON
-
-#ifdef HAS_RAWTOUVROW_NEON
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_raw_1 = src_raw + src_stride_raw;  // second source row.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
-    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // 16 RAW of 2nd row.
-    "uadalp     v2.8h, v6.16b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v1.8h, v5.16b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v0.8h, v4.16b                  \n"  // R += 2nd row pair sums.
-
-    "urshr      v2.8h, v2.8h, #1               \n"  // 2x2 sum / 2 = 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v0.8h, v0.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels wide per loop.
-    RGBTOUV(v2.8h, v1.8h, v0.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_raw),  // %0
-    "+r"(src_raw_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_RAWTOUVROW_NEON
-
-// 16x2 pixels -> 8x1.  width is number of rgb565 pixels. e.g. 16.
-#ifdef HAS_RGB565TOUVROW_NEON
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second row.
-  asm volatile (
-    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
-    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
-    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
-    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
-    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
-    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // 2nd row: 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v18.4h, v1.8b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v20.4h, v2.8b                  \n"  // R += 2nd row pair sums.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // 2nd row: next 8 pixels.
-    RGB565TOARGB
-    "uadalp     v17.4h, v0.8b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v19.4h, v1.8b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v21.4h, v2.8b                  \n"  // R += 2nd row pair sums.
-
-    "ins        v16.D[1], v17.D[0]             \n"  // B: join halves -> 8 sums.
-    "ins        v18.D[1], v19.D[0]             \n"  // G: join halves -> 8 sums.
-    "ins        v20.D[1], v21.D[0]             \n"  // R: join halves -> 8 sums.
-
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v18.8h, #1              \n"
-    "urshr      v6.8h, v20.8h, #1              \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v16.8h, v4.8h, v22.8h          \n"  // U  = B * UB
-    "mls        v16.8h, v5.8h, v23.8h          \n"  // U -= G * UG
-    "mls        v16.8h, v6.8h, v24.8h          \n"  // U -= R * UR
-    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
-    "mul        v17.8h, v6.8h, v22.8h          \n"  // V  = R * VR
-    "mls        v17.8h, v5.8h, v26.8h          \n"  // V -= G * VG
-    "mls        v17.8h, v4.8h, v25.8h          \n"  // V -= B * VB
-    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_rgb565_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-    "v25", "v26", "v27"
-  );
-}
-#endif  // HAS_RGB565TOUVROW_NEON
-
-// 16x2 pixels -> 8x1.  width is number of argb1555 pixels. e.g. 16.
-#ifdef HAS_ARGB1555TOUVROW_NEON
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // row 2.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // 2nd row: 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v17.4h, v1.8b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v18.4h, v2.8b                  \n"  // R += 2nd row pair sums.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // 2nd row: next 8 pixels.
-    RGB555TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v27.4h, v1.8b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v28.4h, v2.8b                  \n"  // R += 2nd row pair sums.
-
-    "ins        v16.D[1], v26.D[0]             \n"  // B: join halves -> 8 sums.
-    "ins        v17.D[1], v27.D[0]             \n"  // G: join halves -> 8 sums.
-    "ins        v18.D[1], v28.D[0]             \n"  // R: join halves -> 8 sums.
-
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // U  = B * UB
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // U -= G * UG
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // U -= R * UR
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // V  = R * VR
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // V -= G * VG
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // V -= B * VB
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_argb1555_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
-  );
-}
-#endif  // HAS_ARGB1555TOUVROW_NEON
-
-// 16x2 pixels -> 8x1.  width is number of argb4444 pixels. e.g. 16.
-#ifdef HAS_ARGB4444TOUVROW_NEON
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // row 2.
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // 2nd row: 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v17.4h, v1.8b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v18.4h, v2.8b                  \n"  // R += 2nd row pair sums.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // 2nd row: next 8 pixels.
-    ARGB4444TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B += 2nd row pair sums.
-    "uadalp     v27.4h, v1.8b                  \n"  // G += 2nd row pair sums.
-    "uadalp     v28.4h, v2.8b                  \n"  // R += 2nd row pair sums.
-
-    "ins        v16.D[1], v26.D[0]             \n"  // B: join halves -> 8 sums.
-    "ins        v17.D[1], v27.D[0]             \n"  // G: join halves -> 8 sums.
-    "ins        v18.D[1], v28.D[0]             \n"  // R: join halves -> 8 sums.
-
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // U  = B * UB
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // U -= G * UG
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // U -= R * UR
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // V  = R * VR
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // V -= G * VG
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // V -= B * VB
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_argb4444_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
-
-  );
-}
-#endif  // HAS_ARGB4444TOUVROW_NEON
-
-#ifdef HAS_RGB565TOYROW_NEON
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
-  asm volatile (  // 8 RGB565 pixels -> 8 Y bytes per iteration.
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
-    "v24", "v25", "v26", "v27"
-  );
-}
-#endif  // HAS_RGB565TOYROW_NEON
-
-#ifdef HAS_ARGB1555TOYROW_NEON
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
-  asm volatile (  // 8 ARGB1555 pixels -> 8 Y bytes per iteration.
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGB1555TOYROW_NEON
-
-#ifdef HAS_ARGB4444TOYROW_NEON
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
-  asm volatile (  // 8 ARGB4444 pixels -> 8 Y bytes per iteration.
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
-  );
-}
-#endif  // HAS_ARGB4444TOYROW_NEON
-
-#ifdef HAS_BGRATOYROW_NEON
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (  // 8 BGRA pixels -> 8 Y bytes per iteration.
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"  // loop while width > 0.
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_BGRATOYROW_NEON
-
-#ifdef HAS_ABGRTOYROW_NEON
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_ABGRTOYROW_NEON
-
-#ifdef HAS_RGBATOYROW_NEON
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_RGBATOYROW_NEON
-
-#ifdef HAS_RGB24TOYROW_NEON
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),      // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_RGB24TOYROW_NEON
-
-#ifdef HAS_RAWTOYROW_NEON
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),    // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_RAWTOYROW_NEON
-
-// Bilinear filter 16x2 -> 16x1
-#ifdef HAS_INTERPOLATEROW_NEON
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  asm volatile (
-    "cmp        %w4, #0                        \n"
-    "b.eq       100f                           \n"
-    "cmp        %w4, #128                      \n"
-    "b.eq       50f                            \n"
-
-    "dup        v5.16b, %w4                    \n"
-    "dup        v4.16b, %w5                    \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "umull      v2.8h, v0.8b,  v4.8b           \n"
-    "umull2     v3.8h, v0.16b, v4.16b          \n"
-    "umlal      v2.8h, v1.8b,  v5.8b           \n"
-    "umlal2     v3.8h, v1.16b, v5.16b          \n"
-    "rshrn      v0.8b,  v2.8h, #8              \n"
-    "rshrn2     v0.16b, v3.8h, #8              \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       1b                             \n"
-    "b          99f                            \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       50b                            \n"
-    "b          99f                            \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       100b                           \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_ptr1),         // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction),      // %4
-    "+r"(y0_fraction)       // %5
-  :
-  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
-  );
-}
-#endif  // HAS_INTERPOLATEROW_NEON
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-#ifdef HAS_ARGBBLENDROW_NEON
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %w3, %w3, #8                   \n"
-    "b.lt       89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.ge       8b                             \n"
-
-  "89:                                         \n"
-    "adds       %w3, %w3, #8-1                 \n"
-    "b.lt       99f                            \n"
-
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
-    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
-    "b.ge       1b                             \n"
-
-  "99:                                         \n"
-
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18"
-  );
-}
-#endif  // HAS_ARGBBLENDROW_NEON
-
-// Attenuate 8 pixels at a time.
-#ifdef HAS_ARGBATTENUATEROW_NEON
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
-    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
-    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
-    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
-    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
-    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
-}
-#endif  // HAS_ARGBATTENUATEROW_NEON
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-#ifdef HAS_ARGBQUANTIZEROW_NEON
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "dup        v4.8h, %w2                     \n"
-    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
-    "dup        v5.8h, %w3                     \n"  // interval multiply.
-    "dup        v6.8h, %w4                     \n"  // interval add
-
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
-    "uxtl       v1.8h, v1.8b                   \n"
-    "uxtl       v2.8h, v2.8b                   \n"
-    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
-    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
-    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
-    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
-    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
-    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
-    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
-    "add        v1.8h, v1.8h, v6.8h            \n"  // g
-    "add        v2.8h, v2.8h, v6.8h            \n"  // r
-    "uqxtn      v0.8b, v0.8h                   \n"
-    "uqxtn      v1.8b, v1.8h                   \n"
-    "uqxtn      v2.8b, v2.8h                   \n"
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
-}
-#endif  // HAS_ARGBQUANTIZEROW_NEON
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-#ifdef HAS_ARGBSHADEROW_NEON
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
-    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
-    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
-
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
-    "uxtl       v5.8h, v5.8b                   \n"
-    "uxtl       v6.8h, v6.8b                   \n"
-    "uxtl       v7.8h, v7.8b                   \n"
-    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
-    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
-    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
-    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
-    "uqxtn      v4.8b, v4.8h                   \n"
-    "uqxtn      v5.8b, v5.8h                   \n"
-    "uqxtn      v6.8b, v6.8h                   \n"
-    "uqxtn      v7.8b, v7.8h                   \n"
-    MEMACCESS(1)
-    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGBSHADEROW_NEON
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-#ifdef HAS_ARGBGRAYROW_NEON
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
-    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
-    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
-    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
-    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
-  );
-}
-#endif  // HAS_ARGBGRAYROW_NEON
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-//    b = (r * 35 + g * 68 + b * 17) >> 7
-//    g = (r * 45 + g * 88 + b * 22) >> 7
-//    r = (r * 50 + g * 98 + b * 24) >> 7
-
-#ifdef HAS_ARGBSEPIAROW_NEON
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v20.8b, #17                    \n"  // BB coefficient
-    "movi       v21.8b, #68                    \n"  // BG coefficient
-    "movi       v22.8b, #35                    \n"  // BR coefficient
-    "movi       v24.8b, #22                    \n"  // GB coefficient
-    "movi       v25.8b, #88                    \n"  // GG coefficient
-    "movi       v26.8b, #45                    \n"  // GR coefficient
-    "movi       v28.8b, #24                    \n"  // BB coefficient
-    "movi       v29.8b, #98                    \n"  // BG coefficient
-    "movi       v30.8b, #50                    \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
-    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
-    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
-    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
-    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
-    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
-    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
-    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
-    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
-    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_ARGBSEPIAROW_NEON
-
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
-// needs to saturate.  Consider doing a non-saturating version.
-#ifdef HAS_ARGBCOLORMATRIXROW_NEON
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
-    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
-    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
-    "uxtl       v17.8h, v17.8b                 \n"  // g
-    "uxtl       v18.8h, v18.8b                 \n"  // r
-    "uxtl       v19.8h, v19.8b                 \n"  // a
-    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
-    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
-    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
-    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
-    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
-    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
-    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
-    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
-    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
-    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
-    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
-    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
-    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
-    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
-    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
-    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
-    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ARGBCOLORMATRIXROW_NEON
-
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
-    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
-    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
-    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
-    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
-    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
-    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
-    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGBMULTIPLYROW_NEON
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-#ifdef HAS_ARGBADDROW_NEON
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"
-    "uqadd      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGBADDROW_NEON
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqsub      v0.8b, v0.8b, v4.8b            \n"
-    "uqsub      v1.8b, v1.8b, v5.8b            \n"
-    "uqsub      v2.8b, v2.8b, v6.8b            \n"
-    "uqsub      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGBSUBTRACTROW_NEON
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-#ifdef HAS_SOBELROW_NEON
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
-    "orr        v1.8b, v0.8b, v0.8b            \n"
-    "orr        v2.8b, v0.8b, v0.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
-}
-#endif  // HAS_SOBELROW_NEON
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-#ifdef HAS_SOBELTOPLANEROW_NEON
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
-    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1"
-  );
-}
-#endif  // HAS_SOBELTOPLANEROW_NEON
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-#ifdef HAS_SOBELXYROW_NEON
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
-}
-#endif  // HAS_SOBELXYROW_NEON
-
-// SobelX as a matrix is
-// -1  0  1
-// -2  0  2
-// -1  0  1
-#ifdef HAS_SOBELXROW_NEON
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%5               \n"  // top
-    MEMACCESS(0)
-    "ld1        {v1.8b}, [%0],%6               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%6               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2],%6               \n"
-    "subs       %w4, %w4, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(3)
-    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2LL),          // %5
-    "r"(6LL)           // %6
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_SOBELXROW_NEON
-
-// SobelY as a matrix is
-// -1 -2 -1
-//  0  0  0
-//  1  2  1
-#ifdef HAS_SOBELYROW_NEON
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%4               \n"  // left
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1],%4               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%4               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%5               \n"  // right
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%5               \n"
-    "subs       %w3, %w3, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1LL),          // %4
-    "r"(6LL)           // %5
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_SOBELYROW_NEON
-#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/row_win.cc b/android/src/main/libenc/jni/libyuv/jni/source/row_win.cc
deleted file mode 100755
index a8c16c3..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/row_win.cc
+++ /dev/null
@@ -1,6241 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
-    defined(_MSC_VER) && !defined(__clang__)
-#include <emmintrin.h>
-#include <tmmintrin.h>  // For _mm_maddubs_epi16
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Visual C 32/64 bit and clangcl 32 bit
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
-
-// 64 bit
-#if defined(_M_X64)
-
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422                                                             \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;
-
-// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;                                                                \
-    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
-    a_buf += 8;
-
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants)                                                 \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm2 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
-    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
-    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
-    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
-    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
-    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
-    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
-    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
-    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
-    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
-    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
-    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
-    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
-    xmm2 = _mm_packus_epi16(xmm2, xmm2);
-
-// Store 8 ARGB values.
-#define STOREARGB                                                              \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
-    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
-    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
-    dst_argb += 32;
-
-
-#if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __m128i xmm0, xmm1, xmm2, xmm4;
-  const __m128i xmm5 = _mm_set1_epi8(-1);
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-  while (width > 0) {
-    READYUV422
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    width -= 8;
-  }
-}
-#endif
-
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
-  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-  while (width > 0) {
-    READYUVA422
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    width -= 8;
-  }
-}
-#endif
-
-// 32 bit
-#else  // defined(_M_X64)
-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constants for ARGB.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-// JPeg full range.
-static const vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
-
-static const vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-static const vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
-
-static const vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-static const vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
-
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-
-// Constants for BGRA.
-static const vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
-
-static const vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
-
-static const vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
-
-// Constants for ABGR.
-static const vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
-
-static const vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
-
-static const vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
-
-// Constants for RGBA.
-static const vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
-
-static const vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
-
-static const vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
-
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
-// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static const uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-static const uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
-
-// Shuffle table for converting RGB24 to ARGB.
-static const uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
-// Shuffle table for converting RAW to RGB24.  First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24.  Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24.  Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RGB24.
-static const uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
-static const uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
-
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
-
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
-
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
-
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
-
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-
-// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_y
-    mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
-    pslld      xmm5, 24
-
-  convertloop:
-    movq       xmm0, qword ptr [eax]
-    lea        eax,  [eax + 8]
-    punpcklbw  xmm0, xmm0
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0
-    punpckhwd  xmm1, xmm1
-    por        xmm0, xmm5
-    por        xmm1, xmm5
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-    ret
-  }
-}
-
-#ifdef HAS_J400TOARGBROW_AVX2
-// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
-  __asm {
-    mov         eax, [esp + 4]        // src_y
-    mov         edx, [esp + 8]        // dst_argb
-    mov         ecx, [esp + 12]       // width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
-    vpslld      ymm5, ymm5, 24
-
-  convertloop:
-    vmovdqu     xmm0, [eax]
-    lea         eax,  [eax + 16]
-    vpermq      ymm0, ymm0, 0xd8
-    vpunpcklbw  ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8
-    vpunpckhwd  ymm1, ymm0, ymm0
-    vpunpcklwd  ymm0, ymm0, ymm0
-    vpor        ymm0, ymm0, ymm5
-    vpor        ymm1, ymm1, ymm5
-    vmovdqu     [edx], ymm0
-    vmovdqu     [edx + 32], ymm1
-    lea         edx, [edx + 64]
-    sub         ecx, 16
-    jg          convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_J400TOARGBROW_AVX2
-
-__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_rgb24
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
-    pslld     xmm5, 24
-    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
-
- convertloop:
-    movdqu    xmm0, [eax]
-    movdqu    xmm1, [eax + 16]
-    movdqu    xmm3, [eax + 32]
-    lea       eax, [eax + 48]
-    movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
-    pshufb    xmm2, xmm4
-    por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
-    pshufb    xmm0, xmm4
-    movdqu    [edx + 32], xmm2
-    por       xmm0, xmm5
-    pshufb    xmm1, xmm4
-    movdqu    [edx], xmm0
-    por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
-    pshufb    xmm3, xmm4
-    movdqu    [edx + 16], xmm1
-    por       xmm3, xmm5
-    movdqu    [edx + 48], xmm3
-    lea       edx, [edx + 64]
-    sub       ecx, 16
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
-                        int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
-    pslld     xmm5, 24
-    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
-
- convertloop:
-    movdqu    xmm0, [eax]
-    movdqu    xmm1, [eax + 16]
-    movdqu    xmm3, [eax + 32]
-    lea       eax, [eax + 48]
-    movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
-    pshufb    xmm2, xmm4
-    por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
-    pshufb    xmm0, xmm4
-    movdqu    [edx + 32], xmm2
-    por       xmm0, xmm5
-    pshufb    xmm1, xmm4
-    movdqu    [edx], xmm0
-    por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
-    pshufb    xmm3, xmm4
-    movdqu    [edx + 16], xmm1
-    por       xmm3, xmm5
-    movdqu    [edx + 48], xmm3
-    lea       edx, [edx + 64]
-    sub       ecx, 16
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_rgb24
-    mov       ecx, [esp + 12]  // width
-    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
-    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
-    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
-
- convertloop:
-    movdqu    xmm0, [eax]
-    movdqu    xmm1, [eax + 4]
-    movdqu    xmm2, [eax + 8]
-    lea       eax, [eax + 24]
-    pshufb    xmm0, xmm3
-    pshufb    xmm1, xmm4
-    pshufb    xmm2, xmm5
-    movq      qword ptr [edx], xmm0
-    movq      qword ptr [edx + 8], xmm1
-    movq      qword ptr [edx + 16], xmm2
-    lea       edx, [edx + 24]
-    sub       ecx, 8
-    jg        convertloop
-    ret
-  }
-}
-
-// pmul method to replicate bits.
-// Math to replicate bits:
-// (v << 8) | (v << 3)
-// v * 256 + v * 8
-// v * (256 + 8)
-// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-// 20 instructions.
-__declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
-  __asm {
-    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
-    movd      xmm5, eax
-    pshufd    xmm5, xmm5, 0
-    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
-    movd      xmm6, eax
-    pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
-    psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
-    psllw     xmm4, 10
-    psrlw     xmm4, 5
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
-    psllw     xmm7, 8
-
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // width
-    sub       edx, eax
-    sub       edx, eax
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
-    movdqa    xmm1, xmm0
-    movdqa    xmm2, xmm0
-    pand      xmm1, xmm3    // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
-    psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
-    pand      xmm0, xmm4    // G in middle 6 bits
-    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
-    por       xmm0, xmm7    // AG
-    movdqa    xmm2, xmm1
-    punpcklbw xmm1, xmm0
-    punpckhbw xmm2, xmm0
-    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
-    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
-    lea       eax, [eax + 16]
-    sub       ecx, 8
-    jg        convertloop
-    ret
-  }
-}
-
-#ifdef HAS_RGB565TOARGBROW_AVX2
-// pmul method to replicate bits.
-// Math to replicate bits:
-// (v << 8) | (v << 3)
-// v * 256 + v * 8
-// v * (256 + 8)
-// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked)
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
-  __asm {
-    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
-    vmovd      xmm5, eax
-    vbroadcastss ymm5, xmm5
-    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
-    vmovd      xmm6, eax
-    vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
-    vpsllw     ymm3, ymm3, 11
-    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
-    vpsllw     ymm4, ymm4, 10
-    vpsrlw     ymm4, ymm4, 5
-    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
-    vpsllw     ymm7, ymm7, 8
-
-    mov        eax, [esp + 4]   // src_rgb565
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
-    sub        edx, eax
-
- convertloop:
-    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
-    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
-    vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
-    vpor       ymm0, ymm0, ymm7    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
-    vpermq     ymm1, ymm1, 0xd8
-    vpunpckhbw ymm2, ymm1, ymm0
-    vpunpcklbw ymm1, ymm1, ymm0
-    vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
-    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
-    lea       eax, [eax + 32]
-    sub       ecx, 16
-    jg        convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_RGB565TOARGBROW_AVX2
-
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked)
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
-  __asm {
-    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
-    vmovd      xmm5, eax
-    vbroadcastss ymm5, xmm5
-    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
-    vmovd      xmm6, eax
-    vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
-    vpsllw     ymm3, ymm3, 11
-    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
-    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
-    vpsllw     ymm7, ymm7, 8
-
-    mov        eax,  [esp + 4]   // src_argb1555
-    mov        edx,  [esp + 8]   // dst_argb
-    mov        ecx,  [esp + 12]  // width
-    sub        edx,  eax
-    sub        edx,  eax
-
- convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
-    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
-    vpand      ymm1, ymm1, ymm3
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
-    vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpsraw     ymm2, ymm0, 8       // A
-    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
-    vpand      ymm2, ymm2, ymm7
-    vpor       ymm0, ymm0, ymm2    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
-    vpermq     ymm1, ymm1, 0xd8
-    vpunpckhbw ymm2, ymm1, ymm0
-    vpunpcklbw ymm1, ymm1, ymm0
-    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
-    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
-    lea       eax, [eax + 32]
-    sub       ecx, 16
-    jg        convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGB1555TOARGBROW_AVX2
-
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked)
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
-  __asm {
-    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
-    vmovd     xmm4, eax
-    vbroadcastss ymm4, xmm4
-    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
-    mov       eax,  [esp + 4]   // src_argb4444
-    mov       edx,  [esp + 8]   // dst_argb
-    mov       ecx,  [esp + 12]  // width
-    sub       edx,  eax
-    sub       edx,  eax
-
- convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
-    vpand      ymm2, ymm0, ymm5    // mask high nibbles
-    vpand      ymm0, ymm0, ymm4    // mask low nibbles
-    vpsrlw     ymm3, ymm2, 4
-    vpsllw     ymm1, ymm0, 4
-    vpor       ymm2, ymm2, ymm3
-    vpor       ymm0, ymm0, ymm1
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
-    vpermq     ymm2, ymm2, 0xd8
-    vpunpckhbw ymm1, ymm0, ymm2
-    vpunpcklbw ymm0, ymm0, ymm2
-    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
-    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
-    lea       eax, [eax + 32]
-    sub       ecx, 16
-    jg        convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGB4444TOARGBROW_AVX2
-
-// 24 instructions
-__declspec(naked)
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
-  __asm {
-    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
-    movd      xmm5, eax
-    pshufd    xmm5, xmm5, 0
-    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
-    movd      xmm6, eax
-    pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
-    psllw     xmm3, 11
-    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
-    psrlw     xmm4, 6
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
-    psllw     xmm7, 8
-
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // width
-    sub       edx, eax
-    sub       edx, eax
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
-    movdqa    xmm1, xmm0
-    movdqa    xmm2, xmm0
-    psllw     xmm1, 1       // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
-    pand      xmm1, xmm3
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
-    psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
-    movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // G in middle 5 bits
-    psraw     xmm2, 8       // A
-    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
-    pand      xmm2, xmm7
-    por       xmm0, xmm2    // AG
-    movdqa    xmm2, xmm1
-    punpcklbw xmm1, xmm0
-    punpckhbw xmm2, xmm0
-    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
-    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
-    lea       eax, [eax + 16]
-    sub       ecx, 8
-    jg        convertloop
-    ret
-  }
-}
-
-// 18 instructions.
-__declspec(naked)
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
-  __asm {
-    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
-    movd      xmm4, eax
-    pshufd    xmm4, xmm4, 0
-    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
-    pslld     xmm5, 4
-    mov       eax, [esp + 4]   // src_argb4444
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // width
-    sub       edx, eax
-    sub       edx, eax
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
-    movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // mask low nibbles
-    pand      xmm2, xmm5    // mask high nibbles
-    movdqa    xmm1, xmm0
-    movdqa    xmm3, xmm2
-    psllw     xmm1, 4
-    psrlw     xmm3, 4
-    por       xmm0, xmm1
-    por       xmm2, xmm3
-    movdqa    xmm1, xmm0
-    punpcklbw xmm0, xmm2
-    punpckhbw xmm1, xmm2
-    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
-    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
-    lea       eax, [eax + 16]
-    sub       ecx, 8
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // width
-    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
-    movdqu    xmm1, [eax + 16]
-    movdqu    xmm2, [eax + 32]
-    movdqu    xmm3, [eax + 48]
-    lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
-    pshufb    xmm1, xmm6
-    pshufb    xmm2, xmm6
-    pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
-    movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
-    lea       edx, [edx + 48]
-    sub       ecx, 16
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // width
-    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
-    movdqu    xmm1, [eax + 16]
-    movdqu    xmm2, [eax + 32]
-    movdqu    xmm3, [eax + 48]
-    lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
-    pshufb    xmm1, xmm6
-    pshufb    xmm2, xmm6
-    pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
-    movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
-    lea       edx, [edx + 48]
-    sub       ecx, 16
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
-    psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
-    psrld     xmm4, 26
-    pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
-    pslld     xmm5, 11
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
-    packssdw  xmm0, xmm0
-    lea       eax, [eax + 16]
-    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
-    lea       edx, [edx + 8]
-    sub       ecx, 4
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  __asm {
-
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    movd      xmm6, [esp + 12] // dither4
-    mov       ecx, [esp + 16]  // width
-    punpcklbw xmm6, xmm6       // make dither 16 bytes
-    movdqa    xmm7, xmm6
-    punpcklwd xmm6, xmm6
-    punpckhwd xmm7, xmm7
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
-    psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
-    psrld     xmm4, 26
-    pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
-    pslld     xmm5, 11
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    paddusb   xmm0, xmm6    // add dither
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
-    packssdw  xmm0, xmm0
-    lea       eax, [eax + 16]
-    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
-    lea       edx, [edx + 8]
-    sub       ecx, 4
-    jg        convertloop
-    ret
-  }
-}
-
-#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked)
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    vbroadcastss xmm6, [esp + 12]  // dither4
-    mov        ecx, [esp + 16]     // width
-    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
-    vpermq     ymm6, ymm6, 0xd8
-    vpunpcklwd ymm6, ymm6, ymm6
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
-    vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
-    vpsrld     ymm4, ymm4, 26
-    vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
-
- convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpaddusb   ymm0, ymm0, ymm6    // add dither
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
-    vpackusdw  ymm0, ymm0, ymm0
-    vpermq     ymm0, ymm0, 0xd8
-    lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
-    lea        edx, [edx + 16]
-    sub        ecx, 8
-    jg         convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
-
-// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
-    psrld     xmm4, 27
-    movdqa    xmm5, xmm4       // generate mask 0x000003e0
-    pslld     xmm5, 5
-    movdqa    xmm6, xmm4       // generate mask 0x00007c00
-    pslld     xmm6, 10
-    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
-    pslld     xmm7, 15
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    movdqa    xmm3, xmm0    // R
-    psrad     xmm0, 16      // A
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 6       // G
-    psrld     xmm3, 9       // R
-    pand      xmm0, xmm7    // A
-    pand      xmm1, xmm4    // B
-    pand      xmm2, xmm5    // G
-    pand      xmm3, xmm6    // R
-    por       xmm0, xmm1    // BA
-    por       xmm2, xmm3    // GR
-    por       xmm0, xmm2    // BGRA
-    packssdw  xmm0, xmm0
-    lea       eax, [eax + 16]
-    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
-    lea       edx, [edx + 8]
-    sub       ecx, 4
-    jg        convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
-    psllw     xmm4, 12
-    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
-    psrlw     xmm3, 8
-
- convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0
-    pand      xmm0, xmm3    // low nibble
-    pand      xmm1, xmm4    // high nibble
-    psrld     xmm0, 4
-    psrld     xmm1, 8
-    por       xmm0, xmm1
-    packuswb  xmm0, xmm0
-    lea       eax, [eax + 16]
-    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
-    lea       edx, [edx + 8]
-    sub       ecx, 4
-    jg        convertloop
-    ret
-  }
-}
-
-#ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
-    vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
-    vpsrld     ymm4, ymm4, 26
-    vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
-
- convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
-    vpackusdw  ymm0, ymm0, ymm0
-    vpermq     ymm0, ymm0, 0xd8
-    lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
-    lea        edx, [edx + 16]
-    sub        ecx, 8
-    jg         convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBTORGB565ROW_AVX2
-
-#ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
-    vpcmpeqb   ymm4, ymm4, ymm4
-    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
-    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
-    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
-    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
-    vpslld     ymm7, ymm7, 15
-
- convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm3, ymm0, 9       // R
-    vpsrld     ymm2, ymm0, 6       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrad     ymm0, ymm0, 16      // A
-    vpand      ymm3, ymm3, ymm6    // R
-    vpand      ymm2, ymm2, ymm5    // G
-    vpand      ymm1, ymm1, ymm4    // B
-    vpand      ymm0, ymm0, ymm7    // A
-    vpor       ymm0, ymm0, ymm1    // BA
-    vpor       ymm2, ymm2, ymm3    // GR
-    vpor       ymm0, ymm0, ymm2    // BGRA
-    vpackssdw  ymm0, ymm0, ymm0
-    vpermq     ymm0, ymm0, 0xd8
-    lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
-    lea        edx, [edx + 16]
-    sub        ecx, 8
-    jg         convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBTOARGB1555ROW_AVX2
-
-#ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_rgb
-    mov        ecx, [esp + 12]  // width
-    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
-    vpsllw     ymm4, ymm4, 12
-    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
-
- convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpand      ymm1, ymm0, ymm4    // high nibble
-    vpand      ymm0, ymm0, ymm3    // low nibble
-    vpsrld     ymm1, ymm1, 8
-    vpsrld     ymm0, ymm0, 4
-    vpor       ymm0, ymm0, ymm1
-    vpackuswb  ymm0, ymm0, ymm0
-    vpermq     ymm0, ymm0, 0xd8
-    lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
-    lea        edx, [edx + 16]
-    sub        ecx, 8
-    jg         convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBTOARGB4444ROW_AVX2
-
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, xmmword ptr kARGBToY
-    movdqa     xmm5, xmmword ptr kAddY16
-
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, xmmword ptr kARGBToYJ
-    movdqa     xmm5, xmmword ptr kAddYJ64
-
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    paddw      xmm0, xmm5  // Add .5 for rounding.
-    paddw      xmm2, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
-
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    vbroadcastf128 ymm4, xmmword ptr kARGBToY
-    vbroadcastf128 ymm5, xmmword ptr kAddY16
-    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
-
- convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    vmovdqu    ymm2, [eax + 64]
-    vmovdqu    ymm3, [eax + 96]
-    vpmaddubsw ymm0, ymm0, ymm4
-    vpmaddubsw ymm1, ymm1, ymm4
-    vpmaddubsw ymm2, ymm2, ymm4
-    vpmaddubsw ymm3, ymm3, ymm4
-    lea        eax, [eax + 128]
-    vphaddw    ymm0, ymm0, ymm1  // mutates.
-    vphaddw    ymm2, ymm2, ymm3
-    vpsrlw     ymm0, ymm0, 7
-    vpsrlw     ymm2, ymm2, 7
-    vpackuswb  ymm0, ymm0, ymm2  // mutates.
-    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
-    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
-    vmovdqu    [edx], ymm0
-    lea        edx, [edx + 32]
-    sub        ecx, 32
-    jg         convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  //  HAS_ARGBTOYROW_AVX2
-
-#ifdef HAS_ARGBTOYJROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
-    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
-    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
-
- convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    vmovdqu    ymm2, [eax + 64]
-    vmovdqu    ymm3, [eax + 96]
-    vpmaddubsw ymm0, ymm0, ymm4
-    vpmaddubsw ymm1, ymm1, ymm4
-    vpmaddubsw ymm2, ymm2, ymm4
-    vpmaddubsw ymm3, ymm3, ymm4
-    lea        eax, [eax + 128]
-    vphaddw    ymm0, ymm0, ymm1  // mutates.
-    vphaddw    ymm2, ymm2, ymm3
-    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
-    vpaddw     ymm2, ymm2, ymm5
-    vpsrlw     ymm0, ymm0, 7
-    vpsrlw     ymm2, ymm2, 7
-    vpackuswb  ymm0, ymm0, ymm2  // mutates.
-    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
-    vmovdqu    [edx], ymm0
-    lea        edx, [edx + 32]
-    sub        ecx, 32
-    jg         convertloop
-
-    vzeroupper
-    ret
-  }
-}
-#endif  //  HAS_ARGBTOYJROW_AVX2
-
-__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, xmmword ptr kBGRAToY
-    movdqa     xmm5, xmmword ptr kAddY16
-
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, xmmword ptr kABGRToY
-    movdqa     xmm5, xmmword ptr kAddY16
-
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, xmmword ptr kRGBAToY
-    movdqa     xmm5, xmmword ptr kAddY16
-
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    movdqa     xmm5, xmmword ptr kAddUV128
-    movdqa     xmm6, xmmword ptr kARGBToV
-    movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    movdqa     xmm5, xmmword ptr kAddUVJ128
-    movdqa     xmm6, xmmword ptr kARGBToVJ
-    movdqa     xmm7, xmmword ptr kARGBToUJ
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
-    paddw      xmm1, xmm5
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked)
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    vbroadcastf128 ymm5, xmmword ptr kAddUV128
-    vbroadcastf128 ymm6, xmmword ptr kARGBToV
-    vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    vmovdqu    ymm2, [eax + 64]
-    vmovdqu    ymm3, [eax + 96]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
-    vpavgb     ymm2, ymm2, [eax + esi + 64]
-    vpavgb     ymm3, ymm3, [eax + esi + 96]
-    lea        eax,  [eax + 128]
-    vshufps    ymm4, ymm0, ymm1, 0x88
-    vshufps    ymm0, ymm0, ymm1, 0xdd
-    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
-    vshufps    ymm4, ymm2, ymm3, 0x88
-    vshufps    ymm2, ymm2, ymm3, 0xdd
-    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
-    vpmaddubsw ymm1, ymm0, ymm7  // U
-    vpmaddubsw ymm3, ymm2, ymm7
-    vpmaddubsw ymm0, ymm0, ymm6  // V
-    vpmaddubsw ymm2, ymm2, ymm6
-    vphaddw    ymm1, ymm1, ymm3  // mutates
-    vphaddw    ymm0, ymm0, ymm2
-    vpsraw     ymm1, ymm1, 8
-    vpsraw     ymm0, ymm0, 8
-    vpacksswb  ymm0, ymm1, ymm0  // mutates
-    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
-    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
-    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
-
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
-    lea        edx, [edx + 16]
-    sub        ecx, 32
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBTOUVROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked)
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    vbroadcastf128 ymm5, xmmword ptr kAddUV128
-    vbroadcastf128 ymm6, xmmword ptr kARGBToV
-    vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    vmovdqu    ymm2, [eax + 64]
-    vmovdqu    ymm3, [eax + 96]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
-    vpavgb     ymm2, ymm2, [eax + esi + 64]
-    vpavgb     ymm3, ymm3, [eax + esi + 96]
-    lea        eax,  [eax + 128]
-    vshufps    ymm4, ymm0, ymm1, 0x88
-    vshufps    ymm0, ymm0, ymm1, 0xdd
-    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
-    vshufps    ymm4, ymm2, ymm3, 0x88
-    vshufps    ymm2, ymm2, ymm3, 0xdd
-    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
-    vpmaddubsw ymm1, ymm0, ymm7  // U
-    vpmaddubsw ymm3, ymm2, ymm7
-    vpmaddubsw ymm0, ymm0, ymm6  // V
-    vpmaddubsw ymm2, ymm2, ymm6
-    vphaddw    ymm1, ymm1, ymm3  // mutates
-    vphaddw    ymm0, ymm0, ymm2
-    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
-    vpaddw     ymm0, ymm0, ymm5
-    vpsraw     ymm1, ymm1, 8
-    vpsraw     ymm0, ymm0, 8
-    vpacksswb  ymm0, ymm1, ymm0  // mutates
-    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
-    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
-
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
-    lea        edx, [edx + 16]
-    sub        ecx, 32
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBTOUVJROW_AVX2
-
-__declspec(naked)
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
-    mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // width
-    movdqa     xmm5, xmmword ptr kAddUV128
-    movdqa     xmm6, xmmword ptr kARGBToV
-    movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* convert to U and V */
-    movdqu     xmm0, [eax]          // U
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm7
-    pmaddubsw  xmm1, xmm7
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm3, xmm7
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psraw      xmm0, 8
-    psraw      xmm2, 8
-    packsswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    movdqu     [edx], xmm0
-
-    movdqu     xmm0, [eax]          // V
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm6
-    pmaddubsw  xmm1, xmm6
-    pmaddubsw  xmm2, xmm6
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psraw      xmm0, 8
-    psraw      xmm2, 8
-    packsswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    lea        eax,  [eax + 64]
-    movdqu     [edx + edi], xmm0
-    lea        edx,  [edx + 16]
-    sub        ecx,  16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked)
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    movdqa     xmm5, xmmword ptr kAddUV128
-    movdqa     xmm6, xmmword ptr kBGRAToV
-    movdqa     xmm7, xmmword ptr kBGRAToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    movdqa     xmm5, xmmword ptr kAddUV128
-    movdqa     xmm6, xmmword ptr kABGRToV
-    movdqa     xmm7, xmmword ptr kABGRToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // width
-    movdqa     xmm5, xmmword ptr kAddUV128
-    movdqa     xmm6, xmmword ptr kRGBAToV
-    movdqa     xmm7, xmmword ptr kRGBAToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBTOYROW_SSSE3
-
-// Read 16 UV from 444
-#define READYUV444_AVX2 __asm {                                                \
-    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
-    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
-    __asm lea        esi,  [esi + 16]                                          \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 __asm {                                                \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2 __asm {                                               \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
-    __asm vpermq     ymm5, ymm5, 0xd8                                          \
-    __asm lea        ebp, [ebp + 16]                                           \
-  }
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 __asm {                                                \
-    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
-    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
-    __asm lea        esi,  [esi + 16]                                          \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
-    __asm lea        esi,  [esi + 16]                                          \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
-    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
-    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
-    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
-    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
-
-// Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
-    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
-    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
-    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
-    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
-    __asm vpsubw     ymm2, ymm3, ymm2                                          \
-    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
-    __asm vpsubw     ymm1, ymm3, ymm1                                          \
-    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
-    __asm vpsubw     ymm0, ymm3, ymm0                                          \
-    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
-    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
-    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
-    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
-    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
-    __asm vpsraw     ymm0, ymm0, 6                                             \
-    __asm vpsraw     ymm1, ymm1, 6                                             \
-    __asm vpsraw     ymm2, ymm2, 6                                             \
-    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
-    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
-    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
-  }
-
-// Store 16 ARGB values.
-#define STOREARGB_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
-    __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
-    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
-    __asm vmovdqu    0[edx], ymm1                                              \
-    __asm vmovdqu    32[edx], ymm0                                             \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
-
-// Store 16 RGBA values.
-#define STORERGBA_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
-    __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
-    __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
-    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
-    __asm vmovdqu    [edx], ymm0                                               \
-    __asm vmovdqu    [edx + 32], ymm1                                          \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
-
-#ifdef HAS_I422TOARGBROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // argb
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV422_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I422TOARGBROW_AVX2
-
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
-    mov        edi, [esp + 16 + 12]  // V
-    mov        ebp, [esp + 16 + 16]  // A
-    mov        edx, [esp + 16 + 20]  // argb
-    mov        ebx, [esp + 16 + 24]  // yuvconstants
-    mov        ecx, [esp + 16 + 28]  // width
-    sub        edi, esi
-
- convertloop:
-    READYUVA422_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I422ALPHATOARGBROW_AVX2
-
-#ifdef HAS_I444TOARGBROW_AVX2
-// 16 pixels
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I444ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // argb
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
- convertloop:
-    READYUV444_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I444TOARGBROW_AVX2
-
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I411ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV411_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
-#ifdef HAS_NV12TOARGBROW_AVX2
-// 16 pixels.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV12ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
-    mov        edx, [esp + 8 + 12]  // argb
-    mov        ebx, [esp + 8 + 16]  // yuvconstants
-    mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READNV12_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_NV12TOARGBROW_AVX2
-
-#ifdef HAS_NV21TOARGBROW_AVX2
-// 16 pixels.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV21ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* vu_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
-    mov        edx, [esp + 8 + 12]  // argb
-    mov        ebx, [esp + 8 + 16]  // yuvconstants
-    mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READNV21_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_NV21TOARGBROW_AVX2
-
-#ifdef HAS_YUY2TOARGBROW_AVX2
-// 16 pixels.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
-    mov        ebx, [esp + 4 + 12]  // yuvconstants
-    mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUY2_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_YUY2TOARGBROW_AVX2
-
-#ifdef HAS_UYVYTOARGBROW_AVX2
-// 16 pixels.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
-    mov        ebx, [esp + 4 + 12]  // yuvconstants
-    mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READUYVY_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_UYVYTOARGBROW_AVX2
-
-#ifdef HAS_I422TORGBAROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV422_AVX2
-    YUVTORGB_AVX2(ebx)
-    STORERGBA_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I422TORGBAROW_AVX2
-
-#if defined(HAS_I422TOARGBROW_SSSE3)
-// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
-// Allows a conversion with half size scaling.
-
-// Read 8 UV from 444.
-#define READYUV444 __asm {                                                     \
-    __asm movq       xmm0, qword ptr [esi] /* U */                             \
-    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
-
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
-
-// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422 __asm {                                                    \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
-    __asm lea        ebp, [ebp + 8]                                            \
-  }
-
-// Read 2 UV from 411, upsample to 8 UV.
-// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_EBX __asm {                                                 \
-    __asm movzx      ebx, word ptr [esi]        /* U */                        \
-    __asm movd       xmm0, ebx                                                 \
-    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
-    __asm movd       xmm1, ebx                                                 \
-    __asm lea        esi,  [esi + 2]                                           \
-    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
-    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
-    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
-
-// Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm {                                                       \
-    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
-
-// Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 __asm {                                                       \
-    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
-    __asm lea        esi,  [esi + 8]                                           \
-    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
-
-// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
-    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
-    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
-    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
-    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) __asm {                                         \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm movdqa     xmm2, xmm0                                                \
-    __asm movdqa     xmm3, xmm0                                                \
-    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
-    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
-    __asm psubw      xmm0, xmm1                                                \
-    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
-    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
-    __asm psubw      xmm1, xmm2                                                \
-    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
-    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
-    __asm psubw      xmm2, xmm3                                                \
-    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
-    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
-    __asm psraw      xmm0, 6                                                   \
-    __asm psraw      xmm1, 6                                                   \
-    __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
-  }
-
-// Store 8 ARGB values.
-#define STOREARGB __asm {                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
-    __asm movdqu     0[edx], xmm0                                              \
-    __asm movdqu     16[edx], xmm1                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
-
-// Store 8 BGRA values.
-#define STOREBGRA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
-    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
-    __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
-    __asm movdqu     0[edx], xmm5                                              \
-    __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
-
-// Store 8 RGBA values.
-#define STORERGBA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
-    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
-    __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
-    __asm movdqu     0[edx], xmm5                                              \
-    __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
-
-// Store 8 RGB24 values.
-#define STORERGB24 __asm {                                                     \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB24 */                                                        \
-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
-    __asm lea        edx,  [edx + 24]                                          \
-  }
-
-// Store 8 RGB565 values (16 bytes) at [edx] and advance edx by 16.
-#define STORERGB565 __asm {                                                    \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB565 (field masks are taken from xmm5, xmm6 and xmm7) */      \
-    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
-    __asm movdqa     xmm2, xmm0    /* G */                                     \
-    __asm pslld      xmm0, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm0, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm0, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm0, xmm3    /* BGR */                                   \
-    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
-    __asm movdqa     xmm2, xmm1    /* G */                                     \
-    __asm pslld      xmm1, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm1, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm1, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm1, xmm3    /* BGR */                                   \
-    __asm packssdw   xmm0, xmm1    /* 8 dwords -> 8 words */                   \
-    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
-    __asm lea        edx, [edx + 16]  /* Advance dst by 16 bytes */            \
-  }
-
-// Processes 8 pixels per loop iteration.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // y_buf (12 = 3 saved registers)
-    mov        esi, [esp + 12 + 8]   // u_buf
-    mov        edi, [esp + 12 + 12]  // v_buf
-    mov        edx, [esp + 12 + 16]  // dst_argb
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi              // edi = v_buf - u_buf
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV444
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8                // width -= 8
-    jg         convertloop           // loop while width > 0
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked)
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* dst_rgb24,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // y_buf (12 = 3 saved registers)
-    mov        esi, [esp + 12 + 8]   // u_buf
-    mov        edi, [esp + 12 + 12]  // v_buf
-    mov        edx, [esp + 12 + 16]  // dst_rgb24
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi              // edi = v_buf - u_buf
-    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0  // masks for STORERGB24
-    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
-
- convertloop:
-    READYUV422
-    YUVTORGB(ebx)
-    STORERGB24
-
-    sub        ecx, 8                // width -= 8
-    jg         convertloop           // loop while width > 0
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb565_buf,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // y_buf (12 = 3 saved registers)
-    mov        esi, [esp + 12 + 8]   // u_buf
-    mov        edi, [esp + 12 + 12]  // v_buf
-    mov        edx, [esp + 12 + 16]  // rgb565_buf
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi              // edi = v_buf - u_buf
-    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f (blue field)
-    psrld      xmm5, 27
-    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0 (green field)
-    psrld      xmm6, 26
-    pslld      xmm6, 5
-    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800 (red field and above)
-    pslld      xmm7, 11
-
- convertloop:
-    READYUV422
-    YUVTORGB(ebx)
-    STORERGB565
-
-    sub        ecx, 8           // width -= 8
-    jg         convertloop      // loop while width > 0
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // y_buf (12 = 3 saved registers)
-    mov        esi, [esp + 12 + 8]   // u_buf
-    mov        edi, [esp + 12 + 12]  // v_buf
-    mov        edx, [esp + 12 + 16]  // dst_argb
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi             // edi = v_buf - u_buf
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV422
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8               // width -= 8
-    jg         convertloop          // loop while width > 0
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // y_buf (16 = 4 saved registers)
-    mov        esi, [esp + 16 + 8]   // u_buf
-    mov        edi, [esp + 16 + 12]  // v_buf
-    mov        ebp, [esp + 16 + 16]  // a_buf
-    mov        edx, [esp + 16 + 20]  // dst_argb
-    mov        ebx, [esp + 16 + 24]  // yuvconstants
-    mov        ecx, [esp + 16 + 28]  // width
-    sub        edi, esi              // edi = v_buf - u_buf
-
- convertloop:
-    READYUVA422
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8                // width -= 8
-    jg         convertloop           // loop while width > 0
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked)
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx                   // saved; presumably scratch for READYUV411_EBX
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // y_buf (16 = 4 saved registers)
-    mov        esi, [esp + 16 + 8]   // u_buf
-    mov        edi, [esp + 16 + 12]  // v_buf
-    mov        edx, [esp + 16 + 16]  // dst_argb
-    mov        ebp, [esp + 16 + 20]  // yuvconstants (held in ebp, not ebx)
-    mov        ecx, [esp + 16 + 24]  // width
-    sub        edi, esi              // edi = v_buf - u_buf
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV411_EBX
-    YUVTORGB(ebp)
-    STOREARGB
-
-    sub        ecx, 8                // width -= 8
-    jg         convertloop           // loop while width > 0
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       ebx
-    mov        eax, [esp + 8 + 4]   // y_buf (8 = 2 saved registers)
-    mov        esi, [esp + 8 + 8]   // uv_buf
-    mov        edx, [esp + 8 + 12]  // dst_argb
-    mov        ebx, [esp + 8 + 16]  // yuvconstants
-    mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-
- convertloop:
-    READNV12
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8               // width -= 8
-    jg         convertloop          // loop while width > 0
-
-    pop        ebx
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* vu_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       ebx
-    mov        eax, [esp + 8 + 4]   // y_buf (8 = 2 saved registers)
-    mov        esi, [esp + 8 + 8]   // vu_buf
-    mov        edx, [esp + 8 + 12]  // dst_argb
-    mov        ebx, [esp + 8 + 16]  // yuvconstants
-    mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-
- convertloop:
-    READNV21
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8               // width -= 8
-    jg         convertloop          // loop while width > 0
-
-    pop        ebx
-    pop        esi
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       ebx
-    mov        eax, [esp + 4 + 4]   // src_yuy2 (4 = 1 saved register)
-    mov        edx, [esp + 4 + 8]   // dst_argb
-    mov        ebx, [esp + 4 + 12]  // yuvconstants
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-
- convertloop:
-    READYUY2
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8               // width -= 8
-    jg         convertloop          // loop while width > 0
-
-    pop        ebx
-    ret
-  }
-}
-
-// Processes 8 pixels per loop iteration.
-// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       ebx
-    mov        eax, [esp + 4 + 4]   // src_uyvy (4 = 1 saved register)
-    mov        edx, [esp + 4 + 8]   // dst_argb
-    mov        ebx, [esp + 4 + 12]  // yuvconstants
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-
- convertloop:
-    READUYVY
-    YUVTORGB(ebx)
-    STOREARGB
-
-    sub        ecx, 8               // width -= 8
-    jg         convertloop          // loop while width > 0
-
-    pop        ebx
-    ret
-  }
-}
-
-__declspec(naked)
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_rgba,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // y_buf (12 = 3 saved registers)
-    mov        esi, [esp + 12 + 8]   // u_buf
-    mov        edi, [esp + 12 + 12]  // v_buf
-    mov        edx, [esp + 12 + 16]  // dst_rgba
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi              // edi = v_buf - u_buf
-
- convertloop:
-    READYUV422
-    YUVTORGB(ebx)
-    STORERGBA
-
-    sub        ecx, 8                // width -= 8
-    jg         convertloop           // loop while width > 0
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_I422TOARGBROW_SSSE3
-
-#ifdef HAS_I400TOARGBROW_SSE2
-// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes) per loop iteration.
-__declspec(naked)
-void I400ToARGBRow_SSE2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
-  __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
-    movd       xmm2, eax
-    pshufd     xmm2, xmm2,0         // broadcast the scale to all lanes
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
-    movd       xmm3, eax
-    pshufd     xmm3, xmm3, 0        // broadcast the bias to all lanes
-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
-    pslld      xmm4, 24
-
-    mov        eax, [esp + 4]       // y_buf
-    mov        edx, [esp + 8]       // rgb_buf
-    mov        ecx, [esp + 12]      // width
-
- convertloop:
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, qword ptr [eax]  // read 8 Y bytes
-    lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm0           // Y.Y
-    pmulhuw    xmm0, xmm2
-    psubusw    xmm0, xmm3
-    psrlw      xmm0, 6
-    packuswb   xmm0, xmm0           // G
-
-    // Step 2: Weave into ARGB
-    punpcklbw  xmm0, xmm0           // GG
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
-    por        xmm0, xmm4           // set alpha to 0xff
-    por        xmm1, xmm4
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8               // width -= 8
-    jg         convertloop          // loop while width > 0
-    ret
-  }
-}
-#endif  // HAS_I400TOARGBROW_SSE2
-
-#ifdef HAS_I400TOARGBROW_AVX2
-// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes) per loop iteration.
-// note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked)
-void I400ToARGBRow_AVX2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
-  __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
-    vmovd      xmm2, eax
-    vbroadcastss ymm2, xmm2
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
-    vmovd      xmm3, eax
-    vbroadcastss ymm3, xmm3
-    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
-    vpslld     ymm4, ymm4, 24
-
-    mov        eax, [esp + 4]       // y_buf
-    mov        edx, [esp + 8]       // rgb_buf
-    mov        ecx, [esp + 12]      // width
-
- convertloop:
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
-    vmovdqu    xmm0, [eax]                // read 16 Y bytes
-    lea        eax, [eax + 16]
-    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
-    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
-    vpmulhuw   ymm0, ymm0, ymm2
-    vpsubusw   ymm0, ymm0, ymm3
-    vpsrlw     ymm0, ymm0, 6
-    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
-
-    // TODO(fbarchard): Weave alpha with unpack.
-    // Step 2: Weave into ARGB
-    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
-    vpermq     ymm1, ymm1, 0xd8
-    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
-    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
-    vpor       ymm0, ymm0, ymm4           // set alpha to 0xff
-    vpor       ymm1, ymm1, ymm4
-    vmovdqu    [edx], ymm0
-    vmovdqu    [edx + 32], ymm1
-    lea        edx,  [edx + 64]
-    sub        ecx, 16                    // width -= 16
-    jg         convertloop                // loop while width > 0
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I400TOARGBROW_AVX2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the order of the 16 bytes in a register.
-static const uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-// TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked)
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
-    mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, xmmword ptr kShuffleMirror
-
- convertloop:
-    movdqu    xmm0, [eax - 16 + ecx]  // last 16 source bytes not yet mirrored
-    pshufb    xmm0, xmm5              // reverse their byte order
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
-    sub       ecx, 16                 // width -= 16
-    jg        convertloop             // loop while width > 0
-    ret
-  }
-}
-#endif  // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_AVX2
-__declspec(naked)
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
-    mov       ecx, [esp + 12]  // width
-    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror  // same mask in both lanes
-
- convertloop:
-    vmovdqu   ymm0, [eax - 32 + ecx]  // last 32 source bytes not yet mirrored
-    vpshufb   ymm0, ymm0, ymm5        // reverse bytes within each 128-bit lane
-    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
-    vmovdqu   [edx], ymm0
-    lea       edx, [edx + 32]
-    sub       ecx, 32                 // width -= 32
-    jg        convertloop             // loop while width > 0
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_MIRRORROW_AVX2
-
-#ifdef HAS_MIRRORUVROW_SSSE3
-// Shuffle table: even bytes reversed into the low 8, odd bytes reversed into the high 8.
-static const uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-
-__declspec(naked)
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
-  __asm {
-    push      edi
-    mov       eax, [esp + 4 + 4]   // src (4 = 1 saved register)
-    mov       edx, [esp + 4 + 8]   // dst_u
-    mov       edi, [esp + 4 + 12]  // dst_v
-    mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
-    lea       eax, [eax + ecx * 2 - 16]  // last 16 bytes of the 2 * width byte row
-    sub       edi, edx                   // edi = dst_v - dst_u
-
- convertloop:
-    movdqu    xmm0, [eax]
-    lea       eax, [eax - 16]            // walk the source backwards
-    pshufb    xmm0, xmm1
-    movlpd    qword ptr [edx], xmm0        // low 8 bytes (even source bytes) -> dst_u
-    movhpd    qword ptr [edx + edi], xmm0  // high 8 bytes (odd source bytes) -> dst_v
-    lea       edx, [edx + 8]
-    sub       ecx, 8                     // width -= 8
-    jg        convertloop                // loop while width > 0
-
-    pop       edi
-    ret
-  }
-}
-#endif  // HAS_MIRRORUVROW_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked)
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
-    mov       ecx, [esp + 12]  // width
-    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
-
- convertloop:
-    movdqu    xmm0, [eax]
-    lea       eax, [eax - 16]            // walk the source backwards
-    pshufd    xmm0, xmm0, 0x1b           // reverse the order of the 4 dwords
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
-    sub       ecx, 4                     // width -= 4
-    jg        convertloop                // loop while width > 0
-    ret
-  }
-}
-#endif  // HAS_ARGBMIRRORROW_SSE2
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-// Permute table for reversing the order of 8 dwords (index operand of vpermd).
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-__declspec(naked)
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
-    mov       ecx, [esp + 12]  // width
-    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
-
- convertloop:
-    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
-    vmovdqu   [edx], ymm0
-    lea       edx, [edx + 32]
-    sub       ecx, 8                            // width -= 8
-    jg        convertloop                       // loop while width > 0
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBMIRRORROW_AVX2
-
-#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv (4 = 1 saved register)
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx              // edi = dst_v - dst_u
-
-  convertloop:
-    movdqu     xmm0, [eax]           // read 32 source bytes
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0
-    movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
-    psrlw      xmm3, 8
-    packuswb   xmm2, xmm3
-    movdqu     [edx], xmm0           // even bytes -> dst_u
-    movdqu     [edx + edi], xmm2     // odd bytes -> dst_v
-    lea        edx, [edx + 16]
-    sub        ecx, 16               // width -= 16
-    jg         convertloop           // loop while width > 0
-
-    pop        edi
-    ret
-  }
-}
-
-#endif  // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv (4 = 1 saved register)
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
-    vpsrlw     ymm5, ymm5, 8
-    sub        edi, edx              // edi = dst_v - dst_u
-
-  convertloop:
-    vmovdqu    ymm0, [eax]           // read 64 source bytes
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax,  [eax + 64]
-    vpsrlw     ymm2, ymm0, 8      // odd bytes
-    vpsrlw     ymm3, ymm1, 8
-    vpand      ymm0, ymm0, ymm5   // even bytes
-    vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1
-    vpackuswb  ymm2, ymm2, ymm3
-    vpermq     ymm0, ymm0, 0xd8   // reorder the qwords after the pack
-    vpermq     ymm2, ymm2, 0xd8
-    vmovdqu    [edx], ymm0           // even bytes -> dst_u
-    vmovdqu    [edx + edi], ymm2     // odd bytes -> dst_v
-    lea        edx, [edx + 32]
-    sub        ecx, 32               // width -= 32
-    jg         convertloop           // loop while width > 0
-
-    pop        edi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_SPLITUVROW_AVX2
-
-#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked)
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_u (4 = 1 saved register)
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
-    sub        edx, eax              // edx = src_v - src_u
-
-  convertloop:
-    movdqu     xmm0, [eax]      // read 16 U's
-    movdqu     xmm1, [eax + edx]  // and 16 V's
-    lea        eax,  [eax + 16]
-    movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
-    movdqu     [edi], xmm0
-    movdqu     [edi + 16], xmm2
-    lea        edi, [edi + 32]
-    sub        ecx, 16          // width -= 16
-    jg         convertloop      // loop while width > 0
-
-    pop        edi
-    ret
-  }
-}
-#endif  //  HAS_MERGEUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked)
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_u (4 = 1 saved register)
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
-    sub        edx, eax              // edx = src_v - src_u
-
-  convertloop:
-    vmovdqu    ymm0, [eax]           // read 32 U's
-    vmovdqu    ymm1, [eax + edx]     // and 32 V's
-    lea        eax,  [eax + 32]
-    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
-    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
-    vextractf128 [edi], ymm2, 0       // bytes 0..15
-    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
-    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
-    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
-    lea        edi, [edi + 64]
-    sub        ecx, 32               // width -= 32
-    jg         convertloop           // loop while width > 0
-
-    pop        edi
-    vzeroupper
-    ret
-  }
-}
-#endif  //  HAS_MERGEUVROW_AVX2
-
-#ifdef HAS_COPYROW_SSE2
-// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    test       eax, 15          // src not 16-byte aligned -> unaligned loop
-    jne        convertloopu
-    test       edx, 15          // dst not 16-byte aligned -> unaligned loop
-    jne        convertloopu
-
-  convertloopa:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 32          // count -= 32
-    jg         convertloopa     // loop while count > 0
-    ret
-
-  convertloopu:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 32          // count -= 32
-    jg         convertloopu     // loop while count > 0
-    ret
-  }
-}
-#endif  // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_AVX
-// CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
-__declspec(naked)
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax, [eax + 64]
-    vmovdqu    [edx], ymm0
-    vmovdqu    [edx + 32], ymm1
-    lea        edx, [edx + 64]
-    sub        ecx, 64          // count -= 64
-    jg         convertloop      // loop while count > 0
-
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_COPYROW_AVX
-
-// Copies 'count' bytes with rep movsb. Multiple of 1.
-__declspec(naked)
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
-  __asm {
-    mov        eax, esi         // save esi in eax
-    mov        edx, edi         // save edi in edx
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    rep movsb
-    mov        edi, edx         // restore edi
-    mov        esi, eax         // restore esi
-    ret
-  }
-}
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// Copies the alpha byte of each src pixel into dst; width in pixels.
-__declspec(naked)
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
-    pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
-    psrld      xmm1, 8
-
-  convertloop:
-    movdqu     xmm2, [eax]      // 8 src pixels
-    movdqu     xmm3, [eax + 16]
-    lea        eax, [eax + 32]
-    movdqu     xmm4, [edx]      // 8 dst pixels
-    movdqu     xmm5, [edx + 16]
-    pand       xmm2, xmm0       // keep src alpha
-    pand       xmm3, xmm0
-    pand       xmm4, xmm1       // keep dst low 3 bytes
-    pand       xmm5, xmm1
-    por        xmm2, xmm4
-    por        xmm3, xmm5
-    movdqu     [edx], xmm2
-    movdqu     [edx + 16], xmm3
-    lea        edx, [edx + 32]
-    sub        ecx, 8           // width -= 8
-    jg         convertloop      // loop while width > 0
-
-    ret
-  }
-}
-#endif  // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// Copies the alpha byte of each src pixel into dst; width in pixels.
-__declspec(naked)
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // width
-    vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
-
-  convertloop:
-    vmovdqu    ymm1, [eax]      // 16 src pixels
-    vmovdqu    ymm2, [eax + 32]
-    lea        eax, [eax + 64]
-    vpblendvb  ymm1, ymm1, [edx], ymm0       // dst low 3 bytes, src alpha
-    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
-    vmovdqu    [edx], ymm1
-    vmovdqu    [edx + 32], ymm2
-    lea        edx, [edx + 64]
-    sub        ecx, 16          // width -= 16
-    jg         convertloop      // loop while width > 0
-
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// Writes each src byte into the alpha byte of the matching dst pixel; width in pixels.
-__declspec(naked)
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
-    pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
-    psrld      xmm1, 8
-
-  convertloop:
-    movq       xmm2, qword ptr [eax]  // 8 Y's
-    lea        eax, [eax + 8]
-    punpcklbw  xmm2, xmm2       // each Y duplicated into a word
-    punpckhwd  xmm3, xmm2       // Y 4..7 in high words; stale low words are masked off below
-    punpcklwd  xmm2, xmm2       // Y 0..3 replicated across each dword
-    movdqu     xmm4, [edx]      // 8 dst pixels
-    movdqu     xmm5, [edx + 16]
-    pand       xmm2, xmm0       // keep Y in the alpha byte only
-    pand       xmm3, xmm0
-    pand       xmm4, xmm1       // keep dst low 3 bytes
-    pand       xmm5, xmm1
-    por        xmm2, xmm4
-    por        xmm3, xmm5
-    movdqu     [edx], xmm2
-    movdqu     [edx + 16], xmm3
-    lea        edx, [edx + 32]
-    sub        ecx, 8           // width -= 8
-    jg         convertloop      // loop while width > 0
-
-    ret
-  }
-}
-#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// Writes each src byte into the alpha byte of the matching dst pixel; width in pixels.
-__declspec(naked)
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // width
-    vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
-
-  convertloop:
-    vpmovzxbd  ymm1, qword ptr [eax]      // 8 src bytes zero-extended to dwords
-    vpmovzxbd  ymm2, qword ptr [eax + 8]
-    lea        eax, [eax + 16]
-    vpslld     ymm1, ymm1, 24             // move each byte to the alpha position
-    vpslld     ymm2, ymm2, 24
-    vpblendvb  ymm1, ymm1, [edx], ymm0    // dst low 3 bytes, new alpha
-    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
-    vmovdqu    [edx], ymm1
-    vmovdqu    [edx + 32], ymm2
-    lea        edx, [edx + 64]
-    sub        ecx, 16                    // width -= 16
-    jg         convertloop                // loop while width > 0
-
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-// Write 'count' bytes using an 8 bit value repeated, stored 4 bytes at a time.
-// Count should be multiple of 4.
-__declspec(naked)
-void SetRow_X86(uint8* dst, uint8 v8, int count) {
-  __asm {
-    movzx      eax, byte ptr [esp + 8]    // v8
-    mov        edx, 0x01010101  // Duplicate byte to all bytes.
-    mul        edx              // overwrites edx with upper part of result.
-    mov        edx, edi         // save edi in edx
-    mov        edi, [esp + 4]   // dst
-    mov        ecx, [esp + 12]  // count
-    shr        ecx, 2           // bytes -> dwords
-    rep stosd
-    mov        edi, edx         // restore edi
-    ret
-  }
-}
-
-// Write 'count' bytes using an 8 bit value repeated, via rep stosb.
-__declspec(naked)
-void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
-  __asm {
-    mov        edx, edi         // save edi in edx
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v8
-    mov        ecx, [esp + 12]  // count
-    rep stosb
-    mov        edi, edx         // restore edi
-    ret
-  }
-}
-
-// Write 'count' 32 bit values, via rep stosd.
-__declspec(naked)
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
-  __asm {
-    mov        edx, edi         // save edi in edx
-    mov        edi, [esp + 4]   // dst_argb
-    mov        eax, [esp + 8]   // v32
-    mov        ecx, [esp + 12]  // count
-    rep stosd
-    mov        edi, edx         // restore edi
-    ret
-  }
-}
-#endif  // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
-    vpsrlw     ymm5, ymm5, 8
-
-  convertloop:
-    vmovdqu    ymm0, [eax]        // read 64 source bytes
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // even bytes are Y
-    vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
-    vpermq     ymm0, ymm0, 0xd8   // reorder the qwords after the pack
-    vmovdqu    [edx], ymm0        // write 32 Y bytes
-    lea        edx, [edx + 32]
-    sub        ecx, 32            // width -= 32
-    jg         convertloop        // loop while width > 0
-    vzeroupper
-    ret
-  }
-}
-
-__declspec(naked)
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
-    vpsrlw     ymm5, ymm5, 8
-    sub        edi, edx
-
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
-    lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
-    vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
-    vpermq     ymm0, ymm0, 0xd8
-    vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
-    vpermq     ymm1, ymm1, 0xd8
-    vpermq     ymm0, ymm0, 0xd8
-    vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
-    lea        edx, [edx + 16]
-    sub        ecx, 32
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-
-__declspec(naked)
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
-    vpsrlw     ymm5, ymm5, 8
-    sub        edi, edx
-
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
-    vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
-    vpermq     ymm0, ymm0, 0xd8
-    vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
-    vpermq     ymm1, ymm1, 0xd8
-    vpermq     ymm0, ymm0, 0xd8
-    vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
-    lea        edx, [edx + 16]
-    sub        ecx, 32
-    jg         convertloop
-
-    pop        edi
-    vzeroupper
-    ret
-  }
-}
-
-__declspec(naked)
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
-    vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
-    vpermq     ymm0, ymm0, 0xd8
-    vmovdqu    [edx], ymm0
-    lea        edx, [edx + 32]
-    sub        ecx, 32
-    jg         convertloop
-    vzeroupper
-    ret
-  }
-}
-
-__declspec(naked)
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
-    vpsrlw     ymm5, ymm5, 8
-    sub        edi, edx
-
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
-    lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
-    vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
-    vpermq     ymm0, ymm0, 0xd8
-    vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
-    vpermq     ymm1, ymm1, 0xd8
-    vpermq     ymm0, ymm0, 0xd8
-    vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
-    lea        edx, [edx + 16]
-    sub        ecx, 32
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-
-__declspec(naked)
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
-    vpsrlw     ymm5, ymm5, 8
-    sub        edi, edx
-
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
-    vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
-    vpermq     ymm0, ymm0, 0xd8
-    vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
-    vpermq     ymm1, ymm1, 0xd8
-    vpermq     ymm0, ymm0, 0xd8
-    vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
-    lea        edx, [edx + 16]
-    sub        ecx, 32
-    jg         convertloop
-
-    pop        edi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_YUY2TOYROW_AVX2
-
-#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
-                     uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked)
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
-  __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked)
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-#endif  // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_BLENDPLANEROW_SSSE3
-// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
-  __asm {
-    push       esi
-    push       edi
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
-    psllw      xmm5, 8
-    mov        eax, 0x80808080  // 128 for biasing image to signed.
-    movd       xmm6, eax
-    pshufd     xmm6, xmm6, 0x00
-
-    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
-    movd       xmm7, eax
-    pshufd     xmm7, xmm7, 0x00
-    mov        eax, [esp + 8 + 4]   // src0
-    mov        edx, [esp + 8 + 8]   // src1
-    mov        esi, [esp + 8 + 12]  // alpha
-    mov        edi, [esp + 8 + 16]  // dst
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        eax, esi
-    sub        edx, esi
-    sub        edi, esi
-
-    // 8 pixel loop.
-  convertloop8:
-    movq       xmm0, qword ptr [esi]        // alpha
-    punpcklbw  xmm0, xmm0
-    pxor       xmm0, xmm5         // a, 255-a
-    movq       xmm1, qword ptr [eax + esi]  // src0
-    movq       xmm2, qword ptr [edx + esi]  // src1
-    punpcklbw  xmm1, xmm2
-    psubb      xmm1, xmm6         // bias src0/1 - 128
-    pmaddubsw  xmm0, xmm1
-    paddw      xmm0, xmm7         // unbias result - 32768 and round.
-    psrlw      xmm0, 8
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi + esi], xmm0
-    lea        esi, [esi + 8]
-    sub        ecx, 8
-    jg         convertloop8
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_BLENDPLANEROW_SSSE3
-
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
-  __asm {
-    push        esi
-    push        edi
-    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
-    vpsllw      ymm5, ymm5, 8
-    mov         eax, 0x80808080  // 128 for biasing image to signed.
-    vmovd       xmm6, eax
-    vbroadcastss ymm6, xmm6
-    mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
-    vmovd       xmm7, eax
-    vbroadcastss ymm7, xmm7
-    mov         eax, [esp + 8 + 4]   // src0
-    mov         edx, [esp + 8 + 8]   // src1
-    mov         esi, [esp + 8 + 12]  // alpha
-    mov         edi, [esp + 8 + 16]  // dst
-    mov         ecx, [esp + 8 + 20]  // width
-    sub         eax, esi
-    sub         edx, esi
-    sub         edi, esi
-
-    // 32 pixel loop.
-  convertloop32:
-    vmovdqu     ymm0, [esi]        // alpha
-    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
-    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
-    vpxor       ymm3, ymm3, ymm5   // a, 255-a
-    vpxor       ymm0, ymm0, ymm5   // a, 255-a
-    vmovdqu     ymm1, [eax + esi]  // src0
-    vmovdqu     ymm2, [edx + esi]  // src1
-    vpunpckhbw  ymm4, ymm1, ymm2
-    vpunpcklbw  ymm1, ymm1, ymm2
-    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
-    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
-    vpmaddubsw  ymm3, ymm3, ymm4
-    vpmaddubsw  ymm0, ymm0, ymm1
-    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
-    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
-    vpsrlw      ymm3, ymm3, 8
-    vpsrlw      ymm0, ymm0, 8
-    vpackuswb   ymm0, ymm0, ymm3
-    vmovdqu     [edi + esi], ymm0
-    lea         esi, [esi + 32]
-    sub         ecx, 32
-    jg          convertloop32
-
-    pop         edi
-    pop         esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_BLENDPLANEROW_AVX2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-
-// Blend 8 pixels at a time.
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
-    psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
-    psrlw      xmm6, 8
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
-    psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
-    pslld      xmm4, 24
-    sub        ecx, 4
-    jl         convertloop4b    // less than 4 pixels?
-
-    // 4 pixel loop.
-  convertloop4:
-    movdqu     xmm3, [eax]      // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jge        convertloop4
-
-  convertloop4b:
-    add        ecx, 4 - 1
-    jl         convertloop1b
-
-    // 1 pixel loop.
-  convertloop1:
-    movd       xmm3, [eax]      // src argb
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jge        convertloop1
-
-  convertloop1b:
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
-};
-static const uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
-};
-__declspec(naked)
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
-    pslld      xmm3, 24
-    movdqa     xmm4, xmmword ptr kShuffleAlpha0
-    movdqa     xmm5, xmmword ptr kShuffleAlpha1
-
- convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqu     xmm1, [eax]      // read 4 pixels
-    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
-    pmulhuw    xmm0, xmm1       // rgb * a
-    movdqu     xmm1, [eax]      // read 4 pixels
-    pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqu     xmm2, [eax]      // read 4 pixels
-    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqu     xmm2, [eax]      // mask original alpha
-    lea        eax, [eax + 16]
-    pand       xmm2, xmm3
-    psrlw      xmm0, 8
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    por        xmm0, xmm2       // copy original alpha
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-__declspec(naked)
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
-    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
-    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
-    vpslld     ymm5, ymm5, 24
-
- convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
-    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
-    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
-    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
-    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
-    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
-    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
-    vpand      ymm6, ymm6, ymm5  // isolate alpha
-    vpsrlw     ymm0, ymm0, 8
-    vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
-    vpor       ymm0, ymm0, ymm6  // copy original alpha
-    vmovdqu    [eax + edx], ymm0
-    lea        eax, [eax + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
-    mov        ecx, [esp + 12 + 12]  // width
-    lea        ebx, fixed_invtbl8
-
- convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    movzx      esi, byte ptr [eax + 3]  // first alpha
-    movzx      edi, byte ptr [eax + 7]  // second alpha
-    punpcklbw  xmm0, xmm0       // first 2
-    movd       xmm2, dword ptr [ebx + esi * 4]
-    movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
-    movlhps    xmm2, xmm3
-    pmulhuw    xmm0, xmm2       // rgb * a
-
-    movdqu     xmm1, [eax]      // read 4 pixels
-    movzx      esi, byte ptr [eax + 11]  // third alpha
-    movzx      edi, byte ptr [eax + 15]  // forth alpha
-    punpckhbw  xmm1, xmm1       // next 2
-    movd       xmm2, dword ptr [ebx + esi * 4]
-    movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
-    movlhps    xmm2, xmm3
-    pmulhuw    xmm1, xmm2       // rgb * a
-    lea        eax, [eax + 16]
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-#endif  // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
-// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
-// USE_GATHER is not on by default, due to being a slow instruction.
-#ifdef USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
-    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
-
- convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
-    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
-    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
-    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
-    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
-    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
-    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
-    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
-    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
-    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
-    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
-    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
-    vmovdqu    [eax + edx], ymm0
-    lea        eax, [eax + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    vzeroupper
-    ret
-  }
-}
-#else  // USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
-  __asm {
-
-    push       ebx
-    push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
-    mov        ecx, [esp + 12 + 12]  // width
-    sub        edx, eax
-    lea        ebx, fixed_invtbl8
-    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
-
- convertloop:
-    // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]                 // alpha0
-    movzx      edi, byte ptr [eax + 7]                 // alpha1
-    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
-    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11]                // alpha2
-    movzx      edi, byte ptr [eax + 15]                // alpha3
-    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
-    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
-    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19]                // alpha4
-    movzx      edi, byte ptr [eax + 23]                // alpha5
-    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
-    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
-    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27]                // alpha6
-    movzx      edi, byte ptr [eax + 31]                // alpha7
-    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
-    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
-    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
-    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
-    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
-    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
-    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
-    // end of VPGATHER
-
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
-    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
-    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
-    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
-    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
-    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
-    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
-    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
-    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
-    vmovdqu    [eax + edx], ymm0
-    lea        eax, [eax + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    pop        ebx
-    vzeroupper
-    ret
-  }
-}
-#endif  // USE_GATHER
-#endif  // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked)
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, xmmword ptr kARGBToYJ
-    movdqa     xmm5, xmmword ptr kAddYJ64
-
- convertloop:
-    movdqu     xmm0, [eax]  // G
-    movdqu     xmm1, [eax + 16]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    phaddw     xmm0, xmm1
-    paddw      xmm0, xmm5  // Add .5 for rounding.
-    psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 G bytes
-    movdqu     xmm2, [eax]  // A
-    movdqu     xmm3, [eax + 16]
-    lea        eax, [eax + 32]
-    psrld      xmm2, 24
-    psrld      xmm3, 24
-    packuswb   xmm2, xmm3
-    packuswb   xmm2, xmm2   // 8 A bytes
-    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
-    punpcklbw  xmm0, xmm0   // 8 GG words
-    punpcklbw  xmm3, xmm2   // 8 GA words
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm3   // GGGA first 4
-    punpckhwd  xmm1, xmm3   // GGGA next 4
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-    ret
-  }
-}
-#endif  // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-//    b = (r * 35 + g * 68 + b * 17) >> 7
-//    g = (r * 45 + g * 88 + b * 22) >> 7
-//    r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
-
-static const vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
-
-static const vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked)
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        ecx, [esp + 8]   /* width */
-    movdqa     xmm2, xmmword ptr kARGBToSepiaB
-    movdqa     xmm3, xmmword ptr kARGBToSepiaG
-    movdqa     xmm4, xmmword ptr kARGBToSepiaR
-
- convertloop:
-    movdqu     xmm0, [eax]  // B
-    movdqu     xmm6, [eax + 16]
-    pmaddubsw  xmm0, xmm2
-    pmaddubsw  xmm6, xmm2
-    phaddw     xmm0, xmm6
-    psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 B values
-    movdqu     xmm5, [eax]  // G
-    movdqu     xmm1, [eax + 16]
-    pmaddubsw  xmm5, xmm3
-    pmaddubsw  xmm1, xmm3
-    phaddw     xmm5, xmm1
-    psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
-    movdqu     xmm5, [eax]  // R
-    movdqu     xmm1, [eax + 16]
-    pmaddubsw  xmm5, xmm4
-    pmaddubsw  xmm1, xmm4
-    phaddw     xmm5, xmm1
-    psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
-    movdqu     xmm6, [eax]  // A
-    movdqu     xmm1, [eax + 16]
-    psrld      xmm6, 24
-    psrld      xmm1, 24
-    packuswb   xmm6, xmm1
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
-    movdqu     [eax], xmm0
-    movdqu     [eax + 16], xmm1
-    lea        eax, [eax + 32]
-    sub        ecx, 8
-    jg         convertloop
-    ret
-  }
-}
-#endif  // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
-// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked)
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* matrix_argb */
-    movdqu     xmm5, [ecx]
-    pshufd     xmm2, xmm5, 0x00
-    pshufd     xmm3, xmm5, 0x55
-    pshufd     xmm4, xmm5, 0xaa
-    pshufd     xmm5, xmm5, 0xff
-    mov        ecx, [esp + 16]  /* width */
-
- convertloop:
-    movdqu     xmm0, [eax]  // B
-    movdqu     xmm7, [eax + 16]
-    pmaddubsw  xmm0, xmm2
-    pmaddubsw  xmm7, xmm2
-    movdqu     xmm6, [eax]  // G
-    movdqu     xmm1, [eax + 16]
-    pmaddubsw  xmm6, xmm3
-    pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm7   // B
-    phaddsw    xmm6, xmm1   // G
-    psraw      xmm0, 6      // B
-    psraw      xmm6, 6      // G
-    packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm6, xmm6   // 8 G values
-    punpcklbw  xmm0, xmm6   // 8 BG values
-    movdqu     xmm1, [eax]  // R
-    movdqu     xmm7, [eax + 16]
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm7, xmm4
-    phaddsw    xmm1, xmm7   // R
-    movdqu     xmm6, [eax]  // A
-    movdqu     xmm7, [eax + 16]
-    pmaddubsw  xmm6, xmm5
-    pmaddubsw  xmm7, xmm5
-    phaddsw    xmm6, xmm7   // A
-    psraw      xmm1, 6      // R
-    psraw      xmm6, 6      // A
-    packuswb   xmm1, xmm1   // 8 R values
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm1, xmm6   // 8 RA values
-    movdqa     xmm6, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm1   // BGRA first 4
-    punpckhwd  xmm6, xmm1   // BGRA next 4
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm6
-    lea        eax, [eax + 32]
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-    ret
-  }
-}
-#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked)
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  __asm {
-    mov        eax, [esp + 4]    /* dst_argb */
-    movd       xmm2, [esp + 8]   /* scale */
-    movd       xmm3, [esp + 12]  /* interval_size */
-    movd       xmm4, [esp + 16]  /* interval_offset */
-    mov        ecx, [esp + 20]   /* width */
-    pshuflw    xmm2, xmm2, 040h
-    pshufd     xmm2, xmm2, 044h
-    pshuflw    xmm3, xmm3, 040h
-    pshufd     xmm3, xmm3, 044h
-    pshuflw    xmm4, xmm4, 040h
-    pshufd     xmm4, xmm4, 044h
-    pxor       xmm5, xmm5  // constant 0
-    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
-    pslld      xmm6, 24
-
- convertloop:
-    movdqu     xmm0, [eax]  // read 4 pixels
-    punpcklbw  xmm0, xmm5   // first 2 pixels
-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
-    movdqu     xmm1, [eax]  // read 4 pixels
-    punpckhbw  xmm1, xmm5   // next 2 pixels
-    pmulhuw    xmm1, xmm2
-    pmullw     xmm0, xmm3   // * interval_size
-    movdqu     xmm7, [eax]  // read 4 pixels
-    pmullw     xmm1, xmm3
-    pand       xmm7, xmm6   // mask alpha
-    paddw      xmm0, xmm4   // + interval_size / 2
-    paddw      xmm1, xmm4
-    packuswb   xmm0, xmm1
-    por        xmm0, xmm7
-    movdqu     [eax], xmm0
-    lea        eax, [eax + 16]
-    sub        ecx, 4
-    jg         convertloop
-    ret
-  }
-}
-#endif  // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-__declspec(naked)
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    movd       xmm2, [esp + 16]  // value
-    punpcklbw  xmm2, xmm2
-    punpcklqdq xmm2, xmm2
-
- convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
-    psrlw      xmm0, 8
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pxor       xmm5, xmm5  // constant 0
-
- convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
-    movdqu     xmm1, xmm0
-    movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0         // first 2
-    punpckhbw  xmm1, xmm1         // next 2
-    punpcklbw  xmm2, xmm5         // first 2
-    punpckhbw  xmm3, xmm5         // next 2
-    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
-    lea        eax, [eax + 16]
-    lea        esi, [esi + 16]
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked)
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-
-    sub        ecx, 4
-    jl         convertloop49
-
- convertloop4:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
-    lea        esi, [esi + 16]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jge        convertloop4
-
- convertloop49:
-    add        ecx, 4 - 1
-    jl         convertloop19
-
- convertloop1:
-    movd       xmm0, [eax]        // read 1 pixels from src_argb0
-    lea        eax, [eax + 4]
-    movd       xmm1, [esi]        // read 1 pixels from src_argb1
-    lea        esi, [esi + 4]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jge        convertloop1
-
- convertloop19:
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-
- convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
-    lea        esi, [esi + 16]
-    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
-
- convertloop:
-    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
-    lea        eax, [eax + 32]
-    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
-    lea        esi, [esi + 32]
-    vpunpcklbw ymm0, ymm1, ymm1   // low 4
-    vpunpckhbw ymm1, ymm1, ymm1   // high 4
-    vpunpcklbw ymm2, ymm3, ymm5   // low 4
-    vpunpckhbw ymm3, ymm3, ymm5   // high 4
-    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
-    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
-    vpackuswb  ymm0, ymm0, ymm1
-    vmovdqu    [edx], ymm0
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBMULTIPLYROW_AVX2
-
-#ifdef HAS_ARGBADDROW_AVX2
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-
- convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
-    lea        eax, [eax + 32]
-    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
-    lea        esi, [esi + 32]
-    vmovdqu    [edx], ymm0
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBADDROW_AVX2
-
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-
- convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
-    lea        eax, [eax + 32]
-    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
-    lea        esi, [esi + 32]
-    vmovdqu    [edx], ymm0
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBSUBTRACTROW_AVX2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is
-// -1  0  1
-// -2  0  2
-// -1  0  1
-__declspec(naked)
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_y0
-    mov        esi, [esp + 8 + 8]   // src_y1
-    mov        edi, [esp + 8 + 12]  // src_y2
-    mov        edx, [esp + 8 + 16]  // dst_sobelx
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        esi, eax
-    sub        edi, eax
-    sub        edx, eax
-    pxor       xmm5, xmm5  // constant 0
-
- convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
-    punpcklbw  xmm0, xmm5
-    punpcklbw  xmm1, xmm5
-    psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
-    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
-    punpcklbw  xmm1, xmm5
-    punpcklbw  xmm2, xmm5
-    psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
-    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
-    punpcklbw  xmm2, xmm5
-    punpcklbw  xmm3, xmm5
-    psubw      xmm2, xmm3
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm1
-    paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
-    psubw      xmm1, xmm0
-    pmaxsw     xmm0, xmm1
-    packuswb   xmm0, xmm0
-    movq       qword ptr [eax + edx], xmm0
-    lea        eax, [eax + 8]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is
-// -1 -2 -1
-//  0  0  0
-//  1  2  1
-__declspec(naked)
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_y0
-    mov        esi, [esp + 4 + 8]   // src_y1
-    mov        edx, [esp + 4 + 12]  // dst_sobely
-    mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax
-    pxor       xmm5, xmm5  // constant 0
-
- convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
-    punpcklbw  xmm0, xmm5
-    punpcklbw  xmm1, xmm5
-    psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
-    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
-    punpcklbw  xmm1, xmm5
-    punpcklbw  xmm2, xmm5
-    psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
-    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
-    punpcklbw  xmm2, xmm5
-    punpcklbw  xmm3, xmm5
-    psubw      xmm2, xmm3
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm1
-    paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
-    psubw      xmm1, xmm0
-    pmaxsw     xmm0, xmm1
-    packuswb   xmm0, xmm0
-    movq       qword ptr [eax + edx], xmm0
-    lea        eax, [eax + 8]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_SOBELYROW_SSE2
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-__declspec(naked)
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
-    pslld      xmm5, 24             // 0xff000000
-
- convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
-    lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm2, xmm0             // GG
-    punpcklbw  xmm2, xmm0             // First 8
-    punpckhbw  xmm0, xmm0             // Next 8
-    movdqa     xmm1, xmm2             // GGGG
-    punpcklwd  xmm1, xmm2             // First 4
-    punpckhwd  xmm2, xmm2             // Next 4
-    por        xmm1, xmm5             // GGGA
-    por        xmm2, xmm5
-    movdqa     xmm3, xmm0             // GGGG
-    punpcklwd  xmm3, xmm0             // Next 4
-    punpckhwd  xmm0, xmm0             // Last 4
-    por        xmm3, xmm5             // GGGA
-    por        xmm0, xmm5
-    movdqu     [edx], xmm1
-    movdqu     [edx + 16], xmm2
-    movdqu     [edx + 32], xmm3
-    movdqu     [edx + 48], xmm0
-    lea        edx, [edx + 64]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked)
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-
- convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
-    lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-__declspec(naked)
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
-
- convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
-    lea        eax, [eax + 16]
-    movdqa     xmm2, xmm0
-    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm3, xmm0             // XA
-    punpcklbw  xmm3, xmm5
-    punpckhbw  xmm0, xmm5
-    movdqa     xmm4, xmm1             // YS
-    punpcklbw  xmm4, xmm2
-    punpckhbw  xmm1, xmm2
-    movdqa     xmm6, xmm4             // YSXA
-    punpcklwd  xmm6, xmm3             // First 4
-    punpckhwd  xmm4, xmm3             // Next 4
-    movdqa     xmm7, xmm1             // YSXA
-    punpcklwd  xmm7, xmm0             // Next 4
-    punpckhwd  xmm1, xmm0             // Last 4
-    movdqu     [edx], xmm6
-    movdqu     [edx + 16], xmm4
-    movdqu     [edx + 32], xmm7
-    movdqu     [edx + 48], xmm1
-    lea        edx, [edx + 64]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-// Consider float CumulativeSum.
-// Consider calling CumulativeSum one row at time as needed.
-// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
-// Convert cumulative sum for an area to an average for 1 pixel.
-// topleft is pointer to top left of CumulativeSum buffer for area.
-// botleft is pointer to bottom left of CumulativeSum buffer.
-// width is offset from left to right of area in CumulativeSum buffer measured
-//   in number of ints.
-// area is the number of pixels in the area being averaged.
-// dst points to pixel to store result to.
-// count is number of averaged pixels to produce.
-// Does 4 pixels at a time.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
-                                    int count) {
-  __asm {
-    mov        eax, topleft  // eax topleft
-    mov        esi, botleft  // esi botleft
-    mov        edx, width
-    movd       xmm5, area
-    mov        edi, dst
-    mov        ecx, count
-    cvtdq2ps   xmm5, xmm5
-    rcpss      xmm4, xmm5  // 1.0f / area
-    pshufd     xmm4, xmm4, 0
-    sub        ecx, 4
-    jl         l4b
-
-    cmp        area, 128  // 128 pixels will not overflow 15 bits.
-    ja         l4
-
-    pshufd     xmm5, xmm5, 0        // area
-    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
-    psrld      xmm6, 16
-    cvtdq2ps   xmm6, xmm6
-    addps      xmm5, xmm6           // (65536.0 + area - 1)
-    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
-    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
-    packssdw   xmm5, xmm5           // 16 bit shorts
-
-    // 4 pixel loop small blocks.
-  s4:
-    // top left
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-
-    // - top right
-    psubd      xmm0, [eax + edx * 4]
-    psubd      xmm1, [eax + edx * 4 + 16]
-    psubd      xmm2, [eax + edx * 4 + 32]
-    psubd      xmm3, [eax + edx * 4 + 48]
-    lea        eax, [eax + 64]
-
-    // - bottom left
-    psubd      xmm0, [esi]
-    psubd      xmm1, [esi + 16]
-    psubd      xmm2, [esi + 32]
-    psubd      xmm3, [esi + 48]
-
-    // + bottom right
-    paddd      xmm0, [esi + edx * 4]
-    paddd      xmm1, [esi + edx * 4 + 16]
-    paddd      xmm2, [esi + edx * 4 + 32]
-    paddd      xmm3, [esi + edx * 4 + 48]
-    lea        esi, [esi + 64]
-
-    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
-    packssdw   xmm2, xmm3
-
-    pmulhuw    xmm0, xmm5
-    pmulhuw    xmm2, xmm5
-
-    packuswb   xmm0, xmm2
-    movdqu     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 4
-    jge        s4
-
-    jmp        l4b
-
-    // 4 pixel loop
-  l4:
-    // top left
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-
-    // - top right
-    psubd      xmm0, [eax + edx * 4]
-    psubd      xmm1, [eax + edx * 4 + 16]
-    psubd      xmm2, [eax + edx * 4 + 32]
-    psubd      xmm3, [eax + edx * 4 + 48]
-    lea        eax, [eax + 64]
-
-    // - bottom left
-    psubd      xmm0, [esi]
-    psubd      xmm1, [esi + 16]
-    psubd      xmm2, [esi + 32]
-    psubd      xmm3, [esi + 48]
-
-    // + bottom right
-    paddd      xmm0, [esi + edx * 4]
-    paddd      xmm1, [esi + edx * 4 + 16]
-    paddd      xmm2, [esi + edx * 4 + 32]
-    paddd      xmm3, [esi + edx * 4 + 48]
-    lea        esi, [esi + 64]
-
-    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
-    cvtdq2ps   xmm1, xmm1
-    mulps      xmm0, xmm4
-    mulps      xmm1, xmm4
-    cvtdq2ps   xmm2, xmm2
-    cvtdq2ps   xmm3, xmm3
-    mulps      xmm2, xmm4
-    mulps      xmm3, xmm4
-    cvtps2dq   xmm0, xmm0
-    cvtps2dq   xmm1, xmm1
-    cvtps2dq   xmm2, xmm2
-    cvtps2dq   xmm3, xmm3
-    packssdw   xmm0, xmm1
-    packssdw   xmm2, xmm3
-    packuswb   xmm0, xmm2
-    movdqu     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 4
-    jge        l4
-
-  l4b:
-    add        ecx, 4 - 1
-    jl         l1b
-
-    // 1 pixel loop
-  l1:
-    movdqu     xmm0, [eax]
-    psubd      xmm0, [eax + edx * 4]
-    lea        eax, [eax + 16]
-    psubd      xmm0, [esi]
-    paddd      xmm0, [esi + edx * 4]
-    lea        esi, [esi + 16]
-    cvtdq2ps   xmm0, xmm0
-    mulps      xmm0, xmm4
-    cvtps2dq   xmm0, xmm0
-    packssdw   xmm0, xmm0
-    packuswb   xmm0, xmm0
-    movd       dword ptr [edi], xmm0
-    lea        edi, [edi + 4]
-    sub        ecx, 1
-    jge        l1
-  l1b:
-  }
-}
-#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
-  __asm {
-    mov        eax, row
-    mov        edx, cumsum
-    mov        esi, previous_cumsum
-    mov        ecx, width
-    pxor       xmm0, xmm0
-    pxor       xmm1, xmm1
-
-    sub        ecx, 4
-    jl         l4b
-    test       edx, 15
-    jne        l4b
-
-    // 4 pixel loop
-  l4:
-    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
-    lea        eax, [eax + 16]
-    movdqa     xmm4, xmm2
-
-    punpcklbw  xmm2, xmm1
-    movdqa     xmm3, xmm2
-    punpcklwd  xmm2, xmm1
-    punpckhwd  xmm3, xmm1
-
-    punpckhbw  xmm4, xmm1
-    movdqa     xmm5, xmm4
-    punpcklwd  xmm4, xmm1
-    punpckhwd  xmm5, xmm1
-
-    paddd      xmm0, xmm2
-    movdqu     xmm2, [esi]  // previous row above.
-    paddd      xmm2, xmm0
-
-    paddd      xmm0, xmm3
-    movdqu     xmm3, [esi + 16]
-    paddd      xmm3, xmm0
-
-    paddd      xmm0, xmm4
-    movdqu     xmm4, [esi + 32]
-    paddd      xmm4, xmm0
-
-    paddd      xmm0, xmm5
-    movdqu     xmm5, [esi + 48]
-    lea        esi, [esi + 64]
-    paddd      xmm5, xmm0
-
-    movdqu     [edx], xmm2
-    movdqu     [edx + 16], xmm3
-    movdqu     [edx + 32], xmm4
-    movdqu     [edx + 48], xmm5
-
-    lea        edx, [edx + 64]
-    sub        ecx, 4
-    jge        l4
-
-  l4b:
-    add        ecx, 4 - 1
-    jl         l1b
-
-    // 1 pixel loop
-  l1:
-    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
-    lea        eax, [eax + 4]
-    punpcklbw  xmm2, xmm1
-    punpcklwd  xmm2, xmm1
-    paddd      xmm0, xmm2
-    movdqu     xmm2, [esi]
-    lea        esi, [esi + 16]
-    paddd      xmm2, xmm0
-    movdqu     [edx], xmm2
-    lea        edx, [edx + 16]
-    sub        ecx, 1
-    jge        l1
-
- l1b:
-  }
-}
-#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked)
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 12]  // src_argb
-    mov        esi, [esp + 16]  // stride
-    mov        edx, [esp + 20]  // dst_argb
-    mov        ecx, [esp + 24]  // pointer to uv_dudv
-    movq       xmm2, qword ptr [ecx]  // uv
-    movq       xmm7, qword ptr [ecx + 8]  // dudv
-    mov        ecx, [esp + 28]  // width
-    shl        esi, 16          // 4, stride
-    add        esi, 4
-    movd       xmm5, esi
-    sub        ecx, 4
-    jl         l4b
-
-    // setup for 4 pixel loop
-    pshufd     xmm7, xmm7, 0x44  // dup dudv
-    pshufd     xmm5, xmm5, 0  // dup 4, stride
-    movdqa     xmm0, xmm2    // x0, y0, x1, y1
-    addps      xmm0, xmm7
-    movlhps    xmm2, xmm0
-    movdqa     xmm4, xmm7
-    addps      xmm4, xmm4    // dudv *= 2
-    movdqa     xmm3, xmm2    // x2, y2, x3, y3
-    addps      xmm3, xmm4
-    addps      xmm4, xmm4    // dudv *= 4
-
-    // 4 pixel loop
-  l4:
-    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
-    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
-    packssdw   xmm0, xmm1    // x, y as 8 shorts
-    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
-    movd       esi, xmm0
-    pshufd     xmm0, xmm0, 0x39  // shift right
-    movd       edi, xmm0
-    pshufd     xmm0, xmm0, 0x39  // shift right
-    movd       xmm1, [eax + esi]  // read pixel 0
-    movd       xmm6, [eax + edi]  // read pixel 1
-    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
-    addps      xmm2, xmm4    // x, y += dx, dy first 2
-    movq       qword ptr [edx], xmm1
-    movd       esi, xmm0
-    pshufd     xmm0, xmm0, 0x39  // shift right
-    movd       edi, xmm0
-    movd       xmm6, [eax + esi]  // read pixel 2
-    movd       xmm0, [eax + edi]  // read pixel 3
-    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
-    addps      xmm3, xmm4    // x, y += dx, dy next 2
-    movq       qword ptr 8[edx], xmm6
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jge        l4
-
-  l4b:
-    add        ecx, 4 - 1
-    jl         l1b
-
-    // 1 pixel loop
-  l1:
-    cvttps2dq  xmm0, xmm2    // x, y float to int
-    packssdw   xmm0, xmm0    // x, y as shorts
-    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
-    addps      xmm2, xmm7    // x, y += dx, dy
-    movd       esi, xmm0
-    movd       xmm0, [eax + esi]  // copy a pixel
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jge        l1
-  l1b:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 32x2 -> 32x1
-__declspec(naked)
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 / 256.  Blend 100 / 0.
-    sub        edi, esi
-    cmp        eax, 128
-    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
-
-    vmovd      xmm0, eax  // high fraction 0..255
-    neg        eax
-    add        eax, 256
-    vmovd      xmm5, eax  // low fraction 256..1
-    vpunpcklbw xmm5, xmm5, xmm0
-    vpunpcklwd xmm5, xmm5, xmm5
-    vbroadcastss ymm5, xmm5
-
-    mov        eax, 0x80808080  // 128b for bias and rounding.
-    vmovd      xmm4, eax
-    vbroadcastss ymm4, xmm4
-
-  xloop:
-    vmovdqu    ymm0, [esi]
-    vmovdqu    ymm2, [esi + edx]
-    vpunpckhbw ymm1, ymm0, ymm2  // mutates
-    vpunpcklbw ymm0, ymm0, ymm2
-    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
-    vpsubb     ymm0, ymm0, ymm4
-    vpmaddubsw ymm1, ymm5, ymm1
-    vpmaddubsw ymm0, ymm5, ymm0
-    vpaddw     ymm1, ymm1, ymm4  // unbias and round
-    vpaddw     ymm0, ymm0, ymm4
-    vpsrlw     ymm1, ymm1, 8
-    vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
-    vmovdqu    [esi + edi], ymm0
-    lea        esi, [esi + 32]
-    sub        ecx, 32
-    jg         xloop
-    jmp        xloop99
-
-   // Blend 50 / 50.
- xloop50:
-   vmovdqu    ymm0, [esi]
-   vpavgb     ymm0, ymm0, [esi + edx]
-   vmovdqu    [esi + edi], ymm0
-   lea        esi, [esi + 32]
-   sub        ecx, 32
-   jg         xloop50
-   jmp        xloop99
-
-   // Blend 100 / 0 - Copy row unchanged.
- xloop100:
-   rep movsb
-
-  xloop99:
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_INTERPOLATEROW_AVX2
-
-// Bilinear filter 16x2 -> 16x1
-// TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked)
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 /256.  Blend 100 / 0.
-    cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
-
-    movd       xmm0, eax  // high fraction 0..255
-    neg        eax
-    add        eax, 256
-    movd       xmm5, eax  // low fraction 255..1
-    punpcklbw  xmm5, xmm0
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-    mov        eax, 0x80808080  // 128 for biasing image to signed.
-    movd       xmm4, eax
-    pshufd     xmm4, xmm4, 0x00
-
-  xloop:
-    movdqu     xmm0, [esi]
-    movdqu     xmm2, [esi + edx]
-    movdqu     xmm1, xmm0
-    punpcklbw  xmm0, xmm2
-    punpckhbw  xmm1, xmm2
-    psubb      xmm0, xmm4  // bias image by -128
-    psubb      xmm1, xmm4
-    movdqa     xmm2, xmm5
-    movdqa     xmm3, xmm5
-    pmaddubsw  xmm2, xmm0
-    pmaddubsw  xmm3, xmm1
-    paddw      xmm2, xmm4
-    paddw      xmm3, xmm4
-    psrlw      xmm2, 8
-    psrlw      xmm3, 8
-    packuswb   xmm2, xmm3
-    movdqu     [esi + edi], xmm2
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop
-    jmp        xloop99
-
-    // Blend 50 / 50.
-  xloop50:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop50
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
-  xloop100:
-    movdqu     xmm0, [esi]
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop100
-
-  xloop99:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked)
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
-    movdqu     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm5
-    pshufb     xmm1, xmm5
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         wloop
-    ret
-  }
-}
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  __asm {
-    mov        eax, [esp + 4]     // src_argb
-    mov        edx, [esp + 8]     // dst_argb
-    mov        ecx, [esp + 12]    // shuffler
-    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
-    mov        ecx, [esp + 16]    // width
-
-  wloop:
-    vmovdqu    ymm0, [eax]
-    vmovdqu    ymm1, [eax + 32]
-    lea        eax, [eax + 64]
-    vpshufb    ymm0, ymm0, ymm5
-    vpshufb    ymm1, ymm1, ymm5
-    vmovdqu    [edx], ymm0
-    vmovdqu    [edx + 32], ymm1
-    lea        edx, [edx + 64]
-    sub        ecx, 16
-    jg         wloop
-
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBSHUFFLEROW_AVX2
-
-__declspec(naked)
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  __asm {
-    push       ebx
-    push       esi
-    mov        eax, [esp + 8 + 4]    // src_argb
-    mov        edx, [esp + 8 + 8]    // dst_argb
-    mov        esi, [esp + 8 + 12]   // shuffler
-    mov        ecx, [esp + 8 + 16]   // width
-    pxor       xmm5, xmm5
-
-    mov        ebx, [esi]   // shuffler
-    cmp        ebx, 0x03000102
-    je         shuf_3012
-    cmp        ebx, 0x00010203
-    je         shuf_0123
-    cmp        ebx, 0x00030201
-    je         shuf_0321
-    cmp        ebx, 0x02010003
-    je         shuf_2103
-
-  // TODO(fbarchard): Use one source pointer and 3 offsets.
-  shuf_any1:
-    movzx      ebx, byte ptr [esi]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx], bl
-    movzx      ebx, byte ptr [esi + 1]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 1], bl
-    movzx      ebx, byte ptr [esi + 2]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 2], bl
-    movzx      ebx, byte ptr [esi + 3]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 3], bl
-    lea        eax, [eax + 4]
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jg         shuf_any1
-    jmp        shuf99
-
-  shuf_0123:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
-    pshuflw    xmm0, xmm0, 01Bh
-    pshufhw    xmm1, xmm1, 01Bh
-    pshuflw    xmm1, xmm1, 01Bh
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0123
-    jmp        shuf99
-
-  shuf_0321:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
-    pshuflw    xmm0, xmm0, 039h
-    pshufhw    xmm1, xmm1, 039h
-    pshuflw    xmm1, xmm1, 039h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0321
-    jmp        shuf99
-
-  shuf_2103:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
-    pshuflw    xmm0, xmm0, 093h
-    pshufhw    xmm1, xmm1, 093h
-    pshuflw    xmm1, xmm1, 093h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_2103
-    jmp        shuf99
-
-  shuf_3012:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
-    pshuflw    xmm0, xmm0, 0C6h
-    pshufhw    xmm1, xmm1, 0C6h
-    pshuflw    xmm1, xmm1, 0C6h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_3012
-
-  shuf99:
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
-// YUY2 - Macro-pixel = 2 image pixels
-// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-
-// UYVY - Macro-pixel = 2 image pixels
-// U0Y0V0Y1
-
-__declspec(naked)
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
-    sub        edx, esi
-
-  convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
-    lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2 // YUYV
-    punpckhbw  xmm1, xmm2
-    movdqu     [edi], xmm0
-    movdqu     [edi + 16], xmm1
-    lea        edi, [edi + 32]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
-    sub        edx, esi
-
-  convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
-    lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
-    movdqa     xmm1, xmm2
-    lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0 // UYVY
-    punpckhbw  xmm2, xmm0
-    movdqu     [edi], xmm1
-    movdqu     [edi + 16], xmm2
-    lea        edi, [edi + 32]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked)
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   /* src_argb */
-    mov        edx, [esp + 4 + 8]   /* dst_argb */
-    mov        esi, [esp + 4 + 12]  /* poly */
-    mov        ecx, [esp + 4 + 16]  /* width */
-    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
-
-    // 2 pixel loop.
- convertloop:
-//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
-    movq       xmm0, qword ptr [eax]  // BGRABGRA
-    lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm3
-    movdqa     xmm4, xmm0
-    punpcklwd  xmm0, xmm3  // pixel 0
-    punpckhwd  xmm4, xmm3  // pixel 1
-    cvtdq2ps   xmm0, xmm0  // 4 floats
-    cvtdq2ps   xmm4, xmm4
-    movdqa     xmm1, xmm0  // X
-    movdqa     xmm5, xmm4
-    mulps      xmm0, [esi + 16]  // C1 * X
-    mulps      xmm4, [esi + 16]
-    addps      xmm0, [esi]  // result = C0 + C1 * X
-    addps      xmm4, [esi]
-    movdqa     xmm2, xmm1
-    movdqa     xmm6, xmm5
-    mulps      xmm2, xmm1  // X * X
-    mulps      xmm6, xmm5
-    mulps      xmm1, xmm2  // X * X * X
-    mulps      xmm5, xmm6
-    mulps      xmm2, [esi + 32]  // C2 * X * X
-    mulps      xmm6, [esi + 32]
-    mulps      xmm1, [esi + 48]  // C3 * X * X * X
-    mulps      xmm5, [esi + 48]
-    addps      xmm0, xmm2  // result += C2 * X * X
-    addps      xmm4, xmm6
-    addps      xmm0, xmm1  // result += C3 * X * X * X
-    addps      xmm4, xmm5
-    cvttps2dq  xmm0, xmm0
-    cvttps2dq  xmm4, xmm4
-    packuswb   xmm0, xmm4
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    sub        ecx, 2
-    jg         convertloop
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked)
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]   /* poly */
-    vbroadcastf128 ymm4, [ecx]       // C0
-    vbroadcastf128 ymm5, [ecx + 16]  // C1
-    vbroadcastf128 ymm6, [ecx + 32]  // C2
-    vbroadcastf128 ymm7, [ecx + 48]  // C3
-    mov        ecx, [esp + 16]  /* width */
-
-    // 2 pixel loop.
- convertloop:
-    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
-    lea         eax, [eax + 8]
-    vcvtdq2ps   ymm0, ymm0        // X 8 floats
-    vmulps      ymm2, ymm0, ymm0  // X * X
-    vmulps      ymm3, ymm0, ymm7  // C3 * X
-    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
-    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
-    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
-    vcvttps2dq  ymm0, ymm0
-    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
-    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
-    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
-    vmovq       qword ptr [edx], xmm0
-    lea         edx, [edx + 8]
-    sub         ecx, 2
-    jg          convertloop
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-__declspec(naked)
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
-
-    // 1 pixel loop.
-  convertloop:
-    movzx      edx, byte ptr [eax]
-    lea        eax, [eax + 4]
-    movzx      edx, byte ptr [esi + edx * 4]
-    mov        byte ptr [eax - 4], dl
-    movzx      edx, byte ptr [eax - 4 + 1]
-    movzx      edx, byte ptr [esi + edx * 4 + 1]
-    mov        byte ptr [eax - 4 + 1], dl
-    movzx      edx, byte ptr [eax - 4 + 2]
-    movzx      edx, byte ptr [esi + edx * 4 + 2]
-    mov        byte ptr [eax - 4 + 2], dl
-    movzx      edx, byte ptr [eax - 4 + 3]
-    movzx      edx, byte ptr [esi + edx * 4 + 3]
-    mov        byte ptr [eax - 4 + 3], dl
-    dec        ecx
-    jg         convertloop
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBCOLORTABLEROW_X86
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Tranform RGB pixels with color table.
-__declspec(naked)
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
-
-    // 1 pixel loop.
-  convertloop:
-    movzx      edx, byte ptr [eax]
-    lea        eax, [eax + 4]
-    movzx      edx, byte ptr [esi + edx * 4]
-    mov        byte ptr [eax - 4], dl
-    movzx      edx, byte ptr [eax - 4 + 1]
-    movzx      edx, byte ptr [esi + edx * 4 + 1]
-    mov        byte ptr [eax - 4 + 1], dl
-    movzx      edx, byte ptr [eax - 4 + 2]
-    movzx      edx, byte ptr [esi + edx * 4 + 2]
-    mov        byte ptr [eax - 4 + 2], dl
-    dec        ecx
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Tranform RGB pixels with luma table.
-__declspec(naked)
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   /* src_argb */
-    mov        edi, [esp + 8 + 8]   /* dst_argb */
-    mov        ecx, [esp + 8 + 12]  /* width */
-    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
-    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
-    pshufd     xmm2, xmm2, 0
-    pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
-    psllw      xmm4, 8
-    pxor       xmm5, xmm5
-
-    // 4 pixel loop.
-  convertloop:
-    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
-    pmaddubsw  xmm0, xmm3
-    phaddw     xmm0, xmm0
-    pand       xmm0, xmm4  // mask out low bits
-    punpcklwd  xmm0, xmm5
-    paddd      xmm0, xmm2  // add table base
-    movd       esi, xmm0
-    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
-
-    movzx      edx, byte ptr [eax]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi], dl
-    movzx      edx, byte ptr [eax + 1]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 1], dl
-    movzx      edx, byte ptr [eax + 2]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 2], dl
-    movzx      edx, byte ptr [eax + 3]  // copy alpha.
-    mov        byte ptr [edi + 3], dl
-
-    movd       esi, xmm0
-    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
-
-    movzx      edx, byte ptr [eax + 4]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 4], dl
-    movzx      edx, byte ptr [eax + 5]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 5], dl
-    movzx      edx, byte ptr [eax + 6]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 6], dl
-    movzx      edx, byte ptr [eax + 7]  // copy alpha.
-    mov        byte ptr [edi + 7], dl
-
-    movd       esi, xmm0
-    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
-
-    movzx      edx, byte ptr [eax + 8]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 8], dl
-    movzx      edx, byte ptr [eax + 9]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 9], dl
-    movzx      edx, byte ptr [eax + 10]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 10], dl
-    movzx      edx, byte ptr [eax + 11]  // copy alpha.
-    mov        byte ptr [edi + 11], dl
-
-    movd       esi, xmm0
-
-    movzx      edx, byte ptr [eax + 12]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 12], dl
-    movzx      edx, byte ptr [eax + 13]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 13], dl
-    movzx      edx, byte ptr [eax + 14]
-    movzx      edx, byte ptr [esi + edx]
-    mov        byte ptr [edi + 14], dl
-    movzx      edx, byte ptr [eax + 15]  // copy alpha.
-    mov        byte ptr [edi + 15], dl
-
-    lea        eax, [eax + 16]
-    lea        edi, [edi + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-
-#endif  // defined(_M_X64)
-#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale.cc
deleted file mode 100755
index 36e3fe5..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale.cc
+++ /dev/null
@@ -1,1672 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"  // For CopyPlane
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-static __inline int Abs(int v) {
-  return v >= 0 ? v : -v;
-}
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-
-// Scale plane, 1/2
-// This is an optimized version for scaling down a plane to 1/2 of
-// its original size.
-
-static void ScalePlaneDown2(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C :
-      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
-  int row_stride = src_stride << 1;
-  if (!filtering) {
-    src_ptr += src_stride;  // Point to odd rows.
-    src_stride = 0;
-  }
-
-#if defined(HAS_SCALEROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
-        ScaleRowDown2Box_Any_NEON);
-    if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
-          ScaleRowDown2Box_NEON);
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
-        ScaleRowDown2Box_Any_SSSE3);
-    if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
-          ScaleRowDown2Box_SSSE3);
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
-        ScaleRowDown2Box_Any_AVX2);
-    if (IS_ALIGNED(dst_width, 32)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
-          ScaleRowDown2Box_AVX2);
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
-  }
-#endif
-
-  if (filtering == kFilterLinear) {
-    src_stride = 0;
-  }
-  // TODO(fbarchard): Loop through source height to allow odd height.
-  for (y = 0; y < dst_height; ++y) {
-    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += row_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void ScalePlaneDown2_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
-                               enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
-    filtering == kFilterNone ? ScaleRowDown2_16_C :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
-        ScaleRowDown2Box_16_C);
-  int row_stride = src_stride << 1;
-  if (!filtering) {
-    src_ptr += src_stride;  // Point to odd rows.
-    src_stride = 0;
-  }
-
-#if defined(HAS_SCALEROWDOWN2_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
-        ScaleRowDown2_16_NEON;
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-        ScaleRowDown2Box_16_SSE2);
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
-  }
-#endif
-
-  if (filtering == kFilterLinear) {
-    src_stride = 0;
-  }
-  // TODO(fbarchard): Loop through source height to allow odd height.
-  for (y = 0; y < dst_height; ++y) {
-    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += row_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-// Scale plane, 1/4
-// This is an optimized version for scaling down a plane to 1/4 of
-// its original size.
-
-static void ScalePlaneDown4(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
-  int row_stride = src_stride << 2;
-  if (!filtering) {
-    src_ptr += src_stride * 2;  // Point to row 2.
-    src_stride = 0;
-  }
-#if defined(HAS_SCALEROWDOWN4_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN4_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN4_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
-    if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN4_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
-  }
-#endif
-
-  if (filtering == kFilterLinear) {
-    src_stride = 0;
-  }
-  for (y = 0; y < dst_height; ++y) {
-    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += row_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void ScalePlaneDown4_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
-                               enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
-      filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
-  int row_stride = src_stride << 2;
-  if (!filtering) {
-    src_ptr += src_stride * 2;  // Point to row 2.
-    src_stride = 0;
-  }
-#if defined(HAS_SCALEROWDOWN4_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
-        ScaleRowDown4_16_NEON;
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN4_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
-        ScaleRowDown4_16_SSE2;
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
-  }
-#endif
-
-  if (filtering == kFilterLinear) {
-    src_stride = 0;
-  }
-  for (y = 0; y < dst_height; ++y) {
-    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
-    src_ptr += row_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-// Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
-                             enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
-  assert(dst_width % 3 == 0);
-  if (!filtering) {
-    ScaleRowDown34_0 = ScaleRowDown34_C;
-    ScaleRowDown34_1 = ScaleRowDown34_C;
-  } else {
-    ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
-    ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
-  }
-#if defined(HAS_SCALEROWDOWN34_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
-    }
-    if (dst_width % 24 == 0) {
-      if (!filtering) {
-        ScaleRowDown34_0 = ScaleRowDown34_NEON;
-        ScaleRowDown34_1 = ScaleRowDown34_NEON;
-      } else {
-        ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
-        ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
-      }
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
-    }
-    if (dst_width % 24 == 0) {
-      if (!filtering) {
-        ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
-        ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
-      } else {
-        ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
-        ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
-      }
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN34_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
-    }
-  }
-#endif
-
-  for (y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
-    src_ptr += src_stride * 2;
-    dst_ptr += dst_stride;
-  }
-
-  // Remainder 1 or 2 rows with last row vertically unfiltered
-  if ((dst_height % 3) == 2) {
-    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
-  } else if ((dst_height % 3) == 1) {
-    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
-  }
-}
-
-static void ScalePlaneDown34_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
-                                enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
-  assert(dst_width % 3 == 0);
-  if (!filtering) {
-    ScaleRowDown34_0 = ScaleRowDown34_16_C;
-    ScaleRowDown34_1 = ScaleRowDown34_16_C;
-  } else {
-    ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
-    ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
-  }
-#if defined(HAS_SCALEROWDOWN34_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
-    }
-  }
-#endif
-
-  for (y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
-    src_ptr += src_stride * 2;
-    dst_ptr += dst_stride;
-  }
-
-  // Remainder 1 or 2 rows with last row vertically unfiltered
-  if ((dst_height % 3) == 2) {
-    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
-  } else if ((dst_height % 3) == 1) {
-    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
-  }
-}
-
-
-// Scale plane, 3/8
-// This is an optimized version for scaling down a plane to 3/8
-// of its original size.
-//
-// Uses box filter arranges like this
-// aaabbbcc -> abc
-// aaabbbcc    def
-// aaabbbcc    ghi
-// dddeeeff
-// dddeeeff
-// dddeeeff
-// ggghhhii
-// ggghhhii
-// Boxes are 3x3, 2x3, 3x2 and 2x2
-
-static void ScalePlaneDown38(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
-                             enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
-  assert(dst_width % 3 == 0);
-  if (!filtering) {
-    ScaleRowDown38_3 = ScaleRowDown38_C;
-    ScaleRowDown38_2 = ScaleRowDown38_C;
-  } else {
-    ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
-    ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
-  }
-
-#if defined(HAS_SCALEROWDOWN38_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
-    }
-    if (dst_width % 12 == 0) {
-      if (!filtering) {
-        ScaleRowDown38_3 = ScaleRowDown38_NEON;
-        ScaleRowDown38_2 = ScaleRowDown38_NEON;
-      } else {
-        ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
-        ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
-      }
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN38_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
-    }
-    if (dst_width % 12 == 0 && !filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
-    }
-    if (dst_width % 6 == 0 && filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN38_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
-    }
-  }
-#endif
-
-  for (y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 3;
-    dst_ptr += dst_stride;
-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 3;
-    dst_ptr += dst_stride;
-    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 2;
-    dst_ptr += dst_stride;
-  }
-
-  // Remainder 1 or 2 rows with last row vertically unfiltered
-  if ((dst_height % 3) == 2) {
-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 3;
-    dst_ptr += dst_stride;
-    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
-  } else if ((dst_height % 3) == 1) {
-    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
-  }
-}
-
-static void ScalePlaneDown38_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
-                                enum FilterMode filtering) {
-  int y;
-  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
-  assert(dst_width % 3 == 0);
-  if (!filtering) {
-    ScaleRowDown38_3 = ScaleRowDown38_16_C;
-    ScaleRowDown38_2 = ScaleRowDown38_16_C;
-  } else {
-    ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
-    ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
-  }
-#if defined(HAS_SCALEROWDOWN38_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
-    }
-  }
-#endif
-
-  for (y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 3;
-    dst_ptr += dst_stride;
-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 3;
-    dst_ptr += dst_stride;
-    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 2;
-    dst_ptr += dst_stride;
-  }
-
-  // Remainder 1 or 2 rows with last row vertically unfiltered
-  if ((dst_height % 3) == 2) {
-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
-    src_ptr += src_stride * 3;
-    dst_ptr += dst_stride;
-    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
-  } else if ((dst_height % 3) == 1) {
-    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
-  }
-}
-
-#define MIN1(x) ((x) < 1 ? 1 : (x))
-
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
-  uint32 sum = 0u;
-  int x;
-  assert(iboxwidth > 0);
-  for (x = 0; x < iboxwidth; ++x) {
-    sum += src_ptr[x];
-  }
-  return sum;
-}
-
-static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
-  uint32 sum = 0u;
-  int x;
-  assert(iboxwidth > 0);
-  for (x = 0; x < iboxwidth; ++x) {
-    sum += src_ptr[x];
-  }
-  return sum;
-}
-
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
-  int i;
-  int scaletbl[2];
-  int minboxwidth = dx >> 16;
-  int boxwidth;
-  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
-  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
-  for (i = 0; i < dst_width; ++i) {
-    int ix = x >> 16;
-    x += dx;
-    boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth] >> 16;
-  }
-}
-
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
-  int i;
-  int scaletbl[2];
-  int minboxwidth = dx >> 16;
-  int boxwidth;
-  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
-  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
-  for (i = 0; i < dst_width; ++i) {
-    int ix = x >> 16;
-    x += dx;
-    boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth]  >> 16;
-  }
-}
-
-static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
-                            const uint16* src_ptr, uint8* dst_ptr) {
-  int scaleval = 65536 / boxheight;
-  int i;
-  src_ptr += (x >> 16);
-  for (i = 0; i < dst_width; ++i) {
-    *dst_ptr++ = src_ptr[i] * scaleval >> 16;
-  }
-}
-
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
-  int boxwidth = MIN1(dx >> 16);
-  int scaleval = 65536 / (boxwidth * boxheight);
-  int i;
-  x >>= 16;
-  for (i = 0; i < dst_width; ++i) {
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
-    x += boxwidth;
-  }
-}
-
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
-  int boxwidth = MIN1(dx >> 16);
-  int scaleval = 65536 / (boxwidth * boxheight);
-  int i;
-  for (i = 0; i < dst_width; ++i) {
-    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
-    x += boxwidth;
-  }
-}
-
-// Scale plane down to any dimensions, with interpolation.
-// (boxfilter).
-//
-// Same method as SimpleScale, which is fixed point, outputting
-// one pixel of destination using fixed point (16.16) to step
-// through source, sampling a box of pixel with simple
-// averaging.
-static void ScalePlaneBox(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr) {
-  int j, k;
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-  {
-    // Allocate a row buffer of uint16.
-    align_buffer_64(row16, src_width * 2);
-    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C:
-        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
-    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
-        ScaleAddRow_C;
-#if defined(HAS_SCALEADDROW_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2)) {
-      ScaleAddRow = ScaleAddRow_Any_SSE2;
-      if (IS_ALIGNED(src_width, 16)) {
-        ScaleAddRow = ScaleAddRow_SSE2;
-      }
-    }
-#endif
-#if defined(HAS_SCALEADDROW_AVX2)
-    if (TestCpuFlag(kCpuHasAVX2)) {
-      ScaleAddRow = ScaleAddRow_Any_AVX2;
-      if (IS_ALIGNED(src_width, 32)) {
-        ScaleAddRow = ScaleAddRow_AVX2;
-      }
-    }
-#endif
-#if defined(HAS_SCALEADDROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ScaleAddRow = ScaleAddRow_Any_NEON;
-      if (IS_ALIGNED(src_width, 16)) {
-        ScaleAddRow = ScaleAddRow_NEON;
-      }
-    }
-#endif
-
-    for (j = 0; j < dst_height; ++j) {
-      int boxheight;
-      int iy = y >> 16;
-      const uint8* src = src_ptr + iy * src_stride;
-      y += dy;
-      if (y > max_y) {
-        y = max_y;
-      }
-      boxheight = MIN1((y >> 16) - iy);
-      memset(row16, 0, src_width * 2);
-      for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint16 *)(row16), src_width);
-        src += src_stride;
-      }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
-      dst_ptr += dst_stride;
-    }
-    free_aligned_buffer_64(row16);
-  }
-}
-
-static void ScalePlaneBox_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr) {
-  int j, k;
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-  {
-    // Allocate a row buffer of uint32.
-    align_buffer_64(row32, src_width * 4);
-    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint32* src_ptr, uint16* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
-        ScaleAddRow_16_C;
-
-#if defined(HAS_SCALEADDROW_16_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
-      ScaleAddRow = ScaleAddRow_16_SSE2;
-    }
-#endif
-
-    for (j = 0; j < dst_height; ++j) {
-      int boxheight;
-      int iy = y >> 16;
-      const uint16* src = src_ptr + iy * src_stride;
-      y += dy;
-      if (y > max_y) {
-        y = max_y;
-      }
-      boxheight = MIN1((y >> 16) - iy);
-      memset(row32, 0, src_width * 4);
-      for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint32 *)(row32), src_width);
-        src += src_stride;
-      }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
-      dst_ptr += dst_stride;
-    }
-    free_aligned_buffer_64(row32);
-  }
-}
-
-// Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            enum FilterMode filtering) {
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
-  // Allocate a row buffer.
-  align_buffer_64(row, src_width);
-
-  const int max_y = (src_height - 1) << 16;
-  int j;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
-      (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(src_width, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
-    }
-  }
-#endif
-
-
-#if defined(HAS_SCALEFILTERCOLS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleFilterCols = ScaleFilterCols_SSSE3;
-  }
-#endif
-#if defined(HAS_SCALEFILTERCOLS_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
-    ScaleFilterCols = ScaleFilterCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleFilterCols = ScaleFilterCols_NEON;
-    }
-  }
-#endif
-  if (y > max_y) {
-    y = max_y;
-  }
-
-  for (j = 0; j < dst_height; ++j) {
-    int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
-    if (filtering == kFilterLinear) {
-      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
-    } else {
-      int yf = (y >> 8) & 255;
-      InterpolateRow(row, src, src_stride, src_width, yf);
-      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
-    }
-    dst_ptr += dst_stride;
-    y += dy;
-    if (y > max_y) {
-      y = max_y;
-    }
-  }
-  free_aligned_buffer_64(row);
-}
-
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
-                               enum FilterMode filtering) {
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
-  // Allocate a row buffer.
-  align_buffer_64(row, src_width * 2);
-
-  const int max_y = (src_height - 1) << 16;
-  int j;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
-      (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_16_SSE2;
-    if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_16_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_16_SSSE3;
-    if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_16_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_16_AVX2;
-    if (IS_ALIGNED(src_width, 32)) {
-      InterpolateRow = InterpolateRow_16_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_16_NEON;
-    if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_16_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
-
-
-#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
-  }
-#endif
-  if (y > max_y) {
-    y = max_y;
-  }
-
-  for (j = 0; j < dst_height; ++j) {
-    int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
-    if (filtering == kFilterLinear) {
-      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
-    } else {
-      int yf = (y >> 8) & 255;
-      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
-      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
-    }
-    dst_ptr += dst_stride;
-    y += dy;
-    if (y > max_y) {
-      y = max_y;
-    }
-  }
-  free_aligned_buffer_64(row);
-}
-
-// Scale up down with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr,
-                          enum FilterMode filtering) {
-  int j;
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
-      filtering ? ScaleFilterCols_C : ScaleCols_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(dst_width, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
-    }
-  }
-#endif
-
-  if (filtering && src_width >= 32768) {
-    ScaleFilterCols = ScaleFilterCols64_C;
-  }
-#if defined(HAS_SCALEFILTERCOLS_SSSE3)
-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleFilterCols = ScaleFilterCols_SSSE3;
-  }
-#endif
-#if defined(HAS_SCALEFILTERCOLS_NEON)
-  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
-    ScaleFilterCols = ScaleFilterCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleFilterCols = ScaleFilterCols_NEON;
-    }
-  }
-#endif
-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
-    ScaleFilterCols = ScaleColsUp2_C;
-#if defined(HAS_SCALECOLS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleFilterCols = ScaleColsUp2_SSE2;
-    }
-#endif
-  }
-
-  if (y > max_y) {
-    y = max_y;
-  }
-  {
-    int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
-
-    // Allocate 2 row buffers.
-    const int kRowSize = (dst_width + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-
-    uint8* rowptr = row;
-    int rowstride = kRowSize;
-    int lasty = yi;
-
-    ScaleFilterCols(rowptr, src, dst_width, x, dx);
-    if (src_height > 1) {
-      src += src_stride;
-    }
-    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
-    src += src_stride;
-
-    for (j = 0; j < dst_height; ++j) {
-      yi = y >> 16;
-      if (yi != lasty) {
-        if (y > max_y) {
-          y = max_y;
-          yi = y >> 16;
-          src = src_ptr + yi * src_stride;
-        }
-        if (yi != lasty) {
-          ScaleFilterCols(rowptr, src, dst_width, x, dx);
-          rowptr += rowstride;
-          rowstride = -rowstride;
-          lasty = yi;
-          src += src_stride;
-        }
-      }
-      if (filtering == kFilterLinear) {
-        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
-      } else {
-        int yf = (y >> 8) & 255;
-        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
-      }
-      dst_ptr += dst_stride;
-      y += dy;
-    }
-    free_aligned_buffer_64(row);
-  }
-}
-
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr,
-                             enum FilterMode filtering) {
-  int j;
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
-      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_16_SSE2;
-    if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_16_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_16_SSSE3;
-    if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_16_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_16_AVX2;
-    if (IS_ALIGNED(dst_width, 32)) {
-      InterpolateRow = InterpolateRow_16_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_16_NEON;
-    if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_16_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
-
-  if (filtering && src_width >= 32768) {
-    ScaleFilterCols = ScaleFilterCols64_16_C;
-  }
-#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
-  }
-#endif
-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
-    ScaleFilterCols = ScaleColsUp2_16_C;
-#if defined(HAS_SCALECOLS_16_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleFilterCols = ScaleColsUp2_16_SSE2;
-    }
-#endif
-  }
-
-  if (y > max_y) {
-    y = max_y;
-  }
-  {
-    int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
-
-    // Allocate 2 row buffers.
-    const int kRowSize = (dst_width + 31) & ~31;
-    align_buffer_64(row, kRowSize * 4);
-
-    uint16* rowptr = (uint16*)row;
-    int rowstride = kRowSize;
-    int lasty = yi;
-
-    ScaleFilterCols(rowptr, src, dst_width, x, dx);
-    if (src_height > 1) {
-      src += src_stride;
-    }
-    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
-    src += src_stride;
-
-    for (j = 0; j < dst_height; ++j) {
-      yi = y >> 16;
-      if (yi != lasty) {
-        if (y > max_y) {
-          y = max_y;
-          yi = y >> 16;
-          src = src_ptr + yi * src_stride;
-        }
-        if (yi != lasty) {
-          ScaleFilterCols(rowptr, src, dst_width, x, dx);
-          rowptr += rowstride;
-          rowstride = -rowstride;
-          lasty = yi;
-          src += src_stride;
-        }
-      }
-      if (filtering == kFilterLinear) {
-        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
-      } else {
-        int yf = (y >> 8) & 255;
-        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
-      }
-      dst_ptr += dst_stride;
-      y += dy;
-    }
-    free_aligned_buffer_64(row);
-  }
-}
-
-// Scale Plane to/from any dimensions, without interpolation.
-// Fixed point math is used for performance: The upper 16 bits
-// of x and dx is the integer part of the source position and
-// the lower 16 bits are the fixed decimal part.
-
-static void ScalePlaneSimple(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
-  int i;
-  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_C;
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-
-  if (src_width * 2 == dst_width && x < 0x8000) {
-    ScaleCols = ScaleColsUp2_C;
-#if defined(HAS_SCALECOLS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleCols = ScaleColsUp2_SSE2;
-    }
-#endif
-  }
-
-  for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
-    dst_ptr += dst_stride;
-    y += dy;
-  }
-}
-
-static void ScalePlaneSimple_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr) {
-  int i;
-  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_16_C;
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-
-  if (src_width * 2 == dst_width && x < 0x8000) {
-    ScaleCols = ScaleColsUp2_16_C;
-#if defined(HAS_SCALECOLS_16_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleCols = ScaleColsUp2_16_SSE2;
-    }
-#endif
-  }
-
-  for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
-              dst_width, x, dx);
-    dst_ptr += dst_stride;
-    y += dy;
-  }
-}
-
-// Scale a plane.
-// This function dispatches to a specialized scaler based on scale factor.
-
-LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
-                enum FilterMode filtering) {
-  // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
-
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    src_height = -src_height;
-    src = src + (src_height - 1) * src_stride;
-    src_stride = -src_stride;
-  }
-
-  // Use specialized scales to improve performance for common resolutions.
-  // For example, all the 1/2 scalings will use ScalePlaneDown2()
-  if (dst_width == src_width && dst_height == src_height) {
-    // Straight copy.
-    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
-    return;
-  }
-  if (dst_width == src_width && filtering != kFilterBox) {
-    int dy = FixedDiv(src_height, dst_height);
-    // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       dst_width, dst_height,
-                       src_stride, dst_stride, src, dst,
-                       0, 0, dy, 1, filtering);
-    return;
-  }
-  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
-    // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
-      // optimized, 3/4
-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
-      // optimized, 1/2
-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
-      // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-        (filtering == kFilterBox || filtering == kFilterNone)) {
-      // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-  }
-  if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src, dst);
-    return;
-  }
-  if (filtering && dst_height > src_height) {
-    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
-                         src_stride, dst_stride, src, dst, filtering);
-    return;
-  }
-  if (filtering) {
-    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
-                           src_stride, dst_stride, src, dst, filtering);
-    return;
-  }
-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst);
-}
-
-LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                  int src_width, int src_height,
-                  uint16* dst, int dst_stride,
-                  int dst_width, int dst_height,
-                  enum FilterMode filtering) {
-  // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
-
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    src_height = -src_height;
-    src = src + (src_height - 1) * src_stride;
-    src_stride = -src_stride;
-  }
-
-  // Use specialized scales to improve performance for common resolutions.
-  // For example, all the 1/2 scalings will use ScalePlaneDown2()
-  if (dst_width == src_width && dst_height == src_height) {
-    // Straight copy.
-    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
-    return;
-  }
-  if (dst_width == src_width) {
-    int dy = FixedDiv(src_height, dst_height);
-    // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical_16(src_height,
-                          dst_width, dst_height,
-                          src_stride, dst_stride, src, dst,
-                          0, 0, dy, 1, filtering);
-    return;
-  }
-  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
-    // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
-      // optimized, 3/4
-      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
-                          src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
-      // optimized, 1/2
-      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
-                         src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
-      // optimized, 3/8
-      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
-                          src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
-      // optimized, 1/4
-      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
-                         src_stride, dst_stride, src, dst, filtering);
-      return;
-    }
-  }
-  if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst);
-    return;
-  }
-  if (filtering && dst_height > src_height) {
-    ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
-                            src_stride, dst_stride, src, dst, filtering);
-    return;
-  }
-  if (filtering) {
-    ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
-                              src_stride, dst_stride, src, dst, filtering);
-    return;
-  }
-  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst);
-}
-
-// Scale an I420 image.
-// This function in turn calls a scaling function for each plane.
-
-LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
-              enum FilterMode filtering) {
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
-  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
-    return -1;
-  }
-
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering);
-  return 0;
-}
-
-LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
-                 enum FilterMode filtering) {
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
-  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
-    return -1;
-  }
-
-  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
-                dst_y, dst_stride_y, dst_width, dst_height,
-                filtering);
-  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
-                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-                filtering);
-  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
-                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-                filtering);
-  return 0;
-}
-
-// Deprecated api
-LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
-          LIBYUV_BOOL interpolate) {
-  return I420Scale(src_y, src_stride_y,
-                   src_u, src_stride_u,
-                   src_v, src_stride_v,
-                   src_width, src_height,
-                   dst_y, dst_stride_y,
-                   dst_u, dst_stride_u,
-                   dst_v, dst_stride_v,
-                   dst_width, dst_height,
-                   interpolate ? kFilterBox : kFilterNone);
-}
-
-// Deprecated api
-LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate) {
-  // Chroma requires offset to multiple of 2.
-  int dst_yoffset_even = dst_yoffset & ~1;
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
-  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
-  const uint8* src_y = src;
-  const uint8* src_u = src + src_width * src_height;
-  const uint8* src_v = src + src_width * src_height +
-                             src_halfwidth * src_halfheight;
-  uint8* dst_y = dst + dst_yoffset_even * dst_width;
-  uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
-      dst_yoffset_even >= dst_height) {
-    return -1;
-  }
-  return I420Scale(src_y, src_width,
-                   src_u, src_halfwidth,
-                   src_v, src_halfwidth,
-                   src_width, src_height,
-                   dst_y, dst_width,
-                   dst_u, dst_halfwidth,
-                   dst_v, dst_halfwidth,
-                   dst_width, aheight,
-                   interpolate ? kFilterBox : kFilterNone);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_any.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_any.cc
deleted file mode 100755
index ed76a9e..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_any.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-#include "libyuv/scale_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 int dst_width, int x, int dx) {                               \
-      int n = dst_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
-      }                                                                        \
-      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
-             dst_width & MASK, x + n * dx, dx);                                \
-    }
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
-     ScaleARGBFilterCols_C, 4, 3)
-#endif
-#undef CANY
-
-// Fixed scale down.
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
-
-// Fixed scale down for odd source width.  Used by I420Blend subsampling.
-// Since dst_width is (width + 1) / 2, this function scales one less pixel
-// and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
-
-#ifdef HAS_SCALEROWDOWN2_SSSE3
-SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
-      2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
-#endif
-#ifdef HAS_SCALEROWDOWN2_AVX2
-SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
-      ScaleRowDown2Linear_C, 2, 1, 31)
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
-      2, 1, 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
-      2, 1, 31)
-#endif
-#ifdef HAS_SCALEROWDOWN2_NEON
-SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_C, 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
-#endif
-#ifdef HAS_SCALEROWDOWN4_SSSE3
-SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
-      4, 1, 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_AVX2
-SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
-      4, 1, 15)
-#endif
-#ifdef HAS_SCALEROWDOWN4_NEON
-SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
-      4, 1, 7)
-#endif
-#ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
-#endif
-#ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
-#endif
-#ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
-#endif
-#ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
-#endif
-
-#ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
-      ScaleARGBRowDown2_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
-      ScaleARGBRowDown2Linear_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
-      ScaleARGBRowDown2Box_C, 2, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
-      ScaleARGBRowDown2_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
-      ScaleARGBRowDown2Linear_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
-      ScaleARGBRowDown2Box_C, 2, 4, 7)
-#endif
-#undef SDANY
-
-// Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
-                     src_stepx, dst_ptr + n * BPP, r);                         \
-    }
-
-#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
-#endif
-
-// Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
-  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
-      int n = src_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
-      }                                                                        \
-      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
-    }
-
-#ifdef HAS_SCALEADDROW_SSE2
-SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_AVX2
-SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
-#endif
-#ifdef HAS_SCALEADDROW_NEON
-SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
-#endif
-#undef SAANY
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-
-
-
-
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_argb.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_argb.cc
deleted file mode 100755
index 17f51ae..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_argb.cc
+++ /dev/null
@@ -1,859 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"  // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-static __inline int Abs(int v) {
-  return v >= 0 ? v : -v;
-}
-
-// ScaleARGB ARGB, 1/2
-// This is an optimized version for scaling down a ARGB to 1/2 of
-// its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_argb, uint8* dst_argb,
-                           int x, int dx, int y, int dy,
-                           enum FilterMode filtering) {
-  int j;
-  int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) =
-    filtering == kFilterNone ? ScaleARGBRowDown2_C :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
-        ScaleARGBRowDown2Box_C);
-  assert(dx == 65536 * 2);  // Test scale factor of 2.
-  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
-  // Advance to odd row, even column.
-  if (filtering == kFilterBilinear) {
-    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  } else {
-    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
-  }
-
-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
-        ScaleARGBRowDown2Box_Any_SSE2);
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-          ScaleARGBRowDown2Box_SSE2);
-    }
-  }
-#endif
-#if defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
-        ScaleARGBRowDown2Box_Any_NEON);
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
-          ScaleARGBRowDown2Box_NEON);
-    }
-  }
-#endif
-
-  if (filtering == kFilterLinear) {
-    src_stride = 0;
-  }
-  for (j = 0; j < dst_height; ++j) {
-    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
-    src_argb += row_stride;
-    dst_argb += dst_stride;
-  }
-}
-
-// ScaleARGB ARGB, 1/4
-// This is an optimized version for scaling down a ARGB to 1/4 of
-// its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy) {
-  int j;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
-  align_buffer_64(row, kRowSize * 2);
-  int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
-  // Advance to odd row, even column.
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  assert(dx == 65536 * 4);  // Test scale factor of 4.
-  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
-    }
-  }
-#endif
-
-  for (j = 0; j < dst_height; ++j) {
-    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
-    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
-                      row + kRowSize, dst_width * 2);
-    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
-    src_argb += row_stride;
-    dst_argb += dst_stride;
-  }
-  free_aligned_buffer_64(row);
-}
-
-// ScaleARGB ARGB Even
-// This is an optimized version for scaling down a ARGB to even
-// multiple of its original size.
-static void ScaleARGBDownEven(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy,
-                              enum FilterMode filtering) {
-  int j;
-  int col_step = dx >> 16;
-  int row_stride = (dy >> 16) * src_stride;
-  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_step, uint8* dst_argb, int dst_width) =
-      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
-  assert(IS_ALIGNED(src_width, 2));
-  assert(IS_ALIGNED(src_height, 2));
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
-        ScaleARGBRowDownEven_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-          ScaleARGBRowDownEven_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
-        ScaleARGBRowDownEven_Any_NEON;
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-          ScaleARGBRowDownEven_NEON;
-    }
-  }
-#endif
-
-  if (filtering == kFilterLinear) {
-    src_stride = 0;
-  }
-  for (j = 0; j < dst_height; ++j) {
-    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
-    src_argb += row_stride;
-    dst_argb += dst_stride;
-  }
-}
-
-// Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  int src_stride, int dst_stride,
-                                  const uint8* src_argb, uint8* dst_argb,
-                                  int x, int dx, int y, int dy,
-                                  enum FilterMode filtering) {
-  int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
-      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
-  int64 xlast = x + (int64)(dst_width - 1) * dx;
-  int64 xl = (dx >= 0) ? x : xlast;
-  int64 xr = (dx >= 0) ? xlast : x;
-  int clip_src_width;
-  xl = (xl >> 16) & ~3;  // Left edge aligned.
-  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
-  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
-  if (xr > src_width) {
-    xr = src_width;
-  }
-  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
-  src_argb += xl * 4;
-  x -= (int)(xl << 16);
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(clip_src_width, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
-    }
-  }
-#endif
-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
-  }
-#endif
-#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
-    }
-  }
-#endif
-  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
-  // Allocate a row of ARGB.
-  {
-    align_buffer_64(row, clip_src_width * 4);
-
-    const int max_y = (src_height - 1) << 16;
-    if (y > max_y) {
-      y = max_y;
-    }
-    for (j = 0; j < dst_height; ++j) {
-      int yi = y >> 16;
-      const uint8* src = src_argb + yi * src_stride;
-      if (filtering == kFilterLinear) {
-        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
-      } else {
-        int yf = (y >> 8) & 255;
-        InterpolateRow(row, src, src_stride, clip_src_width, yf);
-        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
-      }
-      dst_argb += dst_stride;
-      y += dy;
-      if (y > max_y) {
-        y = max_y;
-      }
-    }
-    free_aligned_buffer_64(row);
-  }
-}
-
-// Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint8* src_argb, uint8* dst_argb,
-                                int x, int dx, int y, int dy,
-                                enum FilterMode filtering) {
-  int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
-      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
-  const int max_y = (src_height - 1) << 16;
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(dst_width, 8)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
-  }
-#endif
-  if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
-  }
-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
-  }
-#endif
-#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
-  if (filtering && TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SCALEARGBCOLS_SSE2)
-  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
-    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
-  }
-#endif
-#if defined(HAS_SCALEARGBCOLS_NEON)
-  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBFilterCols = ScaleARGBCols_NEON;
-    }
-  }
-#endif
-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
-    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
-    }
-#endif
-  }
-
-  if (y > max_y) {
-    y = max_y;
-  }
-
-  {
-    int yi = y >> 16;
-    const uint8* src = src_argb + yi * src_stride;
-
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (dst_width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
-
-    uint8* rowptr = row;
-    int rowstride = kRowSize;
-    int lasty = yi;
-
-    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
-    if (src_height > 1) {
-      src += src_stride;
-    }
-    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
-    src += src_stride;
-
-    for (j = 0; j < dst_height; ++j) {
-      yi = y >> 16;
-      if (yi != lasty) {
-        if (y > max_y) {
-          y = max_y;
-          yi = y >> 16;
-          src = src_argb + yi * src_stride;
-        }
-        if (yi != lasty) {
-          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
-          rowptr += rowstride;
-          rowstride = -rowstride;
-          lasty = yi;
-          src += src_stride;
-        }
-      }
-      if (filtering == kFilterLinear) {
-        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
-      } else {
-        int yf = (y >> 8) & 255;
-        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
-      }
-      dst_argb += dst_stride;
-      y += dy;
-    }
-    free_aligned_buffer_64(row);
-  }
-}
-
-#ifdef YUVSCALEUP
-// Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
-                                     int dst_width, int dst_height,
-                                     int src_stride_y,
-                                     int src_stride_u,
-                                     int src_stride_v,
-                                     int dst_stride_argb,
-                                     const uint8* src_y,
-                                     const uint8* src_u,
-                                     const uint8* src_v,
-                                     uint8* dst_argb,
-                                     int x, int dx, int y, int dy,
-                                     enum FilterMode filtering) {
-  int j;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(src_width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(src_width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(src_width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
-  }
-#endif
-
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(dst_width, 8)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
-  }
-#endif
-
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
-      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
-  if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
-  }
-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
-    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
-  }
-#endif
-#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
-  if (filtering && TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SCALEARGBCOLS_SSE2)
-  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
-    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
-  }
-#endif
-#if defined(HAS_SCALEARGBCOLS_NEON)
-  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBFilterCols = ScaleARGBCols_NEON;
-    }
-  }
-#endif
-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
-    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
-    }
-#endif
-  }
-
-  const int max_y = (src_height - 1) << 16;
-  if (y > max_y) {
-    y = max_y;
-  }
-  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
-  int yi = y >> 16;
-  int uv_yi = yi >> kYShift;
-  const uint8* src_row_y = src_y + yi * src_stride_y;
-  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
-  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
-
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 4 + 31) & ~31;
-  align_buffer_64(row, kRowSize * 2);
-
-  // Allocate 1 row of ARGB for source conversion.
-  align_buffer_64(argb_row, src_width * 4);
-
-  uint8* rowptr = row;
-  int rowstride = kRowSize;
-  int lasty = yi;
-
-  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
-  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
-  if (src_height > 1) {
-    src_row_y += src_stride_y;
-    if (yi & 1) {
-      src_row_u += src_stride_u;
-      src_row_v += src_stride_v;
-    }
-  }
-  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
-  if (src_height > 2) {
-    src_row_y += src_stride_y;
-    if (!(yi & 1)) {
-      src_row_u += src_stride_u;
-      src_row_v += src_stride_v;
-    }
-  }
-
-  for (j = 0; j < dst_height; ++j) {
-    yi = y >> 16;
-    if (yi != lasty) {
-      if (y > max_y) {
-        y = max_y;
-        yi = y >> 16;
-        uv_yi = yi >> kYShift;
-        src_row_y = src_y + yi * src_stride_y;
-        src_row_u = src_u + uv_yi * src_stride_u;
-        src_row_v = src_v + uv_yi * src_stride_v;
-      }
-      if (yi != lasty) {
-        // TODO(fbarchard): Convert the clipped region of row.
-        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
-        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
-        rowptr += rowstride;
-        rowstride = -rowstride;
-        lasty = yi;
-        src_row_y += src_stride_y;
-        if (yi & 1) {
-          src_row_u += src_stride_u;
-          src_row_v += src_stride_v;
-        }
-      }
-    }
-    if (filtering == kFilterLinear) {
-      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
-    } else {
-      int yf = (y >> 8) & 255;
-      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
-    }
-    dst_argb += dst_stride_argb;
-    y += dy;
-  }
-  free_aligned_buffer_64(row);
-  free_aligned_buffer_64(row_argb);
-}
-#endif
-
-// Scale ARGB to/from any dimensions, without interpolation.
-// Fixed point math is used for performance: The upper 16 bits
-// of x and dx is the integer part of the source position and
-// the lower 16 bits are the fixed decimal part.
-
-static void ScaleARGBSimple(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_argb, uint8* dst_argb,
-                            int x, int dx, int y, int dy) {
-  int j;
-  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
-      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
-#if defined(HAS_SCALEARGBCOLS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
-    ScaleARGBCols = ScaleARGBCols_SSE2;
-  }
-#endif
-#if defined(HAS_SCALEARGBCOLS_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBCols = ScaleARGBCols_Any_NEON;
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBCols = ScaleARGBCols_NEON;
-    }
-  }
-#endif
-  if (src_width * 2 == dst_width && x < 0x8000) {
-    ScaleARGBCols = ScaleARGBColsUp2_C;
-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
-    }
-#endif
-  }
-
-  for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
-                  dst_width, x, dx);
-    dst_argb += dst_stride;
-    y += dy;
-  }
-}
-
-// ScaleARGB a ARGB.
-// This function in turn calls a scaling function
-// suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
-                      int src_width, int src_height,
-                      uint8* dst, int dst_stride,
-                      int dst_width, int dst_height,
-                      int clip_x, int clip_y, int clip_width, int clip_height,
-                      enum FilterMode filtering) {
-  // Initial source x/y coordinate and step values as 16.16 fixed point.
-  int x = 0;
-  int y = 0;
-  int dx = 0;
-  int dy = 0;
-  // ARGB does not support box filter yet, but allow the user to pass it.
-  // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
-                                filtering);
-
-  // Negative src_height means invert the image.
-  if (src_height < 0) {
-    src_height = -src_height;
-    src = src + (src_height - 1) * src_stride;
-    src_stride = -src_stride;
-  }
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
-  src_width = Abs(src_width);
-  if (clip_x) {
-    int64 clipf = (int64)(clip_x) * dx;
-    x += (clipf & 0xffff);
-    src += (clipf >> 16) * 4;
-    dst += clip_x * 4;
-  }
-  if (clip_y) {
-    int64 clipf = (int64)(clip_y) * dy;
-    y += (clipf & 0xffff);
-    src += (clipf >> 16) * src_stride;
-    dst += clip_y * dst_stride;
-  }
-
-  // Special case for integer step values.
-  if (((dx | dy) & 0xffff) == 0) {
-    if (!dx || !dy) {  // 1 pixel wide and/or tall.
-      filtering = kFilterNone;
-    } else {
-      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
-      if (!(dx & 0x10000) && !(dy & 0x10000)) {
-        if (dx == 0x20000) {
-          // Optimized 1/2 downsample.
-          ScaleARGBDown2(src_width, src_height,
-                         clip_width, clip_height,
-                         src_stride, dst_stride, src, dst,
-                         x, dx, y, dy, filtering);
-          return;
-        }
-        if (dx == 0x40000 && filtering == kFilterBox) {
-          // Optimized 1/4 box downsample.
-          ScaleARGBDown4Box(src_width, src_height,
-                            clip_width, clip_height,
-                            src_stride, dst_stride, src, dst,
-                            x, dx, y, dy);
-          return;
-        }
-        ScaleARGBDownEven(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
-        return;
-      }
-      // Optimized odd scale down. ie 3, 5, 7, 9x.
-      if ((dx & 0x10000) && (dy & 0x10000)) {
-        filtering = kFilterNone;
-        if (dx == 0x10000 && dy == 0x10000) {
-          // Straight copy.
-          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
-                   dst, dst_stride, clip_width, clip_height);
-          return;
-        }
-      }
-    }
-  }
-  if (dx == 0x10000 && (x & 0xffff) == 0) {
-    // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical(src_height,
-                       clip_width, clip_height,
-                       src_stride, dst_stride, src, dst,
-                       x, y, dy, 4, filtering);
-    return;
-  }
-  if (filtering && dy < 65536) {
-    ScaleARGBBilinearUp(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy, filtering);
-    return;
-  }
-  if (filtering) {
-    ScaleARGBBilinearDown(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
-    return;
-  }
-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                  src_stride, dst_stride, src, dst,
-                  x, dx, y, dy);
-}
-
-LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
-                  enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
-      clip_x < 0 || clip_y < 0 ||
-      clip_width > 32768 || clip_height > 32768 ||
-      (clip_x + clip_width) > dst_width ||
-      (clip_y + clip_height) > dst_height) {
-    return -1;
-  }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            clip_x, clip_y, clip_width, clip_height, filtering);
-  return 0;
-}
-
-// Scale an ARGB image.
-LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
-              enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0) {
-    return -1;
-  }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            0, 0, dst_width, dst_height, filtering);
-  return 0;
-}
-
-// Scale with YUV conversion to ARGB and clipping.
-LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
-                       enum FilterMode filtering) {
-  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
-  int r;
-  I420ToARGB(src_y, src_stride_y,
-             src_u, src_stride_u,
-             src_v, src_stride_v,
-             argb_buffer, src_width * 4,
-             src_width, src_height);
-
-  r = ARGBScaleClip(argb_buffer, src_width * 4,
-                    src_width, src_height,
-                    dst_argb, dst_stride_argb,
-                    dst_width, dst_height,
-                    clip_x, clip_y, clip_width, clip_height,
-                    filtering);
-  free(argb_buffer);
-  return r;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_common.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_common.cc
deleted file mode 100755
index d3992df..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_common.cc
+++ /dev/null
@@ -1,1151 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"  // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-static __inline int Abs(int v) {
-  return v >= 0 ? v : -v;
-}
-
-// CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src_ptr[1];
-    dst[1] = src_ptr[3];
-    dst += 2;
-    src_ptr += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = src_ptr[1];
-  }
-}
-
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src_ptr[1];
-    dst[1] = src_ptr[3];
-    dst += 2;
-    src_ptr += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = src_ptr[1];
-  }
-}
-
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (s[0] + s[1] + 1) >> 1;
-    dst[1] = (s[2] + s[3] + 1) >> 1;
-    dst += 2;
-    s += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = (s[0] + s[1] + 1) >> 1;
-  }
-}
-
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (s[0] + s[1] + 1) >> 1;
-    dst[1] = (s[2] + s[3] + 1) >> 1;
-    dst += 2;
-    s += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = (s[0] + s[1] + 1) >> 1;
-  }
-}
-
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
-    dst += 2;
-    s += 4;
-    t += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-  }
-}
-
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  int x;
-  dst_width -= 1;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
-    dst += 2;
-    s += 4;
-    t += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-    dst += 1;
-    s += 2;
-    t += 2;
-  }
-  dst[0] = (s[0] + t[0] + 1) >> 1;
-}
-
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
-    dst += 2;
-    s += 4;
-    t += 4;
-  }
-  if (dst_width & 1) {
-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
-  }
-}
-
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src_ptr[2];
-    dst[1] = src_ptr[6];
-    dst += 2;
-    src_ptr += 8;
-  }
-  if (dst_width & 1) {
-    dst[0] = src_ptr[2];
-  }
-}
-
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src_ptr[2];
-    dst[1] = src_ptr[6];
-    dst += 2;
-    src_ptr += 8;
-  }
-  if (dst_width & 1) {
-    dst[0] = src_ptr[2];
-  }
-}
-
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  intptr_t stride = src_stride;
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
-    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
-    dst += 2;
-    src_ptr += 8;
-  }
-  if (dst_width & 1) {
-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
-  }
-}
-
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
-  intptr_t stride = src_stride;
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
-    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
-    dst += 2;
-    src_ptr += 8;
-  }
-  if (dst_width & 1) {
-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
-  }
-}
-
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
-  int x;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (x = 0; x < dst_width; x += 3) {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[1];
-    dst[2] = src_ptr[3];
-    dst += 3;
-    src_ptr += 4;
-  }
-}
-
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
-  int x;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (x = 0; x < dst_width; x += 3) {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[1];
-    dst[2] = src_ptr[3];
-    dst += 3;
-    src_ptr += 4;
-  }
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  int x;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 * 3 + b0 + 2) >> 2;
-    d[1] = (a1 * 3 + b1 + 2) >> 2;
-    d[2] = (a2 * 3 + b2 + 2) >> 2;
-    d += 3;
-    s += 4;
-    t += 4;
-  }
-}
-
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
-  int x;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 * 3 + b0 + 2) >> 2;
-    d[1] = (a1 * 3 + b1 + 2) >> 2;
-    d[2] = (a2 * 3 + b2 + 2) >> 2;
-    d += 3;
-    s += 4;
-    t += 4;
-  }
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
-  int x;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 + b0 + 1) >> 1;
-    d[1] = (a1 + b1 + 1) >> 1;
-    d[2] = (a2 + b2 + 1) >> 1;
-    d += 3;
-    s += 4;
-    t += 4;
-  }
-}
-
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
-  int x;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 + b0 + 1) >> 1;
-    d[1] = (a1 + b1 + 1) >> 1;
-    d[2] = (a2 + b2 + 1) >> 1;
-    d += 3;
-    s += 4;
-    t += 4;
-  }
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx) {
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst_ptr[0] = src_ptr[x >> 16];
-    x += dx;
-    dst_ptr[1] = src_ptr[x >> 16];
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    dst_ptr[0] = src_ptr[x >> 16];
-  }
-}
-
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx) {
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst_ptr[0] = src_ptr[x >> 16];
-    x += dx;
-    dst_ptr[1] = src_ptr[x >> 16];
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    dst_ptr[0] = src_ptr[x >> 16];
-  }
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int x, int dx) {
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
-    src_ptr += 1;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    dst_ptr[0] = src_ptr[0];
-  }
-}
-
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
-    src_ptr += 1;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    dst_ptr[0] = src_ptr[0];
-  }
-}
-
-// (1-f)a + fb can be replaced with a + f(b-a)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
-
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    int xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    xi = x >> 16;
-    a = src_ptr[xi];
-    b = src_ptr[xi + 1];
-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    int xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-  }
-}
-
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    xi = x >> 16;
-    a = src_ptr[xi];
-    b = src_ptr[xi + 1];
-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    int64 xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-  }
-}
-#undef BLENDER
-
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
-
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    int xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    xi = x >> 16;
-    a = src_ptr[xi];
-    b = src_ptr[xi + 1];
-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    int xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-  }
-}
-
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    xi = x >> 16;
-    a = src_ptr[xi];
-    b = src_ptr[xi + 1];
-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
-    x += dx;
-    dst_ptr += 2;
-  }
-  if (dst_width & 1) {
-    int64 xi = x >> 16;
-    int a = src_ptr[xi];
-    int b = src_ptr[xi + 1];
-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
-  }
-}
-#undef BLENDER
-
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
-  int x;
-  assert(dst_width % 3 == 0);
-  for (x = 0; x < dst_width; x += 3) {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[3];
-    dst[2] = src_ptr[6];
-    dst += 3;
-    src_ptr += 8;
-  }
-}
-
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
-  int x;
-  assert(dst_width % 3 == 0);
-  for (x = 0; x < dst_width; x += 3) {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[3];
-    dst[2] = src_ptr[6];
-    dst += 3;
-    src_ptr += 8;
-  }
-}
-
-// 8x3 -> 3x1
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
-                            ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  int i;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
-  }
-}
-
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  int i;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
-  }
-}
-
-// 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  int i;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
-  }
-}
-
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  int i;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
-  }
-}
-
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  int x;
-  assert(src_width > 0);
-  for (x = 0; x < src_width - 1; x += 2) {
-    dst_ptr[0] += src_ptr[0];
-    dst_ptr[1] += src_ptr[1];
-    src_ptr += 2;
-    dst_ptr += 2;
-  }
-  if (src_width & 1) {
-    dst_ptr[0] += src_ptr[0];
-  }
-}
-
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
-  int x;
-  assert(src_width > 0);
-  for (x = 0; x < src_width - 1; x += 2) {
-    dst_ptr[0] += src_ptr[0];
-    dst_ptr[1] += src_ptr[1];
-    src_ptr += 2;
-    dst_ptr += 2;
-  }
-  if (src_width & 1) {
-    dst_ptr[0] += src_ptr[0];
-  }
-}
-
-void ScaleARGBRowDown2_C(const uint8* src_argb,
-                         ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src[1];
-    dst[1] = src[3];
-    src += 4;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    dst[0] = src[1];
-  }
-}
-
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
-    dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
-    dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
-    dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
-    src_argb += 8;
-    dst_argb += 4;
-  }
-}
-
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
-    src_argb += 8;
-    dst_argb += 4;
-  }
-}
-
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            int src_stepx,
-                            uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src[0];
-    dst[1] = src[src_stepx];
-    src += src_stepx * 2;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    dst[0] = src[0];
-  }
-}
-
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
-    src_argb += src_stepx * 4;
-    dst_argb += 4;
-  }
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst[0] = src[x >> 16];
-    x += dx;
-    dst[1] = src[x >> 16];
-    x += dx;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    dst[0] = src[x >> 16];
-  }
-}
-
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst[0] = src[x >> 16];
-    x += dx;
-    dst[1] = src[x >> 16];
-    x += dx;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    dst[0] = src[x >> 16];
-  }
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    dst[1] = dst[0] = src[0];
-    src += 1;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    dst[0] = src[0];
-  }
-}
-
-// Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
-
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    int xi = x >> 16;
-    int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
-    dst[0] = BLENDER(a, b, xf);
-    x += dx;
-    xi = x >> 16;
-    xf = (x >> 9) & 0x7f;
-    a = src[xi];
-    b = src[xi + 1];
-    dst[1] = BLENDER(a, b, xf);
-    x += dx;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    int xi = x >> 16;
-    int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
-    dst[0] = BLENDER(a, b, xf);
-  }
-}
-
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-  int j;
-  for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
-    int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
-    dst[0] = BLENDER(a, b, xf);
-    x += dx;
-    xi = x >> 16;
-    xf = (x >> 9) & 0x7f;
-    a = src[xi];
-    b = src[xi + 1];
-    dst[1] = BLENDER(a, b, xf);
-    x += dx;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    int64 xi = x >> 16;
-    int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
-    dst[0] = BLENDER(a, b, xf);
-  }
-}
-#undef BLENDER1
-#undef BLENDERC
-#undef BLENDER
-
-// Scale plane vertically with bilinear interpolation.
-void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering) {
-  // TODO(fbarchard): Allow higher bpp.
-  int dst_width_bytes = dst_width * bpp;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
-  int j;
-  assert(bpp >= 1 && bpp <= 4);
-  assert(src_height != 0);
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  src_argb += (x >> 16) * bpp;
-#if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(dst_width_bytes, 32)) {
-      InterpolateRow = InterpolateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
-    }
-  }
-#endif
-  for (j = 0; j < dst_height; ++j) {
-    int yi;
-    int yf;
-    if (y > max_y) {
-      y = max_y;
-    }
-    yi = y >> 16;
-    yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_bytes, yf);
-    dst_argb += dst_stride;
-    y += dy;
-  }
-}
-void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering) {
-  // TODO(fbarchard): Allow higher wpp.
-  int dst_width_words = dst_width * wpp;
-  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
-  int j;
-  assert(wpp >= 1 && wpp <= 2);
-  assert(src_height != 0);
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  src_argb += (x >> 16) * wpp;
-#if defined(HAS_INTERPOLATEROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_16_SSE2;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_16_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    InterpolateRow = InterpolateRow_Any_16_SSSE3;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_16_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    InterpolateRow = InterpolateRow_Any_16_AVX2;
-    if (IS_ALIGNED(dst_width_bytes, 32)) {
-      InterpolateRow = InterpolateRow_16_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    InterpolateRow = InterpolateRow_Any_16_NEON;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_16_NEON;
-    }
-  }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
-  for (j = 0; j < dst_height; ++j) {
-    int yi;
-    int yf;
-    if (y > max_y) {
-      y = max_y;
-    }
-    yi = y >> 16;
-    yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_words, yf);
-    dst_argb += dst_stride;
-    y += dy;
-  }
-}
-
-// Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  enum FilterMode filtering) {
-  if (src_width < 0) {
-    src_width = -src_width;
-  }
-  if (src_height < 0) {
-    src_height = -src_height;
-  }
-  if (filtering == kFilterBox) {
-    // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
-    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
-      filtering = kFilterBilinear;
-    }
-  }
-  if (filtering == kFilterBilinear) {
-    if (src_height == 1) {
-      filtering = kFilterLinear;
-    }
-    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
-    if (dst_height == src_height || dst_height * 3 == src_height) {
-      filtering = kFilterLinear;
-    }
-    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
-    // avoid reading 2 pixels horizontally that causes memory exception.
-    if (src_width == 1) {
-      filtering = kFilterNone;
-    }
-  }
-  if (filtering == kFilterLinear) {
-    if (src_width == 1) {
-      filtering = kFilterNone;
-    }
-    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
-    if (dst_width == src_width || dst_width * 3 == src_width) {
-      filtering = kFilterNone;
-    }
-  }
-  return filtering;
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_C(int num, int div) {
-  return (int)(((int64)(num) << 16) / div);
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv1_C(int num, int div) {
-  return (int)((((int64)(num) << 16) - 0x00010001) /
-                          (div - 1));
-}
-
-#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
-
-// Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
-                enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy) {
-  assert(x != NULL);
-  assert(y != NULL);
-  assert(dx != NULL);
-  assert(dy != NULL);
-  assert(src_width != 0);
-  assert(src_height != 0);
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  // Check for 1 pixel and avoid FixedDiv overflow.
-  if (dst_width == 1 && src_width >= 32768) {
-    dst_width = src_width;
-  }
-  if (dst_height == 1 && src_height >= 32768) {
-    dst_height = src_height;
-  }
-  if (filtering == kFilterBox) {
-    // Scale step for point sampling duplicates all pixels equally.
-    *dx = FixedDiv(Abs(src_width), dst_width);
-    *dy = FixedDiv(src_height, dst_height);
-    *x = 0;
-    *y = 0;
-  } else if (filtering == kFilterBilinear) {
-    // Scale step for bilinear sampling renders last pixel once for upsample.
-    if (dst_width <= Abs(src_width)) {
-      *dx = FixedDiv(Abs(src_width), dst_width);
-      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
-    } else if (dst_width > 1) {
-      *dx = FixedDiv1(Abs(src_width), dst_width);
-      *x = 0;
-    }
-    if (dst_height <= src_height) {
-      *dy = FixedDiv(src_height,  dst_height);
-      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
-    } else if (dst_height > 1) {
-      *dy = FixedDiv1(src_height, dst_height);
-      *y = 0;
-    }
-  } else if (filtering == kFilterLinear) {
-    // Scale step for bilinear sampling renders last pixel once for upsample.
-    if (dst_width <= Abs(src_width)) {
-      *dx = FixedDiv(Abs(src_width), dst_width);
-      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
-    } else if (dst_width > 1) {
-      *dx = FixedDiv1(Abs(src_width), dst_width);
-      *x = 0;
-    }
-    *dy = FixedDiv(src_height, dst_height);
-    *y = *dy >> 1;
-  } else {
-    // Scale step for point sampling duplicates all pixels equally.
-    *dx = FixedDiv(Abs(src_width), dst_width);
-    *dy = FixedDiv(src_height, dst_height);
-    *x = CENTERSTART(*dx, 0);
-    *y = CENTERSTART(*dy, 0);
-  }
-  // Negative src_width means horizontally mirror.
-  if (src_width < 0) {
-    *x += (dst_width - 1) * *dx;
-    *dx = -*dx;
-    // src_width = -src_width;   // Caller must do this.
-  }
-}
-#undef CENTERSTART
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_gcc.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_gcc.cc
deleted file mode 100755
index a1ae4e2..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_gcc.cc
+++ /dev/null
@@ -1,1292 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
-}
-
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "pxor       %%xmm5,%%xmm5                  \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pavgw      %%xmm5,%%xmm0                  \n"
-    "pavgw      %%xmm5,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
-}
-
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "pxor       %%xmm5,%%xmm5                  \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    "psrlw      $0x1,%%xmm0                    \n"
-    "psrlw      $0x1,%%xmm1                    \n"
-    "pavgw      %%xmm5,%%xmm0                  \n"
-    "pavgw      %%xmm5,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-
-#ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
-}
-
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
-}
-
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
-    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SCALEROWDOWN2_AVX2
-
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x18,%%xmm5                    \n"
-    "pslld     $0x10,%%xmm5                    \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  intptr_t stridex3 = 0;
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                  \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "psllw      $0x3,%%xmm5                    \n"
-    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    "phaddw     %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm5,%%xmm0                  \n"
-    "psrlw      $0x4,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(stridex3)     // %3
-  : "r"((intptr_t)(src_stride))    // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-
-#ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
-    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
-    "sub        $0x10,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
-}
-
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
-    "sub        $0x10,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"((intptr_t)(src_stride * 3))   // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_SCALEROWDOWN4_AVX2
-
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm3                       \n"
-    "movdqa    %1,%%xmm4                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kShuf0),  // %0
-    "m"(kShuf1),  // %1
-    "m"(kShuf2)   // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "palignr   $0x8,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "m"(kMadd21)     // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
-
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-    : "+r"(src_ptr),   // %0
-      "+r"(dst_ptr),   // %1
-      "+r"(dst_width)  // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "m"(kMadd21)     // %4
-    : "memory", "cc", NACL_R14
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "lea       " MEMLEA(0xc,1) ",%1            \n"
-    "sub       $0xc,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "m"(kShuf38a),   // %3
-    "m"(kShuf38b)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
-}
-
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "movdqa    %3,%%xmm5                       \n"
-  :
-  : "m"(kShufAb0),   // %0
-    "m"(kShufAb1),   // %1
-    "m"(kShufAb2),   // %2
-    "m"(kScaleAb2)   // %3
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pshufb    %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "paddusw   %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "paddusw   %%xmm0,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm1                    \n"
-    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride))  // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-  :
-  : "m"(kShufAc),    // %0
-    "m"(kShufAc3),   // %1
-    "m"(kScaleAc33)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "pshufb    %%xmm3,%%xmm7                   \n"
-    "paddusw   %%xmm7,%%xmm6                   \n"
-    "pmulhuw   %%xmm4,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movd      %%xmm6," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm6                    \n"
-    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "paddusw   %%xmm2,%%xmm0                   \n"
-    "paddusw   %%xmm3,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width)    // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-
-
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
-    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
-    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width)    // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_SCALEADDROW_AVX2
-
-// Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
-  asm volatile (
-    "movd      %6,%%xmm2                       \n"
-    "movd      %7,%%xmm3                       \n"
-    "movl      $0x04040000,%k2                 \n"
-    "movd      %k2,%%xmm5                      \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "subl      $0x2,%5                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
-    "movd      %k2,%%xmm4                      \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "punpcklwd %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm1                   \n"
-    "pmaddubsw %%xmm1,%%xmm0                   \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,%k2                      \n"
-    "mov       %w2," MEMACCESS(0) "            \n"
-    "lea       " MEMLEA(0x2,0) ",%0            \n"
-    "sub       $0x2,%5                         \n"
-    "jge       2b                              \n"
-
-    LABELALIGN
-  "29:                                         \n"
-    "addl      $0x1,%5                         \n"
-    "jl        99f                             \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,%k2                      \n"
-    "mov       %b2," MEMACCESS(0) "            \n"
-  "99:                                         \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+a"(temp_pixel),  // %2
-    "+r"(x0),          // %3
-    "+r"(x1),          // %4
-    "+rm"(dst_width)   // %5
-  : "rm"(x),           // %6
-    "rm"(dx)           // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x20,%2                         \n"
-    "jg        1b                              \n"
-
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
-}
-
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
-}
-
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
-}
-
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
-  intptr_t src_stepx_x12 = 0;
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movd      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
-    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "punpckldq %%xmm3,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),      // %0
-    "+r"(src_stepx_x4),  // %1
-    "+r"(dst_argb),      // %2
-    "+r"(dst_width),     // %3
-    "+r"(src_stepx_x12)  // %4
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride, int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
-  intptr_t src_stepx_x12 = 0;
-  intptr_t row1 = (intptr_t)(src_stride);
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
-    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
-    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "movq      " MEMACCESS(5) ",%%xmm2         \n"
-    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
-    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
-    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(src_stepx_x4),   // %1
-    "+r"(dst_argb),       // %2
-    "+rm"(dst_width),     // %3
-    "+r"(src_stepx_x12),  // %4
-    "+r"(row1)            // %5
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
-}
-
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0;
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "cmp       $0x0,%4                         \n"
-    "jl        99f                             \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
-
-    LABELALIGN
-  "40:                                         \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "pextrw    $0x7,%%xmm2,%k1                 \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
-    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "punpckldq %%xmm4,%%xmm1                   \n"
-    "punpcklqdq %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "test      $0x2,%4                         \n"
-    "je        29f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x8,2) ",%2            \n"
-  "29:                                         \n"
-    "test      $0x1,%4                         \n"
-    "je        99f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-  "99:                                         \n"
-  : "+a"(x0),          // %0
-    "+d"(x1),          // %1
-    "+r"(dst_argb),    // %2
-    "+r"(src_argb),    // %3
-    "+r"(dst_width)    // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpckldq %%xmm0,%%xmm0                   \n"
-    "punpckhdq %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
-}
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0;
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm5                       \n"
-  :
-  : "m"(kShuffleColARGB),  // %0
-    "m"(kShuffleFractions)  // %1
-  );
-
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "sub       $0x2,%2                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm1                   \n"
-    "pmaddubsw %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x2,%2                         \n"
-    "jge       2b                              \n"
-
-    LABELALIGN
-  "29:                                         \n"
-    "add       $0x1,%2                         \n"
-    "jl        99f                             \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(0) "         \n"
-
-    LABELALIGN
-  "99:                                         \n"
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+rm"(dst_width),  // %2
-    "+r"(x0),          // %3
-    "+r"(x1)           // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
-  return num;
-}
-
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-int FixedDiv1_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "sub       $0x10001,%%eax                  \n"
-    "sbb       $0x0,%%edx                      \n"
-    "sub       $0x1,%1                         \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
-  return num;
-}
-
-#endif  // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_mips.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_mips.cc
deleted file mode 100755
index ae95307..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_mips.cc
+++ /dev/null
@@ -1,644 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__(
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-
-    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
-    "beqz           $t9, 2f                        \n"
-    " nop                                          \n"
-
-  "1:                                              \n"
-    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
-    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
-    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
-    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
-    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
-    // TODO(fbarchard): Use odd pixels instead of even.
-    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
-    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
-    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
-    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
-    "addiu          %[src_ptr], %[src_ptr], 32     \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sw             $t8, 0(%[dst])                 \n"
-    "sw             $t0, 4(%[dst])                 \n"
-    "sw             $t1, 8(%[dst])                 \n"
-    "sw             $t2, 12(%[dst])                \n"
-    "bgtz           $t9, 1b                        \n"
-    " addiu         %[dst], %[dst], 16             \n"
-
-  "2:                                              \n"
-    "andi           $t9, %[dst_width], 0xf         \n"  // residue
-    "beqz           $t9, 3f                        \n"
-    " nop                                          \n"
-
-  "21:                                             \n"
-    "lbu            $t0, 0(%[src_ptr])             \n"
-    "addiu          %[src_ptr], %[src_ptr], 2      \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sb             $t0, 0(%[dst])                 \n"
-    "bgtz           $t9, 21b                       \n"
-    " addiu         %[dst], %[dst], 1              \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  const uint8* t = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
-    "bltz           $t9, 2f                       \n"
-    " nop                                         \n"
-
-  "1:                                             \n"
-    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
-    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
-    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
-    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
-    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
-    "addiu          $t9, $t9, -1                  \n"
-    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
-    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
-    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
-    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
-    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
-    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
-    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
-    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
-    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
-    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
-    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
-    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
-    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
-    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
-    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
-    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
-    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
-    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
-    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
-    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
-    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
-    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
-    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
-    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
-    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
-    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
-    "addiu          %[src_ptr], %[src_ptr], 16    \n"
-    "addiu          %[t], %[t], 16                \n"
-    "sb             $t0, 0(%[dst])                \n"
-    "sb             $t4, 1(%[dst])                \n"
-    "sb             $t1, 2(%[dst])                \n"
-    "sb             $t5, 3(%[dst])                \n"
-    "sb             $t2, 4(%[dst])                \n"
-    "sb             $t6, 5(%[dst])                \n"
-    "sb             $t3, 6(%[dst])                \n"
-    "sb             $t7, 7(%[dst])                \n"
-    "bgtz           $t9, 1b                       \n"
-    " addiu         %[dst], %[dst], 8             \n"
-
-  "2:                                             \n"
-    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
-    "beqz           $t9, 3f                       \n"
-    " nop                                         \n"
-
-    "21:                                          \n"
-    "lwr            $t1, 0(%[src_ptr])            \n"
-    "lwl            $t1, 3(%[src_ptr])            \n"
-    "lwr            $t2, 0(%[t])                  \n"
-    "lwl            $t2, 3(%[t])                  \n"
-    "srl            $t8, $t1, 16                  \n"
-    "ins            $t1, $t2, 16, 16              \n"
-    "ins            $t2, $t8, 0, 16               \n"
-    "raddu.w.qb     $t1, $t1                      \n"
-    "raddu.w.qb     $t2, $t2                      \n"
-    "shra_r.w       $t1, $t1, 2                   \n"
-    "shra_r.w       $t2, $t2, 2                   \n"
-    "sb             $t1, 0(%[dst])                \n"
-    "sb             $t2, 1(%[dst])                \n"
-    "addiu          %[src_ptr], %[src_ptr], 4     \n"
-    "addiu          $t9, $t9, -2                  \n"
-    "addiu          %[t], %[t], 4                 \n"
-    "bgtz           $t9, 21b                      \n"
-    " addiu         %[dst], %[dst], 2             \n"
-
-  "3:                                             \n"
-    ".set pop                                     \n"
-
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst), [t] "+r" (t)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                    \n"
-      ".set noreorder                               \n"
-
-      "srl            $t9, %[dst_width], 3          \n"
-      "beqz           $t9, 2f                       \n"
-      " nop                                         \n"
-
-     "1:                                            \n"
-      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
-      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
-      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
-      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
-      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
-      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
-      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
-      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
-      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
-      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
-      "addiu          %[src_ptr], %[src_ptr], 32    \n"
-      "addiu          $t9, $t9, -1                  \n"
-      "sw             $t1, 0(%[dst])                \n"
-      "sw             $t5, 4(%[dst])                \n"
-      "bgtz           $t9, 1b                       \n"
-      " addiu         %[dst], %[dst], 8             \n"
-
-    "2:                                             \n"
-      "andi           $t9, %[dst_width], 7          \n"  // residue
-      "beqz           $t9, 3f                       \n"
-      " nop                                         \n"
-
-    "21:                                            \n"
-      "lbu            $t1, 0(%[src_ptr])            \n"
-      "addiu          %[src_ptr], %[src_ptr], 4     \n"
-      "addiu          $t9, $t9, -1                  \n"
-      "sb             $t1, 0(%[dst])                \n"
-      "bgtz           $t9, 21b                      \n"
-      " addiu         %[dst], %[dst], 1             \n"
-
-    "3:                                             \n"
-      ".set pop                                     \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst)
-      : [dst_width] "r" (dst_width)
-      : "t1", "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* s1 = src_ptr + stride;
-  const uint8* s2 = s1 + stride;
-  const uint8* s3 = s2 + stride;
-
-  __asm__ __volatile__ (
-      ".set push                                  \n"
-      ".set noreorder                             \n"
-
-      "srl           $t9, %[dst_width], 1         \n"
-      "andi          $t8, %[dst_width], 1         \n"
-
-     "1:                                          \n"
-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
-      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
-      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
-      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
-      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
-      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
-      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
-      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
-      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
-      "add           $t0, $t0, $t1                \n"
-      "add           $t1, $t2, $t3                \n"
-      "add           $t0, $t0, $t1                \n"
-      "add           $t4, $t4, $t5                \n"
-      "add           $t6, $t6, $t7                \n"
-      "add           $t4, $t4, $t6                \n"
-      "shra_r.w      $t0, $t0, 4                  \n"
-      "shra_r.w      $t4, $t4, 4                  \n"
-      "sb            $t0, 0(%[dst])               \n"
-      "sb            $t4, 1(%[dst])               \n"
-      "addiu         %[src_ptr], %[src_ptr], 8    \n"
-      "addiu         %[s1], %[s1], 8              \n"
-      "addiu         %[s2], %[s2], 8              \n"
-      "addiu         %[s3], %[s3], 8              \n"
-      "addiu         $t9, $t9, -1                 \n"
-      "bgtz          $t9, 1b                      \n"
-      " addiu        %[dst], %[dst], 2            \n"
-      "beqz          $t8, 2f                      \n"
-      " nop                                       \n"
-
-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
-      "add           $t0, $t0, $t1                \n"
-      "add           $t1, $t2, $t3                \n"
-      "add           $t0, $t0, $t1                \n"
-      "shra_r.w      $t0, $t0, 4                  \n"
-      "sb            $t0, 0(%[dst])               \n"
-
-      "2:                                         \n"
-      ".set pop                                   \n"
-
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [s3] "+r" (s3)
-      : [dst_width] "r" (dst_width)
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                          \n"
-      ".set noreorder                                     \n"
-    "1:                                                   \n"
-      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
-      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
-      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
-      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
-      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
-      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
-      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
-      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
-      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
-      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
-      "addiu           %[dst_width], %[dst_width], -24    \n"
-      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
-      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
-      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
-      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
-      "addiu           %[src_ptr], %[src_ptr], 32         \n"
-      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
-      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
-      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
-      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
-      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
-      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
-      "sw              $t1, 0(%[dst])                     \n"
-      "sw              $t0, 4(%[dst])                     \n"
-      "sw              $t3, 8(%[dst])                     \n"
-      "sw              $t5, 12(%[dst])                    \n"
-      "sw              $t9, 16(%[dst])                    \n"
-      "sw              $t7, 20(%[dst])                    \n"
-      "bnez            %[dst_width], 1b                   \n"
-      " addiu          %[dst], %[dst], 24                 \n"
-      ".set pop                                           \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "repl.ph           $t3, 3                          \n"  // 0x00030003
-
-    "1:                                                  \n"
-      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
-      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
-      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
-      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
-      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
-      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
-      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
-      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
-      "raddu.w.qb        $t0, $t0                        \n"
-      "raddu.w.qb        $t1, $t1                        \n"
-      "shra_r.w          $t0, $t0, 1                     \n"
-      "shra_r.w          $t1, $t1, 1                     \n"
-      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
-      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
-      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
-      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
-      "addu.ph           $t2, $t2, $t4                   \n"
-      "addu.ph           $t6, $t6, $t5                   \n"
-      "sll               $t5, $t0, 1                     \n"
-      "add               $t0, $t5, $t0                   \n"
-      "shra_r.ph         $t2, $t2, 2                     \n"
-      "shra_r.ph         $t6, $t6, 2                     \n"
-      "shll.ph           $t4, $t2, 1                     \n"
-      "addq.ph           $t4, $t4, $t2                   \n"
-      "addu              $t0, $t0, $t1                   \n"
-      "addiu             %[src_ptr], %[src_ptr], 4       \n"
-      "shra_r.w          $t0, $t0, 2                     \n"
-      "addu.ph           $t6, $t6, $t4                   \n"
-      "shra_r.ph         $t6, $t6, 2                     \n"
-      "srl               $t1, $t6, 16                    \n"
-      "addiu             %[dst_width], %[dst_width], -3  \n"
-      "sb                $t1, 0(%[d])                    \n"
-      "sb                $t0, 1(%[d])                    \n"
-      "sb                $t6, 2(%[d])                    \n"
-      "bgtz              %[dst_width], 1b                \n"
-      " addiu            %[d], %[d], 3                   \n"
-    "3:                                                  \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                           \n"
-      ".set noreorder                                      \n"
-      "repl.ph           $t2, 3                            \n"  // 0x00030003
-
-    "1:                                                    \n"
-      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
-      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
-      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
-      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
-      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
-      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
-      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
-      "raddu.w.qb        $t0, $t0                          \n"
-      "raddu.w.qb        $t1, $t1                          \n"
-      "shra_r.w          $t0, $t0, 1                       \n"
-      "shra_r.w          $t1, $t1, 1                       \n"
-      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
-      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
-      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
-      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
-      "addu.ph           $t4, $t4, $t3                     \n"
-      "addu.ph           $t6, $t6, $t5                     \n"
-      "shra_r.ph         $t6, $t6, 2                       \n"
-      "shra_r.ph         $t4, $t4, 2                       \n"
-      "addu.ph           $t6, $t6, $t4                     \n"
-      "addiu             %[src_ptr], %[src_ptr], 4         \n"
-      "shra_r.ph         $t6, $t6, 1                       \n"
-      "addu              $t0, $t0, $t1                     \n"
-      "addiu             %[dst_width], %[dst_width], -3    \n"
-      "shra_r.w          $t0, $t0, 1                       \n"
-      "srl               $t1, $t6, 16                      \n"
-      "sb                $t1, 0(%[d])                      \n"
-      "sb                $t0, 1(%[d])                      \n"
-      "sb                $t6, 2(%[d])                      \n"
-      "bgtz              %[dst_width], 1b                  \n"
-      " addiu            %[d], %[d], 3                     \n"
-    "3:                                                    \n"
-      ".set pop                                            \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                     \n"
-      ".set noreorder                                \n"
-
-    "1:                                              \n"
-      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
-      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
-      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
-      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
-      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
-      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
-      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
-      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
-      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
-      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
-      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
-      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
-      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
-      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
-      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
-      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
-      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
-      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
-      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
-      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
-      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
-      "addiu      %[src_ptr], %[src_ptr], 32         \n"
-      "addiu      %[dst_width], %[dst_width], -12    \n"
-      "addiu      $t8,%[dst_width], -12              \n"
-      "sw         $t1, 0(%[dst])                     \n"
-      "sw         $t4, 4(%[dst])                     \n"
-      "sw         $t6, 8(%[dst])                     \n"
-      "bgez       $t8, 1b                            \n"
-      " addiu     %[dst], %[dst], 12                 \n"
-      ".set pop                                      \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
-}
-
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* t = src_ptr + stride;
-  const int c = 0x2AAA;
-
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
-      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
-      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
-      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
-      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
-      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
-      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
-      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
-      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
-      "srl             $t4, $t4, 2                       \n"  // t4 / 4
-      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
-      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
-      "addu            $t6, $t5, $t6                     \n"
-      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
-      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
-      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
-      "addu            $t0, $t0, $t2                     \n"
-      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
-      "addiu           %[t], %[t], 8                     \n"
-      "addiu           %[dst_width], %[dst_width], -3    \n"
-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
-      "srl             $t6, $t6, 16                      \n"
-      "srl             $t0, $t0, 16                      \n"
-      "sb              $t4, -1(%[dst_ptr])               \n"
-      "sb              $t6, -2(%[dst_ptr])               \n"
-      "bgtz            %[dst_width], 1b                  \n"
-      " sb             $t0, -3(%[dst_ptr])               \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [t] "+r" (t),
-        [dst_width] "+r" (dst_width)
-      : [c] "r" (c)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* s1 = src_ptr + stride;
-  stride += stride;
-  const uint8* s2 = src_ptr + stride;
-  const int c1 = 0x1C71;
-  const int c2 = 0x2AAA;
-
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
-      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
-      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
-      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
-      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
-      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
-      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
-      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
-      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
-      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
-      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
-      "addu            $t7, $t7, $t8                     \n"
-      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
-      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
-      "addu            $t6, $t6, $t8                     \n"
-      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
-      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
-      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
-      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
-      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
-      "addu            $t7, $t7, $t8                     \n"
-      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
-      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
-      "raddu.w.qb      $t0, $t0                          \n"
-      "raddu.w.qb      $t2, $t2                          \n"
-      "raddu.w.qb      $t4, $t4                          \n"
-      "addu            $t0, $t0, $t2                     \n"
-      "addu            $t0, $t0, $t4                     \n"
-      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
-      "addiu           %[s1], %[s1], 8                   \n"
-      "addiu           %[s2], %[s2], 8                   \n"
-      "addiu           %[dst_width], %[dst_width], -3    \n"
-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
-      "srl             $t6, $t6, 16                      \n"
-      "srl             $t7, $t7, 16                      \n"
-      "srl             $t0, $t0, 16                      \n"
-      "sb              $t6, -1(%[dst_ptr])               \n"
-      "sb              $t7, -2(%[dst_ptr])               \n"
-      "bgtz            %[dst_width], 1b                  \n"
-      " sb             $t0, -3(%[dst_ptr])               \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [dst_width] "+r" (dst_width)
-      : [c1] "r" (c1), [c2] "r" (c2)
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
-}
-
-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_neon.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_neon.cc
deleted file mode 100755
index 10856cf..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_neon.cc
+++ /dev/null
@@ -1,1017 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
-    !defined(__aarch64__)
-
-// NEON downscalers with interpolation.
-// Provided by Fritz Koenig
-
-// Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"              // Clobber List
-  );
-}
-
-// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"     // Clobber List
-  );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
-    "vpadal.u8  q1, q3                         \n"
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "q0", "q1", "q2", "q3"     // Clobber List
-  );
-}
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
-    "subs       %2, %2, #8                     \n" // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d2}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1", "memory", "cc"
-  );
-}
-
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    MEMACCESS(3)
-    "vld1.8     {q1}, [%3]!                    \n"
-    MEMACCESS(4)
-    "vld1.8     {q2}, [%4]!                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q3}, [%5]!                    \n"
-    "subs       %2, %2, #4                     \n"
-    "vpaddl.u8  q0, q0                         \n"
-    "vpadal.u8  q0, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"
-    "vpadal.u8  q0, q3                         \n"
-    "vpaddl.u16 q0, q0                         \n"
-    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
-    "vmovn.u16  d0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.32    {d0[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width), // %2
-    "+r"(src_ptr1),  // %3
-    "+r"(src_ptr2),  // %4
-    "+r"(src_ptr3)   // %5
-  :
-  : "q0", "q1", "q2", "q3", "memory", "cc"
-  );
-}
-
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
-                         ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "subs       %2, %2, #24                  \n"
-    "vmov       d2, d3                       \n" // order d0, d1, d2
-    MEMACCESS(1)
-    "vst3.8     {d0, d1, d2}, [%1]!          \n"
-    "bgt        1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "d0", "d1", "d2", "d3", "memory", "cc"
-  );
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
-
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "vmovl.u8     q8, d4                       \n"
-    "vmovl.u8     q9, d5                       \n"
-    "vmovl.u8     q10, d6                      \n"
-    "vmovl.u8     q11, d7                      \n"
-
-    // 3 * line_0 + line_1
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vmlal.u8     q9, d1, d24                  \n"
-    "vmlal.u8     q10, d2, d24                 \n"
-    "vmlal.u8     q11, d3, d24                 \n"
-
-    // (3 * line_0 + line_1) >> 2
-    "vqrshrn.u16  d0, q8, #2                   \n"
-    "vqrshrn.u16  d1, q9, #2                   \n"
-    "vqrshrn.u16  d2, q10, #2                  \n"
-    "vqrshrn.u16  d3, q11, #2                  \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q8, d1                       \n"
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vqrshrn.u16  d0, q8, #2                   \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q8, d2                       \n"
-    "vmlal.u8     q8, d3, d24                  \n"
-    "vqrshrn.u16  d2, q8, #2                   \n"
-
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
-
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
-  );
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
-    // average src line 0 with src line 1
-    "vrhadd.u8    q0, q0, q2                   \n"
-    "vrhadd.u8    q1, q1, q3                   \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q3, d1                       \n"
-    "vmlal.u8     q3, d0, d24                  \n"
-    "vqrshrn.u16  d0, q3, #2                   \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q3, d2                       \n"
-    "vmlal.u8     q3, d3, d24                  \n"
-    "vqrshrn.u16  d2, q3, #2                   \n"
-
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
-  );
-}
-
-#define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
-                         ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q3}, [%3]                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
-    "subs       %2, %2, #12                    \n"
-    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
-    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
-    MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"
-    MEMACCESS(1)
-    "vst1.32    {d5[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
-  );
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
-
-  asm volatile (
-    MEMACCESS(5)
-    "vld1.16    {q13}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {q14}, [%6]                    \n"
-    MEMACCESS(7)
-    "vld1.8     {q15}, [%7]                    \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    MEMACCESS(4)
-    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
-    "subs         %2, %2, #12                  \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-    "vtrn.u8      d16, d17                     \n"
-
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-    "vtrn.u8      d18, d19                     \n"
-
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-    "vpaddl.u8    q8, q8                       \n"
-
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-    "vpaddl.u8    d19, d19                     \n"
-
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     q0, q8                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-    "vadd.u16     d4, d19                      \n"
-
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "vqrdmulh.s16 q2, q2, q13                  \n"
-    "vmovn.u16    d4, q2                       \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-    "vmovl.u8     q9, d18                      \n"
-
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-    "vadd.u16     q1, q9                       \n"
-
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
-
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
-
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q15                  \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
-
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_ptr1)          // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
-  );
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(4)
-    "vld1.16    {q13}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q14}, [%5]                    \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "subs         %2, %2, #12                  \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "vqrshrn.u16  d4, q2, #2                   \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
-
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
-
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q13                  \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
-
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),       // %0
-    "+r"(dst_ptr),       // %1
-    "+r"(dst_width),     // %2
-    "+r"(src_stride)     // %3
-  : "r"(&kMult38_Div6),  // %4
-    "r"(&kShuf38_2)      // %5
-  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
-  );
-}
-
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp = NULL;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       r12, %5                         \n"
-    "veor      q2, q2, q2                      \n"
-    "veor      q3, q3, q3                      \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], %3                 \n"
-    "vaddw.u8   q3, q3, d1                     \n"
-    "vaddw.u8   q2, q2, d0                     \n"
-    "subs       r12, r12, #1                   \n"
-    "bgt        2b                             \n"
-    MEMACCESS(2)
-    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
-    "add        %1, %1, #16                    \n"
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(src_tmp),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(dst_ptr),          // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_width),        // %4
-    "+r"(src_height)        // %5
-  :
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                     \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
-  int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
-  asm volatile (
-    "vdup.32    q0, %3                         \n"  // x
-    "vdup.32    q1, %4                         \n"  // dx
-    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
-    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
-    "vmul.s32   q1, q1, q2                     \n"
-    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "vadd.s32   q1, q1, q0                     \n"
-    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
-    "vadd.s32   q2, q1, q3                     \n"
-    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
-  "1:                                          \n"
-    LOAD2_DATA8_LANE(0)
-    LOAD2_DATA8_LANE(1)
-    LOAD2_DATA8_LANE(2)
-    LOAD2_DATA8_LANE(3)
-    LOAD2_DATA8_LANE(4)
-    LOAD2_DATA8_LANE(5)
-    LOAD2_DATA8_LANE(6)
-    LOAD2_DATA8_LANE(7)
-    "vmov       q10, q1                        \n"
-    "vmov       q11, q2                        \n"
-    "vuzp.16    q10, q11                       \n"
-    "vmovl.u8   q8, d6                         \n"
-    "vmovl.u8   q9, d7                         \n"
-    "vsubl.s16  q11, d18, d16                  \n"
-    "vsubl.s16  q12, d19, d17                  \n"
-    "vmovl.u16  q13, d20                       \n"
-    "vmovl.u16  q10, d21                       \n"
-    "vmul.s32   q11, q11, q13                  \n"
-    "vmul.s32   q12, q12, q10                  \n"
-    "vshrn.s32  d18, q11, #16                  \n"
-    "vshrn.s32  d19, q12, #16                  \n"
-    "vadd.s16   q8, q8, q9                     \n"
-    "vmovn.s16  d6, q8                         \n"
-
-    MEMACCESS(0)
-    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
-    "vadd.s32   q1, q1, q0                     \n"
-    "vadd.s32   q2, q2, q0                     \n"
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(x),                // %3
-    "+r"(dx),               // %4
-    "+r"(tmp),              // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3",
-    "q8", "q9", "q10", "q11", "q12", "q13"
-  );
-}
-
-#undef LOAD2_DATA8_LANE
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-  asm volatile (
-    "cmp          %4, #0                       \n"
-    "beq          100f                         \n"
-    "add          %2, %1                       \n"
-    "cmp          %4, #64                      \n"
-    "beq          75f                          \n"
-    "cmp          %4, #128                     \n"
-    "beq          50f                          \n"
-    "cmp          %4, #192                     \n"
-    "beq          25f                          \n"
-
-    "vdup.8       d5, %4                       \n"
-    "rsb          %4, #256                     \n"
-    "vdup.8       d4, %4                       \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vmull.u8     q13, d0, d4                  \n"
-    "vmull.u8     q14, d1, d4                  \n"
-    "vmlal.u8     q13, d2, d5                  \n"
-    "vmlal.u8     q14, d3, d5                  \n"
-    "vrshrn.u16   d0, q13, #8                  \n"
-    "vrshrn.u16   d1, q14, #8                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          1b                           \n"
-    "b            99f                          \n"
-
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          25b                          \n"
-    "b            99f                          \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          50b                          \n"
-    "b            99f                          \n"
-
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q1}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q0}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          75b                          \n"
-    "b            99f                          \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          100b                         \n"
-
-  "99:                                         \n"
-    MEMACCESS(0)
-    "vst1.8       {d1[7]}, [%0]                \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
-  :
-  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
-  );
-}
-
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.32    {q0, q1}, [%0]!                \n"
-    MEMACCESS(0)
-    "vld2.32    {q2, q3}, [%0]!                \n"
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    MEMACCESS(1)
-    "vst1.8     {q3}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    "vrshrn.u16 d2, q2, #1                     \n"
-    "vrshrn.u16 d3, q3, #1                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
-    "bgt       1b                              \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
-  );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
-    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    "vrshrn.u16 d2, q2, #2                     \n"
-    "vrshrn.u16 d3, q3, #2                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %3, lsl #2                \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[1]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[1]}, [%0], r12             \n"
-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"(src_stepx)     // %3
-  : "memory", "cc", "r12", "q0"
-  );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %4, lsl #2                \n"
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d4}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d5}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d6}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d7}, [%1], r12                \n"
-    "vaddl.u8   q0, d0, d1                     \n"
-    "vaddl.u8   q1, d2, d3                     \n"
-    "vaddl.u8   q2, d4, d5                     \n"
-    "vaddl.u8   q3, d6, d7                     \n"
-    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
-    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
-    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
-    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
-    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
-    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
-    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"(src_stepx)     // %4
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
-  );
-}
-
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
-
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  int tmp = 0;
-  const uint8* src_tmp = src_argb;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(d0, 0)
-    LOAD1_DATA32_LANE(d0, 1)
-    LOAD1_DATA32_LANE(d1, 0)
-    LOAD1_DATA32_LANE(d1, 1)
-    LOAD1_DATA32_LANE(d2, 0)
-    LOAD1_DATA32_LANE(d2, 1)
-    LOAD1_DATA32_LANE(d3, 0)
-    LOAD1_DATA32_LANE(d3, 1)
-
-    MEMACCESS(0)
-    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),         // %0
-    "+r"(src_argb),         // %1
-    "+r"(dst_width),        // %2
-    "+r"(x),                // %3
-    "+r"(dx),               // %4
-    "+r"(tmp),              // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "q0", "q1"
-  );
-}
-
-#undef LOAD1_DATA32_LANE
-
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
-
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
-  int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
-  asm volatile (
-    "vdup.32    q0, %3                         \n"  // x
-    "vdup.32    q1, %4                         \n"  // dx
-    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
-    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
-    "vmul.s32   q1, q1, q2                     \n"
-    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
-    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
-    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "vadd.s32   q8, q1, q0                     \n"
-  "1:                                          \n"
-    // d0, d1: a
-    // d2, d3: b
-    LOAD2_DATA32_LANE(d0, d2, 0)
-    LOAD2_DATA32_LANE(d0, d2, 1)
-    LOAD2_DATA32_LANE(d1, d3, 0)
-    LOAD2_DATA32_LANE(d1, d3, 1)
-    "vshrn.i32   d22, q8, #9                   \n"
-    "vand.16     d22, d22, d30                 \n"
-    "vdup.8      d24, d22[0]                   \n"
-    "vdup.8      d25, d22[2]                   \n"
-    "vdup.8      d26, d22[4]                   \n"
-    "vdup.8      d27, d22[6]                   \n"
-    "vext.8      d4, d24, d25, #4              \n"
-    "vext.8      d5, d26, d27, #4              \n"  // f
-    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
-    "vmull.u8    q11, d0, d20                  \n"
-    "vmull.u8    q12, d1, d21                  \n"
-    "vmull.u8    q13, d2, d4                   \n"
-    "vmull.u8    q14, d3, d5                   \n"
-    "vadd.i16    q11, q11, q13                 \n"
-    "vadd.i16    q12, q12, q14                 \n"
-    "vshrn.i16   d0, q11, #7                   \n"
-    "vshrn.i16   d1, q12, #7                   \n"
-
-    MEMACCESS(0)
-    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
-    "vadd.s32    q8, q8, q9                    \n"
-    "subs        %2, %2, #4                    \n"  // 4 processed per loop
-    "bgt         1b                            \n"
-  : "+r"(dst_argb),         // %0
-    "+r"(src_argb),         // %1
-    "+r"(dst_width),        // %2
-    "+r"(x),                // %3
-    "+r"(dx),               // %4
-    "+r"(tmp),              // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#undef LOAD2_DATA32_LANE
-
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_neon64.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_neon64.cc
deleted file mode 100755
index 1d55193..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_neon64.cc
+++ /dev/null
@@ -1,1042 +0,0 @@
-/*
- *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into v0, odd into v1
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"              // Clobber List
-  );
-}
-
-// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #1              \n"
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"     // Clobber List
-  );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
-    "uadalp     v1.8h, v3.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #2              \n"
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "v0", "v1", "v2", "v3"     // Clobber List
-  );
-}
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "st1     {v2.8b}, [%1], #8                 \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
-}
-
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
-    MEMACCESS(3)
-    "ld1     {v1.16b}, [%2], #16               \n"
-    MEMACCESS(4)
-    "ld1     {v2.16b}, [%3], #16               \n"
-    MEMACCESS(5)
-    "ld1     {v3.16b}, [%4], #16               \n"
-    "subs    %w5, %w5, #4                      \n"
-    "uaddlp  v0.8h, v0.16b                     \n"
-    "uadalp  v0.8h, v1.16b                     \n"
-    "uadalp  v0.8h, v2.16b                     \n"
-    "uadalp  v0.8h, v3.16b                     \n"
-    "addp    v0.8h, v0.8h, v0.8h               \n"
-    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
-    MEMACCESS(1)
-    "st1    {v0.s}[0], [%1], #4                \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(src_ptr1),  // %2
-    "+r"(src_ptr2),  // %3
-    "+r"(src_ptr3),  // %4
-    "+r"(dst_width)  // %5
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
-}
-
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
-                         ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    "subs      %w2, %w2, #24                           \n"
-    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
-
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "ushll     v16.8h, v4.8b, #0                       \n"
-    "ushll     v17.8h, v5.8b, #0                       \n"
-    "ushll     v18.8h, v6.8b, #0                       \n"
-    "ushll     v19.8h, v7.8b, #0                       \n"
-
-    // 3 * line_0 + line_1
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "umlal     v17.8h, v1.8b, v20.8b                   \n"
-    "umlal     v18.8h, v2.8b, v20.8b                   \n"
-    "umlal     v19.8h, v3.8b, v20.8b                   \n"
-
-    // (3 * line_0 + line_1) >> 2
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
-    "uqrshrn   v1.8b, v17.8h, #2                       \n"
-    "uqrshrn   v2.8b, v18.8h, #2                       \n"
-    "uqrshrn   v3.8b, v19.8h, #2                       \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v16.8h, v1.8b, #0                       \n"
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v16.8h, v2.8b, #0                       \n"
-    "umlal     v16.8h, v3.8b, v20.8b                   \n"
-    "uqrshrn   v2.8b, v16.8h, #2                       \n"
-
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
-    "v20", "memory", "cc"
-  );
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
-    // average src line 0 with src line 1
-    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
-    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
-    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
-    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v4.8h, v1.8b, #0                        \n"
-    "umlal     v4.8h, v0.8b, v20.8b                    \n"
-    "uqrshrn   v0.8b, v4.8h, #2                        \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v4.8h, v2.8b, #0                        \n"
-    "umlal     v4.8h, v3.8b, v20.8b                    \n"
-    "uqrshrn   v2.8b, v4.8h, #2                        \n"
-
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
-  );
-}
-
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
-                         ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1       {v3.16b}, [%3]                          \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
-    "subs      %w2, %w2, #12                           \n"
-    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
-    MEMACCESS(1)
-    "st1       {v2.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v2.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
-                                      ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
-  ptrdiff_t tmp_src_stride = src_stride;
-
-  asm volatile (
-    MEMACCESS(5)
-    "ld1       {v29.8h}, [%5]                          \n"
-    MEMACCESS(6)
-    "ld1       {v30.16b}, [%6]                         \n"
-    MEMACCESS(7)
-    "ld1       {v31.8h}, [%7]                          \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
-
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    MEMACCESS(4)
-    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
-    "subs      %w4, %w4, #12                           \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v20.8b, v0.8b, v1.8b                    \n"
-    "trn2      v21.8b, v0.8b, v1.8b                    \n"
-    "trn1      v22.8b, v4.8b, v5.8b                    \n"
-    "trn2      v23.8b, v4.8b, v5.8b                    \n"
-    "trn1      v24.8b, v16.8b, v17.8b                  \n"
-    "trn2      v25.8b, v16.8b, v17.8b                  \n"
-
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
-    "trn1      v16.8b, v18.8b, v19.8b                  \n"
-    "trn2      v17.8b, v18.8b, v19.8b                  \n"
-
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v20.4h, v20.8b                          \n"
-    "uaddlp    v21.4h, v21.8b                          \n"
-    "uaddlp    v22.4h, v22.8b                          \n"
-    "uaddlp    v23.4h, v23.8b                          \n"
-    "uaddlp    v24.4h, v24.8b                          \n"
-    "uaddlp    v25.4h, v25.8b                          \n"
-
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
-
-    // combine source lines
-    "add       v20.4h, v20.4h, v22.4h                  \n"
-    "add       v21.4h, v21.4h, v23.4h                  \n"
-    "add       v20.4h, v20.4h, v24.4h                  \n"
-    "add       v21.4h, v21.4h, v25.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
-    "add       v2.4h, v2.4h, v17.4h                    \n"
-
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
-    "xtn       v2.8b,  v2.8h                           \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "ushll     v16.8h, v16.8b, #0                      \n"
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
-
-    // combine source lines
-    "add       v0.8h, v0.8h, v16.8h                    \n"
-
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
-
-    // 0+1+2, 3+4+5
-    "add       v20.8h, v20.8h, v0.8h                   \n"
-    "add       v21.8h, v21.8h, v4.8h                   \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
-    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
-
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(tmp_src_stride),   // %2
-    "+r"(src_ptr1),         // %3
-    "+r"(dst_width)         // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
-    "v30", "v31", "memory", "cc"
-  );
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  // TODO(fbarchard): use src_stride directly for clang 3.5+.
-  ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile (
-    MEMACCESS(4)
-    "ld1       {v30.8h}, [%4]                          \n"
-    MEMACCESS(5)
-    "ld1       {v31.16b}, [%5]                         \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
-
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    "subs      %w3, %w3, #12                           \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v16.8b, v0.8b, v1.8b                    \n"
-    "trn2      v17.8b, v0.8b, v1.8b                    \n"
-    "trn1      v18.8b, v4.8b, v5.8b                    \n"
-    "trn2      v19.8b, v4.8b, v5.8b                    \n"
-
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
-
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v16.4h, v16.8b                          \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
-    "uaddlp    v18.4h, v18.8b                          \n"
-    "uaddlp    v19.4h, v19.8b                          \n"
-
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
-
-    // combine source lines
-    "add       v16.4h, v16.4h, v18.4h                  \n"
-    "add       v17.4h, v17.4h, v19.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
-
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "uqrshrn   v2.8b, v2.8h, #2                        \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-
-    // combine source lines
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
-
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
-
-    // 0+1+2, 3+4+5
-    "add       v16.8h, v16.8h, v0.8h                   \n"
-    "add       v17.8h, v17.8h, v4.8h                   \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
-    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
-
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),         // %0
-    "+r"(dst_ptr),         // %1
-    "+r"(tmp_src_stride),  // %2
-    "+r"(dst_width)        // %3
-  : "r"(&kMult38_Div6),    // %4
-    "r"(&kShuf38_2)        // %5
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v30", "v31", "memory", "cc"
-  );
-}
-
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp = NULL;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       w12, %w5                        \n"
-    "eor       v2.16b, v2.16b, v2.16b          \n"
-    "eor       v3.16b, v3.16b, v3.16b          \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %3              \n"
-    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
-    "uaddw     v2.8h, v2.8h, v0.8b             \n"
-    "subs      w12, w12, #1                    \n"
-    "b.gt      2b                              \n"
-    MEMACCESS(2)
-    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
-    "add      %1, %1, #16                      \n"
-    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
-    "b.gt     1b                               \n"
-  : "+r"(src_tmp),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(dst_ptr),          // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_width),        // %4
-    "+r"(src_height)        // %5
-  :
-  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                    \n"              \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
-  int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
-  asm volatile (
-    "dup        v0.4s, %w3                     \n"  // x
-    "dup        v1.4s, %w4                     \n"  // dx
-    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
-    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
-    "mul        v1.4s, v1.4s, v2.4s            \n"
-    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "add        v1.4s, v1.4s, v0.4s            \n"
-    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
-    "add        v2.4s, v1.4s, v3.4s            \n"
-    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
-  "1:                                          \n"
-    LOAD2_DATA8_LANE(0)
-    LOAD2_DATA8_LANE(1)
-    LOAD2_DATA8_LANE(2)
-    LOAD2_DATA8_LANE(3)
-    LOAD2_DATA8_LANE(4)
-    LOAD2_DATA8_LANE(5)
-    LOAD2_DATA8_LANE(6)
-    LOAD2_DATA8_LANE(7)
-    "mov       v6.16b, v1.16b                  \n"
-    "mov       v7.16b, v2.16b                  \n"
-    "uzp1      v6.8h, v6.8h, v7.8h             \n"
-    "ushll     v4.8h, v4.8b, #0                \n"
-    "ushll     v5.8h, v5.8b, #0                \n"
-    "ssubl     v16.4s, v5.4h, v4.4h            \n"
-    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
-    "ushll     v7.4s, v6.4h, #0                \n"
-    "ushll2    v6.4s, v6.8h, #0                \n"
-    "mul       v16.4s, v16.4s, v7.4s           \n"
-    "mul       v17.4s, v17.4s, v6.4s           \n"
-    "shrn      v6.4h, v16.4s, #16              \n"
-    "shrn2     v6.8h, v17.4s, #16              \n"
-    "add       v4.8h, v4.8h, v6.8h             \n"
-    "xtn       v4.8b, v4.8h                    \n"
-
-    MEMACCESS(0)
-    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
-    "add       v1.4s, v1.4s, v0.4s             \n"
-    "add       v2.4s, v2.4s, v0.4s             \n"
-    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
-    "b.gt      1b                              \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(dst_width64),      // %2
-    "+r"(x64),              // %3
-    "+r"(dx64),             // %4
-    "+r"(tmp),              // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3",
-    "v4", "v5", "v6", "v7", "v16", "v17"
-  );
-}
-
-#undef LOAD2_DATA8_LANE
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-    int y_fraction = 256 - source_y_fraction;
-  asm volatile (
-    "cmp          %w4, #0                      \n"
-    "b.eq         100f                         \n"
-    "add          %2, %2, %1                   \n"
-    "cmp          %w4, #64                     \n"
-    "b.eq         75f                          \n"
-    "cmp          %w4, #128                    \n"
-    "b.eq         50f                          \n"
-    "cmp          %w4, #192                    \n"
-    "b.eq         25f                          \n"
-
-    "dup          v5.8b, %w4                   \n"
-    "dup          v4.8b, %w5                   \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "umull        v6.8h, v0.8b, v4.8b          \n"
-    "umull2       v7.8h, v0.16b, v4.16b        \n"
-    "umlal        v6.8h, v1.8b, v5.8b          \n"
-    "umlal2       v7.8h, v1.16b, v5.16b        \n"
-    "rshrn        v0.8b, v6.8h, #8             \n"
-    "rshrn2       v0.16b, v7.8h, #8            \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         1b                           \n"
-    "b            99f                          \n"
-
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         25b                          \n"
-    "b            99f                          \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         50b                          \n"
-    "b            99f                          \n"
-
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v1.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v0.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         75b                          \n"
-    "b            99f                          \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         100b                         \n"
-
-  "99:                                         \n"
-    MEMACCESS(0)
-    "st1          {v0.b}[15], [%0]             \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction),// %4
-    "+r"(y_fraction)        // %5
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
-  );
-}
-
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS (0)
-    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
-    MEMACCESS (0)
-    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS (1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    MEMACCESS (1)
-    "st1        {v3.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (dst),              // %1
-    "+r" (dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS (0)
-    // load 8 ARGB pixels.
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #1               \n"
-    "rshrn      v2.8b, v2.8h, #1               \n"
-    "rshrn      v3.8b, v3.8h, #1               \n"
-    MEMACCESS (1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
-  );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS (0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS (1)
-    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
-    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
-    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #2               \n"
-    "rshrn      v2.8b, v2.8h, #2               \n"
-    "rshrn      v3.8b, v3.8h, #2               \n"
-    MEMACCESS (2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (src_stride),       // %1
-    "+r" (dst),              // %2
-    "+r" (dst_width)         // %3
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
-  );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[0], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[1], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[2], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[3], [%0], %3            \n"
-    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"((int64)(src_stepx * 4)) // %3
-  : "memory", "cc", "v0"
-  );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-// TODO(Yang Zhang): Might be worth another optimization pass in future.
-// It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v4.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v5.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v6.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v7.8b}, [%1], %4              \n"
-    "uaddl      v0.8h, v0.8b, v1.8b            \n"
-    "uaddl      v2.8h, v2.8b, v3.8b            \n"
-    "uaddl      v4.8h, v4.8b, v5.8b            \n"
-    "uaddl      v6.8h, v6.8b, v7.8b            \n"
-    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
-    "mov        v0.d[1], v2.d[0]               \n"
-    "mov        v2.d[0], v16.d[1]              \n"
-    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
-    "mov        v4.d[1], v6.d[0]               \n"
-    "mov        v6.d[0], v16.d[1]              \n"
-    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
-    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
-    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
-    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
-    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "st1     {v0.16b}, [%2], #16               \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"((int64)(src_stepx * 4)) // %4
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld1        {"#vn".s}["#n"], [%6]          \n"
-
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
-  int64 tmp64 = 0;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(v0, 0)
-    LOAD1_DATA32_LANE(v0, 1)
-    LOAD1_DATA32_LANE(v0, 2)
-    LOAD1_DATA32_LANE(v0, 3)
-    LOAD1_DATA32_LANE(v1, 0)
-    LOAD1_DATA32_LANE(v1, 1)
-    LOAD1_DATA32_LANE(v1, 2)
-    LOAD1_DATA32_LANE(v1, 3)
-
-    MEMACCESS(0)
-    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    "b.gt        1b                            \n"
-  : "+r"(dst_argb),         // %0
-    "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
-    "+r"(x64),              // %3
-    "+r"(dx64),             // %4
-    "+r"(tmp64),            // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "v0", "v1"
-  );
-}
-
-#undef LOAD1_DATA32_LANE
-
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
-
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
-  int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
-  asm volatile (
-    "dup        v0.4s, %w3                     \n"  // x
-    "dup        v1.4s, %w4                     \n"  // dx
-    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
-    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
-    "mul        v1.4s, v1.4s, v2.4s            \n"
-    "movi       v3.16b, #0x7f                  \n"  // 0x7F
-    "movi       v4.8h, #0x7f                   \n"  // 0x7F
-    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "add        v5.4s, v1.4s, v0.4s            \n"
-  "1:                                          \n"
-    // d0, d1: a
-    // d2, d3: b
-    LOAD2_DATA32_LANE(v0, v1, 0)
-    LOAD2_DATA32_LANE(v0, v1, 1)
-    LOAD2_DATA32_LANE(v0, v1, 2)
-    LOAD2_DATA32_LANE(v0, v1, 3)
-    "shrn       v2.4h, v5.4s, #9               \n"
-    "and        v2.8b, v2.8b, v4.8b            \n"
-    "dup        v16.8b, v2.b[0]                \n"
-    "dup        v17.8b, v2.b[2]                \n"
-    "dup        v18.8b, v2.b[4]                \n"
-    "dup        v19.8b, v2.b[6]                \n"
-    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
-    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
-    "ins        v2.d[1], v17.d[0]              \n"  // f
-    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
-    "umull      v16.8h, v0.8b, v7.8b           \n"
-    "umull2     v17.8h, v0.16b, v7.16b         \n"
-    "umull      v18.8h, v1.8b, v2.8b           \n"
-    "umull2     v19.8h, v1.16b, v2.16b         \n"
-    "add        v16.8h, v16.8h, v18.8h         \n"
-    "add        v17.8h, v17.8h, v19.8h         \n"
-    "shrn       v0.8b, v16.8h, #7              \n"
-    "shrn2      v0.16b, v17.8h, #7             \n"
-
-    MEMACCESS(0)
-    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
-    "add     v5.4s, v5.4s, v6.4s               \n"
-    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
-    "b.gt    1b                                \n"
-  : "+r"(dst_argb),         // %0
-    "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
-    "+r"(x64),              // %3
-    "+r"(dx64),             // %4
-    "+r"(tmp),              // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
-    "v6", "v7", "v16", "v17", "v18", "v19"
-  );
-}
-
-#undef LOAD2_DATA32_LANE
-
-#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/scale_win.cc b/android/src/main/libenc/jni/libyuv/jni/source/scale_win.cc
deleted file mode 100755
index 21b1ed9..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/scale_win.cc
+++ /dev/null
@@ -1,1357 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x1 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
-    psrlw      xmm4, 15
-    packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
-    pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5      // (x + 1) / 2
-    pavgw      xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x2 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
-    psrlw      xmm4, 15
-    packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add
-    paddw      xmm1, xmm3
-    psrlw      xmm0, 1
-    psrlw      xmm1, 1
-    pavgw      xmm0, xmm5      // (x + 1) / 2
-    pavgw      xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         wloop
-
-    pop        esi
-    ret
-  }
-}
-
-#ifdef HAS_SCALEROWDOWN2_AVX2
-// Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked)
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-
-  wloop:
-    vmovdqu     ymm0, [eax]
-    vmovdqu     ymm1, [eax + 32]
-    lea         eax,  [eax + 64]
-    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
-    vpsrlw      ymm1, ymm1, 8
-    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
-    vmovdqu     [edx], ymm0
-    lea         edx, [edx + 32]
-    sub         ecx, 32
-    jg          wloop
-
-    vzeroupper
-    ret
-  }
-}
-
-// Blends 64x1 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
-
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
-    vpsrlw      ymm4, ymm4, 15
-    vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
-
-  wloop:
-    vmovdqu     ymm0, [eax]
-    vmovdqu     ymm1, [eax + 32]
-    lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
-    vpmaddubsw  ymm1, ymm1, ymm4
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
-    vpavgw      ymm1, ymm1, ymm5
-    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-    vmovdqu     [edx], ymm0
-    lea         edx, [edx + 32]
-    sub         ecx, 32
-    jg          wloop
-
-    vzeroupper
-    ret
-  }
-}
-
-// For rounding, average = (sum + 2) / 4
-// becomes average((sum >> 1), 0)
-// Blends 64x2 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  __asm {
-    push        esi
-    mov         eax, [esp + 4 + 4]    // src_ptr
-    mov         esi, [esp + 4 + 8]    // src_stride
-    mov         edx, [esp + 4 + 12]   // dst_ptr
-    mov         ecx, [esp + 4 + 16]   // dst_width
-
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
-    vpsrlw      ymm4, ymm4, 15
-    vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
-
-  wloop:
-    vmovdqu     ymm0, [eax]
-    vmovdqu     ymm1, [eax + 32]
-    vmovdqu     ymm2, [eax + esi]
-    vmovdqu     ymm3, [eax + esi + 32]
-    lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
-    vpmaddubsw  ymm1, ymm1, ymm4
-    vpmaddubsw  ymm2, ymm2, ymm4
-    vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add
-    vpaddw      ymm1, ymm1, ymm3
-    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
-    vpsrlw      ymm1, ymm1, 1
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
-    vpavgw      ymm1, ymm1, ymm5
-    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-    vmovdqu     [edx], ymm0
-    lea         edx, [edx + 32]
-    sub         ecx, 32
-    jg          wloop
-
-    pop         esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_SCALEROWDOWN2_AVX2
-
-// Point samples 32 pixels to 8 pixels.
-__declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
-    psrld      xmm5, 24
-    pslld      xmm5, 16
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    psrlw      xmm0, 8
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    sub        ecx, 8
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x4 rectangle to 8x1.
-__declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-    mov        esi, [esp + 8 + 8]    // src_stride
-    mov        edx, [esp + 8 + 12]   // dst_ptr
-    mov        ecx, [esp + 8 + 16]   // dst_width
-    lea        edi, [esi + esi * 2]  // src_stride * 3
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
-    psrlw      xmm4, 15
-    movdqa     xmm5, xmm4
-    packuswb   xmm4, xmm4
-    psllw      xmm5, 3               // constant 0x0008
-
-  wloop:
-    movdqu     xmm0, [eax]           // average rows
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    pmaddubsw  xmm0, xmm4      // horizontal add
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add rows 0, 1
-    paddw      xmm1, xmm3
-    movdqu     xmm2, [eax + esi * 2]
-    movdqu     xmm3, [eax + esi * 2 + 16]
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 2
-    paddw      xmm1, xmm3
-    movdqu     xmm2, [eax + edi]
-    movdqu     xmm3, [eax + edi + 16]
-    lea        eax, [eax + 32]
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 3
-    paddw      xmm1, xmm3
-    phaddw     xmm0, xmm1
-    paddw      xmm0, xmm5      // + 8 for round
-    psrlw      xmm0, 4         // /16 for average of 4 * 4
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    sub        ecx, 8
-    jg         wloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#ifdef HAS_SCALEROWDOWN4_AVX2
-// Point samples 64 pixels to 16 pixels.
-__declspec(naked)
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride ignored
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
-    vpsrld      ymm5, ymm5, 24
-    vpslld      ymm5, ymm5, 16
-
-  wloop:
-    vmovdqu     ymm0, [eax]
-    vmovdqu     ymm1, [eax + 32]
-    lea         eax,  [eax + 64]
-    vpand       ymm0, ymm0, ymm5
-    vpand       ymm1, ymm1, ymm5
-    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-    vpsrlw      ymm0, ymm0, 8
-    vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-    vmovdqu     [edx], xmm0
-    lea         edx, [edx + 16]
-    sub         ecx, 16
-    jg          wloop
-
-    vzeroupper
-    ret
-  }
-}
-
-// Blends 64x4 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  __asm {
-    push        esi
-    push        edi
-    mov         eax, [esp + 8 + 4]    // src_ptr
-    mov         esi, [esp + 8 + 8]    // src_stride
-    mov         edx, [esp + 8 + 12]   // dst_ptr
-    mov         ecx, [esp + 8 + 16]   // dst_width
-    lea         edi, [esi + esi * 2]  // src_stride * 3
-    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101
-    vpsrlw      ymm4, ymm4, 15
-    vpsllw      ymm5, ymm4, 3               // constant 0x0008
-    vpackuswb   ymm4, ymm4, ymm4
-
-  wloop:
-    vmovdqu     ymm0, [eax]           // average rows
-    vmovdqu     ymm1, [eax + 32]
-    vmovdqu     ymm2, [eax + esi]
-    vmovdqu     ymm3, [eax + esi + 32]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
-    vpmaddubsw  ymm1, ymm1, ymm4
-    vpmaddubsw  ymm2, ymm2, ymm4
-    vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
-    vpaddw      ymm1, ymm1, ymm3
-    vmovdqu     ymm2, [eax + esi * 2]
-    vmovdqu     ymm3, [eax + esi * 2 + 32]
-    vpmaddubsw  ymm2, ymm2, ymm4
-    vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 2
-    vpaddw      ymm1, ymm1, ymm3
-    vmovdqu     ymm2, [eax + edi]
-    vmovdqu     ymm3, [eax + edi + 32]
-    lea         eax,  [eax + 64]
-    vpmaddubsw  ymm2, ymm2, ymm4
-    vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 3
-    vpaddw      ymm1, ymm1, ymm3
-    vphaddw     ymm0, ymm0, ymm1      // mutates
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
-    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
-    vpsrlw      ymm0, ymm0, 4         // /32 for average of 4 * 4
-    vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-    vmovdqu     [edx], xmm0
-    lea         edx, [edx + 16]
-    sub         ecx, 16
-    jg          wloop
-
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_SCALEROWDOWN4_AVX2
-
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-__declspec(naked)
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    movdqa     xmm3, xmmword ptr kShuf0
-    movdqa     xmm4, xmmword ptr kShuf1
-    movdqa     xmm5, xmmword ptr kShuf2
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm1
-    palignr    xmm1, xmm0, 8
-    pshufb     xmm0, xmm3
-    pshufb     xmm1, xmm4
-    pshufb     xmm2, xmm5
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + 8], xmm1
-    movq       qword ptr [edx + 16], xmm2
-    lea        edx, [edx + 24]
-    sub        ecx, 24
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 kRound34
-
-// Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, xmmword ptr kShuf01
-    movdqa     xmm3, xmmword ptr kShuf11
-    movdqa     xmm4, xmmword ptr kShuf21
-    movdqa     xmm5, xmmword ptr kMadd01
-    movdqa     xmm6, xmmword ptr kMadd11
-    movdqa     xmm7, xmmword ptr kRound34
-
-  wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
-    movdqu     xmm1, [eax + esi]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
-    movdqu     xmm1, [eax + esi + 8]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
-    movdqu     xmm1, [eax + esi + 16]
-    lea        eax, [eax + 32]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, xmmword ptr kMadd21
-    pmaddubsw  xmm0, xmm1
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx + 16], xmm0
-    lea        edx, [edx + 24]
-    sub        ecx, 24
-    jg         wloop
-
-    pop        esi
-    ret
-  }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, xmmword ptr kShuf01
-    movdqa     xmm3, xmmword ptr kShuf11
-    movdqa     xmm4, xmmword ptr kShuf21
-    movdqa     xmm5, xmmword ptr kMadd01
-    movdqa     xmm6, xmmword ptr kMadd11
-    movdqa     xmm7, xmmword ptr kRound34
-
-  wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
-    movdqu     xmm1, [eax + esi]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
-    movdqu     xmm1, [eax + esi + 8]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
-    movdqu     xmm1, [eax + esi + 16]
-    lea        eax, [eax + 32]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, xmmword ptr kMadd21
-    pmaddubsw  xmm0, xmm1
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx + 16], xmm0
-    lea        edx, [edx+24]
-    sub        ecx, 24
-    jg         wloop
-
-    pop        esi
-    ret
-  }
-}
-
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked)
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    movdqa     xmm4, xmmword ptr kShuf38a
-    movdqa     xmm5, xmmword ptr kShuf38b
-
-  xloop:
-    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm4
-    pshufb     xmm1, xmm5
-    paddusb    xmm0, xmm1
-
-    movq       qword ptr [edx], xmm0  // write 12 pixels
-    movhlps    xmm1, xmm0
-    movd       [edx + 8], xmm1
-    lea        edx, [edx + 12]
-    sub        ecx, 12
-    jg         xloop
-
-    ret
-  }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, xmmword ptr kShufAc
-    movdqa     xmm3, xmmword ptr kShufAc3
-    movdqa     xmm4, xmmword ptr kScaleAc33
-    pxor       xmm5, xmm5
-
-  xloop:
-    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
-    movdqu     xmm6, [eax + esi]
-    movhlps    xmm1, xmm0
-    movhlps    xmm7, xmm6
-    punpcklbw  xmm0, xmm5
-    punpcklbw  xmm1, xmm5
-    punpcklbw  xmm6, xmm5
-    punpcklbw  xmm7, xmm5
-    paddusw    xmm0, xmm6
-    paddusw    xmm1, xmm7
-    movdqu     xmm6, [eax + esi * 2]
-    lea        eax, [eax + 16]
-    movhlps    xmm7, xmm6
-    punpcklbw  xmm6, xmm5
-    punpcklbw  xmm7, xmm5
-    paddusw    xmm0, xmm6
-    paddusw    xmm1, xmm7
-
-    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
-    psrldq     xmm0, 2
-    paddusw    xmm6, xmm0
-    psrldq     xmm0, 2
-    paddusw    xmm6, xmm0
-    pshufb     xmm6, xmm2
-
-    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
-    psrldq     xmm1, 2
-    paddusw    xmm7, xmm1
-    psrldq     xmm1, 2
-    paddusw    xmm7, xmm1
-    pshufb     xmm7, xmm3
-    paddusw    xmm6, xmm7
-
-    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
-    packuswb   xmm6, xmm6
-
-    movd       [edx], xmm6           // write 6 pixels
-    psrlq      xmm6, 16
-    movd       [edx + 2], xmm6
-    lea        edx, [edx + 6]
-    sub        ecx, 6
-    jg         xloop
-
-    pop        esi
-    ret
-  }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, xmmword ptr kShufAb0
-    movdqa     xmm3, xmmword ptr kShufAb1
-    movdqa     xmm4, xmmword ptr kShufAb2
-    movdqa     xmm5, xmmword ptr kScaleAb2
-
-  xloop:
-    movdqu     xmm0, [eax]           // average 2 rows into xmm0
-    movdqu     xmm1, [eax + esi]
-    lea        eax, [eax + 16]
-    pavgb      xmm0, xmm1
-
-    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
-    pshufb     xmm1, xmm2
-    movdqa     xmm6, xmm0
-    pshufb     xmm6, xmm3
-    paddusw    xmm1, xmm6
-    pshufb     xmm0, xmm4
-    paddusw    xmm1, xmm0
-
-    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
-    packuswb   xmm1, xmm1
-
-    movd       [edx], xmm1           // write 6 pixels
-    psrlq      xmm1, 16
-    movd       [edx + 2], xmm1
-    lea        edx, [edx + 6]
-    sub        ecx, 6
-    jg         xloop
-
-    pop        esi
-    ret
-  }
-}
-
-// Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_ptr
-    mov        edx, [esp + 8]   // dst_ptr
-    mov        ecx, [esp + 12]  // src_width
-    pxor       xmm5, xmm5
-
-  // sum rows
-  xloop:
-    movdqu     xmm3, [eax]       // read 16 bytes
-    lea        eax, [eax + 16]
-    movdqu     xmm0, [edx]       // read 16 words from destination
-    movdqu     xmm1, [edx + 16]
-    movdqa     xmm2, xmm3
-    punpcklbw  xmm2, xmm5
-    punpckhbw  xmm3, xmm5
-    paddusw    xmm0, xmm2        // sum 16 words
-    paddusw    xmm1, xmm3
-    movdqu     [edx], xmm0       // write 16 words to destination
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 16
-    jg         xloop
-    ret
-  }
-}
-
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  __asm {
-    mov         eax, [esp + 4]   // src_ptr
-    mov         edx, [esp + 8]   // dst_ptr
-    mov         ecx, [esp + 12]  // src_width
-    vpxor       ymm5, ymm5, ymm5
-
-  // sum rows
-  xloop:
-    vmovdqu     ymm3, [eax]       // read 32 bytes
-    lea         eax, [eax + 32]
-    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
-    vpunpcklbw  ymm2, ymm3, ymm5
-    vpunpckhbw  ymm3, ymm3, ymm5
-    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
-    vpaddusw    ymm1, ymm3, [edx + 32]
-    vmovdqu     [edx], ymm0       // write 32 words to destination
-    vmovdqu     [edx + 32], ymm1
-    lea         edx, [edx + 64]
-    sub         ecx, 32
-    jg          xloop
-
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_SCALEADDROW_AVX2
-
-// Bilinear column filtering. SSSE3 version.
-__declspec(naked)
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    mov        edi, [esp + 12 + 4]    // dst_ptr
-    mov        esi, [esp + 12 + 8]    // src_ptr
-    mov        ecx, [esp + 12 + 12]   // dst_width
-    movd       xmm2, [esp + 12 + 16]  // x
-    movd       xmm3, [esp + 12 + 20]  // dx
-    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
-    movd       xmm5, eax
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
-    psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
-    sub        ecx, 2
-    jl         xloop29
-
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
-    paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
-
-    // 2 Pixel loop.
-  xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
-    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
-    movd       xmm0, ebx
-    psrlw      xmm1, 9              // 7 bit fractions.
-    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
-    movd       xmm4, ebx
-    pshufb     xmm1, xmm5           // 0011
-    punpcklwd  xmm0, xmm4
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
-    movd       ebx, xmm0
-    mov        [edi], bx
-    lea        edi, [edi + 2]
-    sub        ecx, 2               // 2 pixels
-    jge        xloop2
-
- xloop29:
-
-    add        ecx, 2 - 1
-    jl         xloop99
-
-    // 1 pixel remainder
-    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
-    movd       xmm0, ebx
-    psrlw      xmm2, 9              // 7 bit fractions.
-    pshufb     xmm2, xmm5           // 0011
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // 16 bit
-    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // 8 bits
-    movd       ebx, xmm0
-    mov        [edi], bl
-
- xloop99:
-
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
-// Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked)
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
-  __asm {
-    mov        edx, [esp + 4]    // dst_ptr
-    mov        eax, [esp + 8]    // src_ptr
-    mov        ecx, [esp + 12]   // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    lea        eax,  [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0
-    punpckhbw  xmm1, xmm1
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 32
-    jg         wloop
-
-    ret
-  }
-}
-
-// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked)
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    shufps     xmm0, xmm1, 0xdd
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 8x1 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
-    pavgb      xmm0, xmm2
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 8x2 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_argb
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_argb
-    mov        ecx, [esp + 4 + 16]   // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + esi]
-    movdqu     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
-    pavgb      xmm0, xmm2
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         wloop
-
-    pop        esi
-    ret
-  }
-}
-
-// Reads 4 pixels at a time.
-__declspec(naked)
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
-  __asm {
-    push       ebx
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_argb
-                                     // src_stride ignored
-    mov        ebx, [esp + 8 + 12]   // src_stepx
-    mov        edx, [esp + 8 + 16]   // dst_argb
-    mov        ecx, [esp + 8 + 20]   // dst_width
-    lea        ebx, [ebx * 4]
-    lea        edi, [ebx + ebx * 2]
-
-  wloop:
-    movd       xmm0, [eax]
-    movd       xmm1, [eax + ebx]
-    punpckldq  xmm0, xmm1
-    movd       xmm2, [eax + ebx * 2]
-    movd       xmm3, [eax + edi]
-    lea        eax,  [eax + ebx * 4]
-    punpckldq  xmm2, xmm3
-    punpcklqdq xmm0, xmm2
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         wloop
-
-    pop        edi
-    pop        ebx
-    ret
-  }
-}
-
-// Blends four 2x2 to 4x1.
-__declspec(naked)
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]    // src_argb
-    mov        esi, [esp + 12 + 8]    // src_stride
-    mov        ebx, [esp + 12 + 12]   // src_stepx
-    mov        edx, [esp + 12 + 16]   // dst_argb
-    mov        ecx, [esp + 12 + 20]   // dst_width
-    lea        esi, [eax + esi]       // row1 pointer
-    lea        ebx, [ebx * 4]
-    lea        edi, [ebx + ebx * 2]
-
-  wloop:
-    movq       xmm0, qword ptr [eax]  // row0 4 pairs
-    movhps     xmm0, qword ptr [eax + ebx]
-    movq       xmm1, qword ptr [eax + ebx * 2]
-    movhps     xmm1, qword ptr [eax + edi]
-    lea        eax,  [eax + ebx * 4]
-    movq       xmm2, qword ptr [esi]  // row1 4 pairs
-    movhps     xmm2, qword ptr [esi + ebx]
-    movq       xmm3, qword ptr [esi + ebx * 2]
-    movhps     xmm3, qword ptr [esi + edi]
-    lea        esi,  [esi + ebx * 4]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
-    pavgb      xmm0, xmm2
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         wloop
-
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
-// Column scaling unfiltered. SSE2 version.
-__declspec(naked)
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  __asm {
-    push       edi
-    push       esi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
-    movd       xmm2, [esp + 8 + 16]  // x
-    movd       xmm3, [esp + 8 + 20]  // dx
-
-    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
-    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
-    paddd      xmm2, xmm0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
-    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
-    paddd      xmm2, xmm0            // x3 x2 x1 x0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
-    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
-
-    pextrw     eax, xmm2, 1          // get x0 integer.
-    pextrw     edx, xmm2, 3          // get x1 integer.
-
-    cmp        ecx, 0
-    jle        xloop99
-    sub        ecx, 4
-    jl         xloop49
-
-    // 4 Pixel loop.
- xloop4:
-    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
-    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    pextrw     edx, xmm2, 7           // get x3 integer.
-    paddd      xmm2, xmm3             // x += dx
-    punpckldq  xmm0, xmm1             // x0 x1
-
-    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
-    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
-    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
-    punpckldq  xmm1, xmm4             // x2 x3
-    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
-    movdqu     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 4                 // 4 pixels
-    jge        xloop4
-
- xloop49:
-    test       ecx, 2
-    je         xloop29
-
-    // 2 Pixels.
-    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
-    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    punpckldq  xmm0, xmm1             // x0 x1
-
-    movq       qword ptr [edi], xmm0
-    lea        edi, [edi + 8]
-
- xloop29:
-    test       ecx, 1
-    je         xloop99
-
-    // 1 Pixels.
-    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
-    movd       dword ptr [edi], xmm0
- xloop99:
-
-    pop        esi
-    pop        edi
-    ret
-  }
-}
-
-// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-__declspec(naked)
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
-    movd       xmm2, [esp + 8 + 16]  // x
-    movd       xmm3, [esp + 8 + 20]  // dx
-    movdqa     xmm4, xmmword ptr kShuffleColARGB
-    movdqa     xmm5, xmmword ptr kShuffleFractions
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
-    psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
-    sub        ecx, 2
-    jl         xloop29
-
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
-    paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
-
-    // 2 Pixel loop.
-  xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
-    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    psrlw      xmm1, 9              // 7 bit fractions.
-    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
-    pshufb     xmm1, xmm5           // 0000000011111111
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
-    movq       qword ptr [edi], xmm0
-    lea        edi, [edi + 8]
-    sub        ecx, 2               // 2 pixels
-    jge        xloop2
-
- xloop29:
-
-    add        ecx, 2 - 1
-    jl         xloop99
-
-    // 1 pixel remainder
-    psrlw      xmm2, 9              // 7 bit fractions.
-    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    pshufb     xmm2, xmm5           // 00000000
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
-    psrlw      xmm0, 7
-    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
-    movd       [edi], xmm0
-
- xloop99:
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked)
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  __asm {
-    mov        edx, [esp + 4]    // dst_argb
-    mov        eax, [esp + 8]    // src_argb
-    mov        ecx, [esp + 12]   // dst_width
-
-  wloop:
-    movdqu     xmm0, [eax]
-    lea        eax,  [eax + 16]
-    movdqa     xmm1, xmm0
-    punpckldq  xmm0, xmm0
-    punpckhdq  xmm1, xmm1
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         wloop
-
-    ret
-  }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv_X86(int num, int div) {
-  __asm {
-    mov        eax, [esp + 4]    // num
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
-    shl        eax, 16
-    idiv       dword ptr [esp + 8]
-    ret
-  }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv1_X86(int num, int div) {
-  __asm {
-    mov        eax, [esp + 4]    // num
-    mov        ecx, [esp + 8]    // denom
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
-    shl        eax, 16
-    sub        eax, 0x00010001
-    sbb        edx, 0
-    sub        ecx, 1
-    idiv       ecx
-    ret
-  }
-}
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/android/src/main/libenc/jni/libyuv/jni/source/video_common.cc b/android/src/main/libenc/jni/libyuv/jni/source/video_common.cc
deleted file mode 100755
index 379a066..0000000
--- a/android/src/main/libenc/jni/libyuv/jni/source/video_common.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
-struct FourCCAliasEntry {
-  uint32 alias;
-  uint32 canonical;
-};
-
-static const struct FourCCAliasEntry kFourCCAliases[] = {
-  {FOURCC_IYUV, FOURCC_I420},
-  {FOURCC_YU16, FOURCC_I422},
-  {FOURCC_YU24, FOURCC_I444},
-  {FOURCC_YUYV, FOURCC_YUY2},
-  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
-  {FOURCC_HDYC, FOURCC_UYVY},
-  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
-  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
-  {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
-  {FOURCC_RGB3, FOURCC_RAW },
-  {FOURCC_BGR3, FOURCC_24BG},
-  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
-  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
-  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
-  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
-  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
-};
-// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
-//  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
-
-LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
-  int i;
-  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
-    if (kFourCCAliases[i].alias == fourcc) {
-      return kFourCCAliases[i].canonical;
-    }
-  }
-  // Not an alias, so return it as-is.
-  return fourcc;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
diff --git a/android/src/main/res/raw/amaro.glsl b/android/src/main/res/raw/amaro.glsl
deleted file mode 100755
index b510316..0000000
--- a/android/src/main/res/raw/amaro.glsl
+++ /dev/null
@@ -1,32 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //blowout;
-uniform sampler2D inputImageTexture3; //overlay;
-uniform sampler2D inputImageTexture4; //map
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec4 texel = texture2D(inputImageTexture, textureCoordinate);
-    vec3 bbTexel = texture2D(inputImageTexture2, textureCoordinate).rgb;
-
-    texel.r = texture2D(inputImageTexture3, vec2(bbTexel.r, texel.r)).r;
-    texel.g = texture2D(inputImageTexture3, vec2(bbTexel.g, texel.g)).g;
-    texel.b = texture2D(inputImageTexture3, vec2(bbTexel.b, texel.b)).b;
-
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture4, vec2(texel.r, .16666)).r;
-    mapped.g = texture2D(inputImageTexture4, vec2(texel.g, .5)).g;
-    mapped.b = texture2D(inputImageTexture4, vec2(texel.b, .83333)).b;
-    mapped.a = 1.0;
-
-    mapped.rgb = mix(originColor.rgb, mapped.rgb, strength);
-     gl_FragColor = mapped;
-}
diff --git a/android/src/main/res/raw/antique.glsl b/android/src/main/res/raw/antique.glsl
deleted file mode 100755
index 3a05170..0000000
--- a/android/src/main/res/raw/antique.glsl
+++ /dev/null
@@ -1,53 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-
-void main()
-{
-	highp vec4 textureColor;
-	highp vec4 textureColorRes;
-	highp float satVal = 65.0 / 100.0;
-	
-	float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y;
-	
-	highp float redCurveValue;
-	highp float greenCurveValue;
-	highp float blueCurveValue;
-	
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate));
-	textureColorRes = textureColor;
-	
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g;
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b;
-	
-	highp float G = (redCurveValue + greenCurveValue + blueCurveValue);
-	G = G / 3.0;
-	
-	redCurveValue = ((1.0 - satVal) * G + satVal * redCurveValue);
-	greenCurveValue = ((1.0 - satVal) * G + satVal * greenCurveValue);
-	blueCurveValue = ((1.0 - satVal) * G + satVal * blueCurveValue);
-	redCurveValue = (((redCurveValue) > (1.0)) ? (1.0) : (((redCurveValue) < (0.0)) ? (0.0) : (redCurveValue)));
-	greenCurveValue = (((greenCurveValue) > (1.0)) ? (1.0) : (((greenCurveValue) < (0.0)) ? (0.0) : (greenCurveValue)));
-	blueCurveValue = (((blueCurveValue) > (1.0)) ? (1.0) : (((blueCurveValue) < (0.0)) ? (0.0) : (blueCurveValue)));
-	
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 0.0)).a;
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 0.0)).a;
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 0.0)).a; 
-	
-	highp vec4 base = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-	highp vec4 overlayer = vec4(250.0/255.0, 227.0/255.0, 193.0/255.0, 1.0);
-	
-	textureColor = overlayer * base;
-	base = (textureColor - base) * 0.850980 + base;
-	textureColor = base; 
-	
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0);
-}
-  
\ No newline at end of file
diff --git a/android/src/main/res/raw/beauty.glsl b/android/src/main/res/raw/beauty.glsl
deleted file mode 100755
index 1a32a63..0000000
--- a/android/src/main/res/raw/beauty.glsl
+++ /dev/null
@@ -1,111 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-uniform samplerExternalOES inputImageTexture;
-uniform vec2 singleStepOffset;
-
-varying vec2 textureCoordinate;
-
-const vec4 params = vec4(0.748, 0.874, 0.241, 0.241);
-const vec3 W = vec3(0.299,0.587,0.114);
-const mat3 saturateMatrix = mat3(
-                                1.1102,-0.0598,-0.061,
-                                -0.0774,1.0826,-0.1186,
-                                -0.0228,-0.0228,1.1772);
-
-vec2 blurCoordinates[24];
-
-float hardLight(float color) {
-    if(color <= 0.5) {
-        color = color * color * 2.0;
-    } else {
-        color = 1.0 - ((1.0 - color)*(1.0 - color) * 2.0);
-    }
-    return color;
-}
-
-void main() {
-    vec3 centralColor = texture2D(inputImageTexture, textureCoordinate).rgb;
-
-    blurCoordinates[0] = textureCoordinate.xy + singleStepOffset * vec2(0.0, -10.0);
-    blurCoordinates[1] = textureCoordinate.xy + singleStepOffset * vec2(0.0, 10.0);
-    blurCoordinates[2] = textureCoordinate.xy + singleStepOffset * vec2(-10.0, 0.0);
-    blurCoordinates[3] = textureCoordinate.xy + singleStepOffset * vec2(10.0, 0.0);
-    blurCoordinates[4] = textureCoordinate.xy + singleStepOffset * vec2(5.0, -8.0);
-    blurCoordinates[5] = textureCoordinate.xy + singleStepOffset * vec2(5.0, 8.0);
-    blurCoordinates[6] = textureCoordinate.xy + singleStepOffset * vec2(-5.0, 8.0);
-    blurCoordinates[7] = textureCoordinate.xy + singleStepOffset * vec2(-5.0, -8.0);
-    blurCoordinates[8] = textureCoordinate.xy + singleStepOffset * vec2(8.0, -5.0);
-    blurCoordinates[9] = textureCoordinate.xy + singleStepOffset * vec2(8.0, 5.0);
-    blurCoordinates[10] = textureCoordinate.xy + singleStepOffset * vec2(-8.0, 5.0);
-    blurCoordinates[11] = textureCoordinate.xy + singleStepOffset * vec2(-8.0, -5.0);
-    blurCoordinates[12] = textureCoordinate.xy + singleStepOffset * vec2(0.0, -6.0);
-    blurCoordinates[13] = textureCoordinate.xy + singleStepOffset * vec2(0.0, 6.0);
-    blurCoordinates[14] = textureCoordinate.xy + singleStepOffset * vec2(6.0, 0.0);
-    blurCoordinates[15] = textureCoordinate.xy + singleStepOffset * vec2(-6.0, 0.0);
-    blurCoordinates[16] = textureCoordinate.xy + singleStepOffset * vec2(-4.0, -4.0);
-    blurCoordinates[17] = textureCoordinate.xy + singleStepOffset * vec2(-4.0, 4.0);
-    blurCoordinates[18] = textureCoordinate.xy + singleStepOffset * vec2(4.0, -4.0);
-    blurCoordinates[19] = textureCoordinate.xy + singleStepOffset * vec2(4.0, 4.0);
-    blurCoordinates[20] = textureCoordinate.xy + singleStepOffset * vec2(-2.0, -2.0);
-    blurCoordinates[21] = textureCoordinate.xy + singleStepOffset * vec2(-2.0, 2.0);
-    blurCoordinates[22] = textureCoordinate.xy + singleStepOffset * vec2(2.0, -2.0);
-    blurCoordinates[23] = textureCoordinate.xy + singleStepOffset * vec2(2.0, 2.0);
-
-    float sampleColor = centralColor.g * 22.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[0]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[1]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[2]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[3]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[4]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[5]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[6]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[7]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[8]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[9]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[10]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[11]).g;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[12]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[13]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[14]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[15]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[16]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[17]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[18]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[19]).g * 2.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[20]).g * 3.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[21]).g * 3.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[22]).g * 3.0;
-    sampleColor += texture2D(inputImageTexture, blurCoordinates[23]).g * 3.0;
-    sampleColor = sampleColor / 62.0;
-
-    float highPass = centralColor.g - sampleColor + 0.5;
-
-    for(int i = 0; i < 5;i++)
-    {
-        highPass = hardLight(highPass);
-    }
-    float luminance = dot(centralColor, W);
-    float alpha = pow(luminance, params.r);
-
-    vec3 smoothColor = centralColor + (centralColor-vec3(highPass))*alpha*0.1;
-
-    smoothColor.r = clamp(pow(smoothColor.r, params.g),0.0,1.0);
-    smoothColor.g = clamp(pow(smoothColor.g, params.g),0.0,1.0);
-    smoothColor.b = clamp(pow(smoothColor.b, params.g),0.0,1.0);
-
-    vec3 screen = vec3(1.0) - (vec3(1.0)-smoothColor) * (vec3(1.0)-centralColor);
-    vec3 lighten = max(smoothColor, centralColor);
-    vec3 softLight = 2.0 * centralColor*smoothColor + centralColor*centralColor
-                     - 2.0 * centralColor*centralColor * smoothColor;
-
-    gl_FragColor = vec4(mix(centralColor, screen, alpha), 1.0);
-    gl_FragColor.rgb = mix(gl_FragColor.rgb, lighten, alpha);
-    gl_FragColor.rgb = mix(gl_FragColor.rgb, softLight, params.b);
-
-    vec3 satColor = gl_FragColor.rgb * saturateMatrix;
-    gl_FragColor.rgb = mix(gl_FragColor.rgb, satColor, params.a);
-
-    gl_FragColor.rgb = vec3(gl_FragColor.rgb + vec3(-0.096));
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/blackcat.glsl b/android/src/main/res/raw/blackcat.glsl
deleted file mode 100755
index 36697e4..0000000
--- a/android/src/main/res/raw/blackcat.glsl
+++ /dev/null
@@ -1,92 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-
-vec3 rgb2hsv(vec3 c)
-{ 
-    vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0); 
-    vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g)); 
-    vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r)); 
-    
-    float d = q.x - min(q.w, q.y); 
-    float e = 1.0e-10; 
-    return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x); 
-} 
-
-vec3 hsv2rgb(vec3 c) { 
-    vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0); 
-    vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www); 
-    return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y); 
-} 
-
-void main() {
-    float GreyVal; 
-    lowp vec4 textureColor; 
-    lowp vec4 textureColorOri; 
-    float xCoordinate = textureCoordinate.x; 
-    float yCoordinate = textureCoordinate.y; 
-
-    highp float redCurveValue; 
-    highp float greenCurveValue; 
-    highp float blueCurveValue; 
-    textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-    // step1 curve 
-    redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-    greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-    blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-
-
-    //textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-    vec3 tColor = vec3(redCurveValue, greenCurveValue, blueCurveValue); 
-    tColor = rgb2hsv(tColor);
-
-    tColor.g = tColor.g * 1.2; 
-
-    float dStrength = 1.0; 
-    float dSatStrength = 0.3; 
-
-    float dGap = 0.0; 
-
-    if( tColor.r >= 0.0 && tColor.r < 0.417) 
-    { 
-        tColor.g = tColor.g + (tColor.g * dSatStrength); 
-    } 
-    else if( tColor.r > 0.958 && tColor.r <= 1.0) 
-    { 
-        tColor.g = tColor.g + (tColor.g * dSatStrength);
-    } 
-    else if( tColor.r >= 0.875 && tColor.r <= 0.958) 
-    { 
-        dGap = abs(tColor.r - 0.875); 
-        dStrength = (dGap / 0.0833); 
-        
-        tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength);
-    } 
-    else if( tColor.r >= 0.0417 && tColor.r <= 0.125) 
-    { 
-        dGap = abs(tColor.r - 0.125); 
-        dStrength = (dGap / 0.0833);
-
-        tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-    } 
-
-    tColor = hsv2rgb(tColor); 
-    tColor = clamp(tColor, 0.0, 1.0); 
-
-    redCurveValue = texture2D(curve, vec2(tColor.r, 1.0)).r; 
-    greenCurveValue = texture2D(curve, vec2(tColor.g, 1.0)).r; 
-    blueCurveValue = texture2D(curve, vec2(tColor.b, 1.0)).r; 
-
-    redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).g; 
-    greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).g; 
-    blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).g; 
-
-    textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-
-    gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/brannan.glsl b/android/src/main/res/raw/brannan.glsl
deleted file mode 100755
index 87c1328..0000000
--- a/android/src/main/res/raw/brannan.glsl
+++ /dev/null
@@ -1,73 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;  //process
-uniform sampler2D inputImageTexture3;  //blowout
-uniform sampler2D inputImageTexture4;  //contrast
-uniform sampler2D inputImageTexture5;  //luma
-uniform sampler2D inputImageTexture6;  //screen
-
-mat3 saturateMatrix = mat3(
-                           1.105150, -0.044850,-0.046000,
-                           -0.088050,1.061950,-0.089200,
-                           -0.017100,-0.017100,1.132900);
-
-vec3 luma = vec3(.3, .59, .11);
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-
-    vec2 lookup;
-    lookup.y = 0.5;
-    lookup.x = texel.r;
-    texel.r = texture2D(inputImageTexture2, lookup).r;
-    lookup.x = texel.g;
-    texel.g = texture2D(inputImageTexture2, lookup).g;
-    lookup.x = texel.b;
-    texel.b = texture2D(inputImageTexture2, lookup).b;
-
-    texel = saturateMatrix * texel;
-
-    vec2 tc = (2.0 * textureCoordinate) - 1.0;
-    float d = dot(tc, tc);
-    vec3 sampled;
-    lookup.y = 0.5;
-    lookup.x = texel.r;
-    sampled.r = texture2D(inputImageTexture3, lookup).r;
-    lookup.x = texel.g;
-    sampled.g = texture2D(inputImageTexture3, lookup).g;
-    lookup.x = texel.b;
-    sampled.b = texture2D(inputImageTexture3, lookup).b;
-    float value = smoothstep(0.0, 1.0, d);
-    texel = mix(sampled, texel, value);
-
-    lookup.x = texel.r;
-    texel.r = texture2D(inputImageTexture4, lookup).r;
-    lookup.x = texel.g;
-    texel.g = texture2D(inputImageTexture4, lookup).g;
-    lookup.x = texel.b;
-    texel.b = texture2D(inputImageTexture4, lookup).b;
-
-
-    lookup.x = dot(texel, luma);
-    texel = mix(texture2D(inputImageTexture5, lookup).rgb, texel, .5);
-
-    lookup.x = texel.r;
-    texel.r = texture2D(inputImageTexture6, lookup).r;
-    lookup.x = texel.g;
-    texel.g = texture2D(inputImageTexture6, lookup).g;
-    lookup.x = texel.b;
-    texel.b = texture2D(inputImageTexture6, lookup).b;
-
-    texel = mix(originColor.rgb, texel.rgb, strength);
-
-    gl_FragColor = vec4(texel, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/brightness.glsl b/android/src/main/res/raw/brightness.glsl
deleted file mode 100755
index 82dce92..0000000
--- a/android/src/main/res/raw/brightness.glsl
+++ /dev/null
@@ -1,9 +0,0 @@
-varying highp vec2 textureCoordinate;
-
-uniform sampler2D inputImageTexture;
-uniform lowp float brightness;
-
-void main() {
-    lowp vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);
-    gl_FragColor = vec4((textureColor.rgb + vec3(brightness)), textureColor.w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/brooklyn.glsl b/android/src/main/res/raw/brooklyn.glsl
deleted file mode 100755
index 44951e4..0000000
--- a/android/src/main/res/raw/brooklyn.glsl
+++ /dev/null
@@ -1,148 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-uniform sampler2D inputImageTexture3;
-uniform sampler2D inputImageTexture4;
-
-uniform float strength;
- // gray
-float NCGray(vec4 color)
-{
-    float gray = 0.2125 * color.r + 0.7154 * color.g + 0.0721 * color.b;
-    return gray;
-}
- 
-// tone mapping
-vec4 NCTonemapping(vec4 color)
-{
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture2, vec2(color.r, 0.0)).r;
-    mapped.g = texture2D(inputImageTexture2, vec2(color.g, 0.0)).g;
-    mapped.b = texture2D(inputImageTexture2, vec2(color.b, 0.0)).b;
-    mapped.a = color.a;
-    
-    return mapped;
-}
- 
-// color control
-vec4 NCColorControl(vec4 color, float saturation, float brightness, float contrast)
-{
-    float gray = NCGray(color);
-    
-    color.rgb = vec3(saturation) * color.rgb + vec3(1.0-saturation) * vec3(gray);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    color.rgb = vec3(contrast) * (color.rgb - vec3(0.5)) + vec3(0.5);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    color.rgb = color.rgb + vec3(brightness);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    return color;
-}
- 
-// hue adjust
-vec4 NCHueAdjust(vec4 color, float hueAdjust)
-{
-    vec3 kRGBToYPrime = vec3(0.299, 0.587, 0.114);
-    vec3 kRGBToI = vec3(0.595716, -0.274453, -0.321263);
-    vec3 kRGBToQ = vec3(0.211456, -0.522591, 0.31135);
-    
-    vec3 kYIQToR   = vec3(1.0, 0.9563, 0.6210);
-    vec3 kYIQToG   = vec3(1.0, -0.2721, -0.6474);
-    vec3 kYIQToB   = vec3(1.0, -1.1070, 1.7046);
-    
-    float yPrime = dot(color.rgb, kRGBToYPrime);
-    float I = dot(color.rgb, kRGBToI);
-    float Q = dot(color.rgb, kRGBToQ);
-    
-    float hue = atan(Q, I);
-    float chroma  = sqrt (I * I + Q * Q);
-    
-    hue -= hueAdjust;
-    
-    Q = chroma * sin (hue);
-    I = chroma * cos (hue);
-    
-    color.r = dot(vec3(yPrime, I, Q), kYIQToR);
-    color.g = dot(vec3(yPrime, I, Q), kYIQToG);
-    color.b = dot(vec3(yPrime, I, Q), kYIQToB);
-    
-    return color;
-}
- 
-// colorMatrix
-vec4 NCColorMatrix(vec4 color, float red, float green, float blue, float alpha, vec4 bias)
-{
-    color = color * vec4(red, green, blue, alpha) + bias;
-    return color;
-}
- 
-// multiply blend
-vec4 NCMultiplyBlend(vec4 overlay, vec4 base)
-{
-    vec4 outputColor;
-    
-    float a = overlay.a + base.a * (1.0 - overlay.a);
-    
-    //    // normal blend
-    //    outputColor.r = (base.r * base.a + overlay.r * overlay.a * (1.0 - base.a))/a;
-    //    outputColor.g = (base.g * base.a + overlay.g * overlay.a * (1.0 - base.a))/a;
-    //    outputColor.b = (base.b * base.a + overlay.b * overlay.a * (1.0 - base.a))/a;
-    
-    
-    // multiply blend
-    outputColor.rgb = ((1.0-base.a) * overlay.rgb * overlay.a + (1.0-overlay.a) * base.rgb * base.a + overlay.a * base.a * overlay.rgb * base.rgb) / a;
-    
-    
-    outputColor.a = a;
-    
-    return outputColor;
-}
- 
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec4 color = texture2D(inputImageTexture, textureCoordinate);
-    
-    color.a = 1.0;
-    
-    // tone mapping
-    color.r = texture2D(inputImageTexture2, vec2(color.r, 0.0)).r;
-    color.g = texture2D(inputImageTexture2, vec2(color.g, 0.0)).g;
-    color.b = texture2D(inputImageTexture2, vec2(color.b, 0.0)).b;
-    
-    // color control
-    color = NCColorControl(color, 0.88, 0.03, 0.85);
-    
-    // hue adjust
-    color = NCHueAdjust(color, -0.0444);
-    
-    // normal blend
-    vec4 bg = vec4(0.5647, 0.1961, 0.0157, 0.14);
-    color = NCMultiplyBlend(bg, color);
-    
-    // normal blend
-    vec4 bg2 = texture2D(inputImageTexture3, textureCoordinate);
-    bg2.a *= 0.9;
-    color = NCMultiplyBlend(bg2, color);
-    
-    // tone mapping
-    color.r = texture2D(inputImageTexture4, vec2(color.r, 0.0)).r;
-    color.g = texture2D(inputImageTexture4, vec2(color.g, 0.0)).g;
-    color.b = texture2D(inputImageTexture4, vec2(color.b, 0.0)).b;
-    
-    color.rgb = mix(originColor.rgb, color.rgb, strength);
-    gl_FragColor = color;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/calm.glsl b/android/src/main/res/raw/calm.glsl
deleted file mode 100755
index 5704684..0000000
--- a/android/src/main/res/raw/calm.glsl
+++ /dev/null
@@ -1,72 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D grey1Frame; 
-uniform sampler2D grey2Frame;
-uniform sampler2D curve;
-
-const mediump vec3 luminanceWeighting = vec3(0.2125, 0.7154, 0.0721);
-
-void main()
-{
-	lowp float satura = 0.5;
-	float GreyVal;
-	lowp vec4 textureColor;
-	lowp vec4 textureColorRes;
-	
-	highp float redCurveValue;
-	highp float greenCurveValue;
-	highp float blueCurveValue;
-	
-	vec4 grey1Color;
-	vec4 grey2Color;
-	
-	float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y;
-	
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate));
-	textureColorRes = textureColor; 
-	
-	grey1Color = texture2D(grey1Frame, vec2(xCoordinate, yCoordinate));
-	grey2Color = texture2D(grey2Frame, vec2(xCoordinate, yCoordinate));
-	
-	// step 1. saturation 
-	lowp float luminance = dot(textureColor.rgb, luminanceWeighting);
-	lowp vec3 greyScaleColor = vec3(luminance);
-	
-	textureColor = vec4(mix(greyScaleColor, textureColor.rgb, satura), textureColor.w); 
-	
-	// step 2. level, blur curve, rgb curve
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r;
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0/2.0)).r;
-	
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g;
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0/2.0)).g;
-	
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b;
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0/2.0)).b;
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0/2.0)).g;
-	
-	lowp vec4 base = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-	
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).r;
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).r;
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).r;
-	lowp vec4 overlayer = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-	//gl_FragColor = base * (1.0 - grey1Color.r) + overlayer * grey1Color.r;
-	base = (base - overlayer) * (1.0 - grey1Color.r) + overlayer;
-	
-	redCurveValue = texture2D(curve, vec2(base.r, 1.0)).g;
-	greenCurveValue = texture2D(curve, vec2(base.g, 1.0)).g;
-	blueCurveValue = texture2D(curve, vec2(base.b, 1.0)).g;
-	overlayer = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-	
-	textureColor = (base - overlayer) * (1.0 - grey2Color.r) + overlayer;
-	//base * (grey2Color.r) + overlayer * (1.0 - grey2Color.r);
-	
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/constrast.glsl b/android/src/main/res/raw/constrast.glsl
deleted file mode 100755
index f40b4a3..0000000
--- a/android/src/main/res/raw/constrast.glsl
+++ /dev/null
@@ -1,9 +0,0 @@
-varying highp vec2 textureCoordinate;
-
-uniform sampler2D inputImageTexture;
-uniform lowp float contrast;
-
-void main() {
-    lowp vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);
-    gl_FragColor = vec4(((textureColor.rgb - vec3(0.5)) * contrast + vec3(0.5)), textureColor.w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/cool.glsl b/android/src/main/res/raw/cool.glsl
deleted file mode 100755
index 8a31bcd..0000000
--- a/android/src/main/res/raw/cool.glsl
+++ /dev/null
@@ -1,44 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-
-void main() {
-	lowp vec4 textureColor;
-	lowp vec4 textureColorOri;
-	
-	float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y;
-	
-	highp float redCurveValue;
-	highp float greenCurveValue;
-	highp float blueCurveValue;
-	
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate));
-	textureColorOri = textureColor;
-	// step1 curve 
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r;
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g;
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b;
-	// step2 level
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 0.0)).a;
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 0.0)).a;
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 0.0)).a;
-	// step3 brightness/constrast adjust 
-	redCurveValue = redCurveValue * 1.25 - 0.12549;
-	greenCurveValue = greenCurveValue * 1.25 - 0.12549; 
-	blueCurveValue = blueCurveValue * 1.25 - 0.12549;
-	//redCurveValue = (((redCurveValue) > (1.0)) ? (1.0) : (((redCurveValue) < (0.0)) ? (0.0) : (redCurveValue)));
-	//greenCurveValue = (((greenCurveValue) > (1.0)) ? (1.0) : (((greenCurveValue) < (0.0)) ? (0.0) : (greenCurveValue)));
-	//blueCurveValue = (((blueCurveValue) > (1.0)) ? (1.0) : (((blueCurveValue) < (0.0)) ? (0.0) : (blueCurveValue)));
-	// step4 normal blending with original
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-	textureColor = (textureColorOri - textureColor) * 0.549 + textureColor;
-	
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0);
-} 
-  
\ No newline at end of file
diff --git a/android/src/main/res/raw/crayon.glsl b/android/src/main/res/raw/crayon.glsl
deleted file mode 100755
index b9564b2..0000000
--- a/android/src/main/res/raw/crayon.glsl
+++ /dev/null
@@ -1,56 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform vec2 singleStepOffset; 
-uniform float strength;
-
-const highp vec3 W = vec3(0.299,0.587,0.114);
-
-const mat3 rgb2yiqMatrix = mat3(
-		0.299, 0.587, 0.114,
-		0.596,-0.275,-0.321,
-		0.212,-0.523, 0.311);
-
-const mat3 yiq2rgbMatrix = mat3(
-		1.0, 0.956, 0.621,
-		1.0,-0.272,-1.703,
-		1.0,-1.106, 0.0);
-
-
-void main()
-{ 
-	vec4 oralColor = texture2D(inputImageTexture, textureCoordinate);
-
-	vec3 maxValue = vec3(0.,0.,0.);
-	
-	for(int i = -2; i<=2; i++)
-	{
-		for(int j = -2; j<=2; j++)
-		{
-			vec4 tempColor = texture2D(inputImageTexture, textureCoordinate+singleStepOffset*vec2(i,j));
-			maxValue.r = max(maxValue.r,tempColor.r);
-			maxValue.g = max(maxValue.g,tempColor.g);
-			maxValue.b = max(maxValue.b,tempColor.b);
-		}
-	}
-	
-	vec3 textureColor = oralColor.rgb / maxValue;
-	
-	float gray = dot(textureColor, W);
-	float k = 0.223529;
-	float alpha = min(gray,k)/k;
-	
-	textureColor = textureColor * alpha + (1.-alpha)*oralColor.rgb;
-	
-	vec3 yiqColor = textureColor * rgb2yiqMatrix;
-
-	yiqColor.r = max(0.0,min(1.0,pow(gray,strength)));
-
-	textureColor = yiqColor * yiq2rgbMatrix;
-	
-	gl_FragColor = vec4(textureColor, oralColor.w);
-} 
\ No newline at end of file
diff --git a/android/src/main/res/raw/earlybird.glsl b/android/src/main/res/raw/earlybird.glsl
deleted file mode 100755
index 9162d6b..0000000
--- a/android/src/main/res/raw/earlybird.glsl
+++ /dev/null
@@ -1,102 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //earlyBirdCurves
-uniform sampler2D inputImageTexture3; //earlyBirdOverlay
-uniform sampler2D inputImageTexture4; //vig
-uniform sampler2D inputImageTexture5; //earlyBirdBlowout
-uniform sampler2D inputImageTexture6; //earlyBirdMap
-
-const mat3 saturate = mat3(
-                           1.210300,
-                           -0.089700,
-                           -0.091000,
-                           -0.176100,
-                           1.123900,
-                           -0.177400,
-                           -0.034200,
-                           -0.034200,
-                           1.265800);
-const vec3 rgbPrime = vec3(0.25098, 0.14640522, 0.0);
-const vec3 desaturate = vec3(.3, .59, .11);
-
-void main()
-{
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-
-    vec2 lookup;
-    lookup.y = 0.5;
-
-    lookup.x = texel.r;
-    texel.r = texture2D(inputImageTexture2, lookup).r;
-
-    lookup.x = texel.g;
-    texel.g = texture2D(inputImageTexture2, lookup).g;
-
-    lookup.x = texel.b;
-    texel.b = texture2D(inputImageTexture2, lookup).b;
-
-    float desaturatedColor;
-    vec3 result;
-    desaturatedColor = dot(desaturate, texel);
-
-    lookup.x = desaturatedColor;
-    result.r = texture2D(inputImageTexture3, lookup).r;
-    lookup.x = desaturatedColor;
-    result.g = texture2D(inputImageTexture3, lookup).g;
-    lookup.x = desaturatedColor;
-    result.b = texture2D(inputImageTexture3, lookup).b;
-
-    texel = saturate * mix(texel, result, .5);
-
-    vec2 tc = (2.0 * textureCoordinate) - 1.0;
-    float d = dot(tc, tc);
-
-    vec3 sampled;
-    lookup.y = .5;
-
-    /*
-     lookup.x = texel.r;
-     sampled.r = texture2D(inputImageTexture4, lookup).r;
-
-     lookup.x = texel.g;
-     sampled.g = texture2D(inputImageTexture4, lookup).g;
-
-     lookup.x = texel.b;
-     sampled.b = texture2D(inputImageTexture4, lookup).b;
-
-     float value = smoothstep(0.0, 1.25, pow(d, 1.35)/1.65);
-     texel = mix(texel, sampled, value);
-    */
-
-    //---
-    lookup = vec2(d, texel.r);
-    texel.r = texture2D(inputImageTexture4, lookup).r;
-    lookup.y = texel.g;
-    texel.g = texture2D(inputImageTexture4, lookup).g;
-    lookup.y = texel.b;
-    texel.b	= texture2D(inputImageTexture4, lookup).b;
-    float value = smoothstep(0.0, 1.25, pow(d, 1.35)/1.65);
-
-    //---
-    lookup.x = texel.r;
-    sampled.r = texture2D(inputImageTexture5, lookup).r;
-    lookup.x = texel.g;
-    sampled.g = texture2D(inputImageTexture5, lookup).g;
-    lookup.x = texel.b;
-    sampled.b = texture2D(inputImageTexture5, lookup).b;
-    texel = mix(sampled, texel, value);
-
-    lookup.x = texel.r;
-    texel.r = texture2D(inputImageTexture6, lookup).r;
-    lookup.x = texel.g;
-    texel.g = texture2D(inputImageTexture6, lookup).g;
-    lookup.x = texel.b;
-    texel.b = texture2D(inputImageTexture6, lookup).b;
-
-    gl_FragColor = vec4(texel, 1.0);
-}
diff --git a/android/src/main/res/raw/emerald.glsl b/android/src/main/res/raw/emerald.glsl
deleted file mode 100755
index 8f54c08..0000000
--- a/android/src/main/res/raw/emerald.glsl
+++ /dev/null
@@ -1,89 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-
-vec3 RGBtoHSL(vec3 c) { 
-	vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0); 
-	vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g)); 
-	vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r)); 
-	
-	float d = q.x - min(q.w, q.y); 
-	float e = 1.0e-10; 
-	return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x); 
-} 
-
-vec3 HSLtoRGB(vec3 c) { 
-	vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0); 
-	vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www); 
-	return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y);
-} 
-
-void main() { 
-	float GreyVal;
-	highp vec4 textureColor; 
-	float xCoordinate = textureCoordinate.x; 
-	float yCoordinate = textureCoordinate.y;
-
-	highp float redCurveValue; 
-	highp float greenCurveValue; 
-    highp float blueCurveValue; 
-
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-	
-	// step1 curve 
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-	vec3 tColor = vec3(redCurveValue, greenCurveValue, blueCurveValue); 
-	tColor = RGBtoHSL(tColor); 
-	tColor = clamp(tColor, 0.0, 1.0); 
-
-	tColor.g = tColor.g * 1.5; 
-
-	float dStrength = 1.0; 
-	float dSatStrength = 0.15; 
-    float dHueStrength = 0.08; 
-
-	float dGap = 0.0; 
-
-	if( tColor.r >= 0.625 && tColor.r <= 0.708)
-	{ 
-		tColor.r = tColor.r - (tColor.r * dHueStrength); 
-        tColor.g = tColor.g + (tColor.g * dSatStrength); 		
-	} 
-	else if( tColor.r >= 0.542 && tColor.r < 0.625) 
-	{ 
-		dGap = abs(tColor.r - 0.542); 
-		dStrength = (dGap / 0.0833); 
-
-		tColor.r = tColor.r + (tColor.r * dHueStrength * dStrength); 
-		tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-	} 
-	else if( tColor.r > 0.708 && tColor.r <= 0.792)
-	{ 
-		dGap = abs(tColor.r - 0.792); 
-		dStrength = (dGap / 0.0833);
-
-		tColor.r = tColor.r + (tColor.r * dHueStrength * dStrength);
-		tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-	} 
-	
-	tColor = HSLtoRGB(tColor); 
-	tColor = clamp(tColor, 0.0, 1.0); 
-	
-	redCurveValue = texture2D(curve, vec2(tColor.r, 1.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(tColor.g, 1.0)).r;
-	blueCurveValue = texture2D(curve, vec2(tColor.b, 1.0)).r; 
-
-    redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).g; 
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).g; 
-
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-    gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
diff --git a/android/src/main/res/raw/evergreen.glsl b/android/src/main/res/raw/evergreen.glsl
deleted file mode 100755
index 02f3046..0000000
--- a/android/src/main/res/raw/evergreen.glsl
+++ /dev/null
@@ -1,84 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-
-vec3 RGBtoHSL(vec3 c) 
-{ 
-	vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0); 
-	vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g)); 
-	vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r));
-	
-	float d = q.x - min(q.w, q.y); 
-	float e = 1.0e-10; 
-	return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x);
-} 
-
-vec3 HSLtoRGB(vec3 c) 
-{ 
-	vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0); 
-	vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www); 
-	return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y); 
-} 
-
-void main()
-{ 
-	float GreyVal; 
-	lowp vec4 textureColor; 
-	float xCoordinate = textureCoordinate.x; 
-	float yCoordinate = textureCoordinate.y; 
-	
-	highp float redCurveValue; 
-	highp float greenCurveValue;
-	highp float blueCurveValue; 
-	
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-	
-	vec3 tColor = vec3(textureColor.r, textureColor.g, textureColor.b); 
-
-	tColor = RGBtoHSL(tColor); 
-	tColor = clamp(tColor, 0.0, 1.0); 
-
-	
-	tColor.g = tColor.g * 1.3; 
-	
-	float dStrength = 1.0; 
-	float dSatStrength = 0.5; 
-	float dGap = 0.0; 
-
-	
-	if( tColor.r >= 0.292 && tColor.r <= 0.375) 
-	{ 
-		tColor.g = tColor.g + (tColor.g * dSatStrength); 
-	} 
-	else if( tColor.r >= 0.208 && tColor.r < 0.292) 
-	{ 
-		dGap = abs(tColor.r - 0.208); 
-		dStrength = (dGap / 0.0833); 
-
-        tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-	} 
-	else if( tColor.r > 0.375 && tColor.r <= 0.458) 
-	{ 
-		dGap = abs(tColor.r - 0.458); 
-		dStrength = (dGap / 0.0833); 
-		
-		tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-	} 
-	tColor = HSLtoRGB(tColor);
-	tColor = clamp(tColor, 0.0, 1.0); 
-	
-    redCurveValue = texture2D(curve, vec2(tColor.r, 0.0)).b; 
-	greenCurveValue = texture2D(curve, vec2(tColor.g, 0.0)).b; 
-	blueCurveValue = texture2D(curve, vec2(tColor.b, 0.0)).b; 
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 0.0)).r; 
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 0.0)).g; 
-
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/exposure.glsl b/android/src/main/res/raw/exposure.glsl
deleted file mode 100755
index 1405753..0000000
--- a/android/src/main/res/raw/exposure.glsl
+++ /dev/null
@@ -1,9 +0,0 @@
-varying highp vec2 textureCoordinate;
-
-uniform sampler2D inputImageTexture;
-uniform highp float exposure;
-
-void main() {
-    highp vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);
-    gl_FragColor = vec4(textureColor.rgb * pow(2.0, exposure), textureColor.w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/fragment.glsl b/android/src/main/res/raw/fragment.glsl
deleted file mode 100755
index 2c6ed48..0000000
--- a/android/src/main/res/raw/fragment.glsl
+++ /dev/null
@@ -1,11 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-
-void main() {
-    gl_FragColor = texture2D(inputImageTexture, textureCoordinate);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/freud.glsl b/android/src/main/res/raw/freud.glsl
deleted file mode 100755
index bebf10c..0000000
--- a/android/src/main/res/raw/freud.glsl
+++ /dev/null
@@ -1,172 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-
-uniform float inputImageTextureHeight;
-uniform float inputImageTextureWidth;
-
-float texture2Size = 1024.0;
-
-uniform float strength;
-
-// gray
-float NCGray(vec4 color)
-{
-    float gray = 0.2125 * color.r + 0.7154 * color.g + 0.0721 * color.b;
-    return gray;
-}
-
-// tone mapping
-vec4 NCTonemapping(vec4 color)
-{
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture2, vec2(color.r, 0.0)).r;
-    mapped.g = texture2D(inputImageTexture2, vec2(color.g, 0.0)).g;
-    mapped.b = texture2D(inputImageTexture2, vec2(color.b, 0.0)).b;
-    mapped.a = color.a;
-    return mapped;
-}
- 
-// color control
-vec4 NCColorControl(vec4 color, float saturation, float brightness, float contrast)
-{
-    float gray = NCGray(color);
-    
-    color.rgb = vec3(saturation) * color.rgb + vec3(1.0-saturation) * vec3(gray);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    color.rgb = vec3(contrast) * (color.rgb - vec3(0.5)) + vec3(0.5);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    color.rgb = color.rgb + vec3(brightness);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    return color;
-}
- 
-// hue adjust
-vec4 NCHueAdjust(vec4 color, float hueAdjust)
-{
-    vec3 kRGBToYPrime = vec3(0.299, 0.587, 0.114);
-    vec3 kRGBToI = vec3(0.595716, -0.274453, -0.321263);
-    vec3 kRGBToQ = vec3(0.211456, -0.522591, 0.31135);
-    
-    vec3 kYIQToR   = vec3(1.0, 0.9563, 0.6210);
-    vec3 kYIQToG   = vec3(1.0, -0.2721, -0.6474);
-    vec3 kYIQToB   = vec3(1.0, -1.1070, 1.7046);
-    
-    float yPrime = dot(color.rgb, kRGBToYPrime);
-    float I = dot(color.rgb, kRGBToI);
-    float Q = dot(color.rgb, kRGBToQ);
-    
-    float hue = atan(Q, I);
-    float chroma  = sqrt (I * I + Q * Q);
-    
-    hue -= hueAdjust;
-    
-    Q = chroma * sin (hue);
-    I = chroma * cos (hue);
-    
-    color.r = dot(vec3(yPrime, I, Q), kYIQToR);
-    color.g = dot(vec3(yPrime, I, Q), kYIQToG);
-    color.b = dot(vec3(yPrime, I, Q), kYIQToB);
-    
-    return color;
-}
- 
-// colorMatrix
-vec4 NCColorMatrix(vec4 color, float red, float green, float blue, float alpha, vec4 bias)
-{
-    color = color * vec4(red, green, blue, alpha) + bias;
-    
-    return color;
-}
- 
-// multiply blend
-vec4 NCMultiplyBlend(vec4 overlay, vec4 base)
-{
-    vec4 outputColor;
-    
-    float a = overlay.a + base.a * (1.0 - overlay.a);
-    
-    //    // normal blend
-    //    outputColor.r = (base.r * base.a + overlay.r * overlay.a * (1.0 - base.a))/a;
-    //    outputColor.g = (base.g * base.a + overlay.g * overlay.a * (1.0 - base.a))/a;
-    //    outputColor.b = (base.b * base.a + overlay.b * overlay.a * (1.0 - base.a))/a;
-    
-    
-    // multiply blend
-    outputColor.rgb = ((1.0-base.a) * overlay.rgb * overlay.a + (1.0-overlay.a) * base.rgb * base.a + overlay.a * base.a * overlay.rgb * base.rgb) / a;
-    
-    
-    outputColor.a = a;
-    
-    return outputColor;
-}
- 
-// xy should be a integer position (e.g. pixel position on the screen)
-// similar to a texture lookup but is only ALU
-float PseudoRandom(vec2 co)
-{
-    //    return fract(sin(dot(co.xy ,vec2(12.9898,78.233))) * 43758.5453);
-    mediump float a = 12.9898;
-    mediump float b = 78.233;
-    mediump float c = 43758.5453;
-    mediump float dt= dot(co.xy ,vec2(a,b));
-    mediump float sn= mod(dt,3.14);
-    return fract(sin(sn) * c);
-}
- 
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec4 color = texture2D(inputImageTexture, textureCoordinate);
-    
-    color.a = 1.0;
-    
-    // color control
-//    color = NCColorControl(color, 0.6, -0.06, 0.75);
-    color = NCColorControl(color, 0.5, 0.1, 0.9);
-    
-    // rand
-	  float x = textureCoordinate.x*inputImageTextureWidth/texture2Size;
-      float y = textureCoordinate.y*inputImageTextureHeight/texture2Size;
-
-      vec4 rd = texture2D(inputImageTexture2, vec2( fract(x), fract(y)));
-//    vec4 rd = texture2D(inputImageTexture2, textureCoordinate);
-//    float rand_number1 = PseudoRandom(textureCoordinate.xy);
-//    float rand_number2 = PseudoRandom(textureCoordinate.yx);
-//    float rand_number3 = PseudoRandom(vec2(rand_number1, rand_number2));
-//    float rand_number4 = PseudoRandom(vec2(rand_number2, rand_number1));
-//    float rand_number5 = PseudoRandom(vec2(rand_number3, rand_number4));
-    
-//    vec4 rd = vec4(rand_number1, rand_number3, rand_number5, 1.0);
-    
-//    if(rand_number4>0.2)
-//        rd = vec4(1.0);
-    
-    // rand color control
-//    rd = NCColorControl(rd, 0.65, 0.1, 0.7);
-    rd = NCColorControl(rd, 1.0, 0.4, 1.2);
-    
-    // normal blend
-//    rd.a *= 1.0;
-    color = NCMultiplyBlend(rd, color);
-    
-    // color matrix
-//    color = NCColorMatrix(color, 1.0, 1.0, 1.0, 1.0, vec4(-0.1, -0.1, -0.1, 0));
-    color = NCColorMatrix(color, 1.0, 1.0, 1.0, 1.0, vec4(-0.15, -0.15, -0.15, 0));
-    
-    color.rgb = mix(originColor.rgb, color.rgb, strength);
-    gl_FragColor = color;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/healthy.glsl b/android/src/main/res/raw/healthy.glsl
deleted file mode 100755
index 47b89fb..0000000
--- a/android/src/main/res/raw/healthy.glsl
+++ /dev/null
@@ -1,141 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-uniform sampler2D mask;
-
-uniform float texelWidthOffset ;
-
-uniform float texelHeightOffset;
-
-varying mediump vec2 textureCoordinate;
-
-vec4 level0c(vec4 color, sampler2D sampler) 
-{ 
-	color.r = texture2D(sampler, vec2(color.r, 0.)).r; 
-	color.g = texture2D(sampler, vec2(color.g, 0.)).r;
-	color.b = texture2D(sampler, vec2(color.b, 0.)).r;
-	return color;
-} 
-
-vec4 level1c(vec4 color, sampler2D sampler) 
-{ 
-	color.r = texture2D(sampler, vec2(color.r, 0.)).g;
-	color.g = texture2D(sampler, vec2(color.g, 0.)).g;
-	color.b = texture2D(sampler, vec2(color.b, 0.)).g;
-	return color;
-} 
-
-vec4 level2c(vec4 color, sampler2D sampler) 
-{ 
-	color.r = texture2D(sampler, vec2(color.r,0.)).b; 
-	color.g = texture2D(sampler, vec2(color.g,0.)).b;
-	color.b = texture2D(sampler, vec2(color.b,0.)).b; 
-	return color; 
-} 
-
-vec3 rgb2hsv(vec3 c) 
-{
-	vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0); 
-	vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g)); 
-	vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r)); 
-	
-	float d = q.x - min(q.w, q.y); 
-	float e = 1.0e-10; 
-	return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x); 
-} 
-
-vec3 hsv2rgb(vec3 c) 
-{ 
-	vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0); 
-	vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www); 
-	return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y); 
-}
-
-vec4 normal(vec4 c1, vec4 c2, float alpha) 
-{ 
-	return (c2-c1) * alpha + c1; 
-} 
-
-vec4 multiply(vec4 c1, vec4 c2) 
-{
-	return c1 * c2 * 1.01;
-}
-
-vec4 overlay(vec4 c1, vec4 c2)
-{
-	vec4 color = vec4(0.,0.,0.,1.);
-	
-	color.r = c1.r < 0.5 ? 2.0*c1.r*c2.r : 1.0 - 2.0*(1.0-c1.r)*(1.0-c2.r);
-	color.g = c1.g < 0.5 ? 2.0*c1.g*c2.g : 1.0 - 2.0*(1.0-c1.g)*(1.0-c2.g);
-	color.b = c1.b < 0.5 ? 2.0*c1.b*c2.b : 1.0 - 2.0*(1.0-c1.b)*(1.0-c2.b); 
-
-	return color;
-}
-
-vec4 screen(vec4 c1, vec4 c2) 
-{ 
-	return vec4(1.) - ((vec4(1.) - c1) * (vec4(1.) - c2)); 
-} 
-
-void main() 
-{ 
-	// iOS ImageLiveFilter adjustment
-	// begin 
-	
-	vec4 textureColor; 
-	
-	vec4 t0 = texture2D(mask, vec2(textureCoordinate.x, textureCoordinate.y)); 
-	
-	// naver skin 
-	vec4 c2 = texture2D(inputImageTexture, textureCoordinate); 
-	vec4 c5 = c2; 
-	
-	// healthy 
-	vec3 hsv = rgb2hsv(c5.rgb); 
-	lowp float h = hsv.x; 
-	lowp float s = hsv.y; 
-	lowp float v = hsv.z; 
-	
-	lowp float cF = 0.;   
-	// color strength 
-	lowp float cG = 0.;   
-	// color gap; 
-	lowp float sF = 0.06; 
-	// saturation strength; 
-	
-	if(h >= 0.125 && h <= 0.208) 
-	{ 
-		// 45 to 75 
-		s = s - (s * sF); 
-	} 
-	else if (h >= 0.208 && h < 0.292) 
-	{ 
-		// 75 to 105 
-		cG = abs(h - 0.208); 
-		cF = (cG / 0.0833); 
-		s = s - (s * sF * cF); 
-	} 
-	else if (h > 0.042 && h <=  0.125) 
-	{ 
-		// 15 to 45 
-		cG = abs(h - 0.125); 
-		cF = (cG / 0.0833); 
-		s = s - (s * sF * cF); 
-	} 
-	hsv.y = s; 
-	
-	vec4 c6 = vec4(hsv2rgb(hsv),1.); 
-	
-	c6 = normal(c6, screen  (c6, c6), 0.275); // screen 70./255. 
-	c6 = normal(c6, overlay (c6, vec4(1., 0.61176, 0.25098, 1.)), 0.04); // overlay 10./255. 
-	
-	c6 = normal(c6, multiply(c6, t0), 0.262); // multiply 67./255. 
-	
-	c6 = level1c(level0c(c6,curve),curve); 
-	
-	gl_FragColor = c6; 
-	// end
-} 
\ No newline at end of file
diff --git a/android/src/main/res/raw/hefe.glsl b/android/src/main/res/raw/hefe.glsl
deleted file mode 100755
index 77983ef..0000000
--- a/android/src/main/res/raw/hefe.glsl
+++ /dev/null
@@ -1,46 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;  //edgeBurn
-uniform sampler2D inputImageTexture3;  //hefeMap
-uniform sampler2D inputImageTexture4;  //hefeGradientMap
-uniform sampler2D inputImageTexture5;  //hefeSoftLight
-uniform sampler2D inputImageTexture6;  //hefeMetal
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-    vec3 edge = texture2D(inputImageTexture2, textureCoordinate).rgb;
-    texel = texel * edge;
-    
-    texel = vec3(
-                 texture2D(inputImageTexture3, vec2(texel.r, .16666)).r,
-                 texture2D(inputImageTexture3, vec2(texel.g, .5)).g,
-                 texture2D(inputImageTexture3, vec2(texel.b, .83333)).b);
-    
-    vec3 luma = vec3(.30, .59, .11);
-    vec3 gradSample = texture2D(inputImageTexture4, vec2(dot(luma, texel), .5)).rgb;
-    vec3 final = vec3(
-                      texture2D(inputImageTexture5, vec2(gradSample.r, texel.r)).r,
-                      texture2D(inputImageTexture5, vec2(gradSample.g, texel.g)).g,
-                      texture2D(inputImageTexture5, vec2(gradSample.b, texel.b)).b
-                      );
-    
-    vec3 metal = texture2D(inputImageTexture6, textureCoordinate).rgb;
-    vec3 metaled = vec3(
-                        texture2D(inputImageTexture5, vec2(metal.r, texel.r)).r,
-                        texture2D(inputImageTexture5, vec2(metal.g, texel.g)).g,
-                        texture2D(inputImageTexture5, vec2(metal.b, texel.b)).b
-                        );
-    
-    metaled.rgb = mix(originColor.rgb, metaled.rgb, strength);
-
-    gl_FragColor = vec4(metaled, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/hudson.glsl b/android/src/main/res/raw/hudson.glsl
deleted file mode 100755
index 76f5d79..0000000
--- a/android/src/main/res/raw/hudson.glsl
+++ /dev/null
@@ -1,35 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //blowout;
-uniform sampler2D inputImageTexture3; //overlay;
-uniform sampler2D inputImageTexture4; //map
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-
-    vec4 texel = texture2D(inputImageTexture, textureCoordinate);
-
-    vec3 bbTexel = texture2D(inputImageTexture2, textureCoordinate).rgb;
-
-    texel.r = texture2D(inputImageTexture3, vec2(bbTexel.r, texel.r)).r;
-    texel.g = texture2D(inputImageTexture3, vec2(bbTexel.g, texel.g)).g;
-    texel.b = texture2D(inputImageTexture3, vec2(bbTexel.b, texel.b)).b;
-
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture4, vec2(texel.r, .16666)).r;
-    mapped.g = texture2D(inputImageTexture4, vec2(texel.g, .5)).g;
-    mapped.b = texture2D(inputImageTexture4, vec2(texel.b, .83333)).b;
-    mapped.a = 1.0;
-
-    mapped.rgb = mix(originColor.rgb, mapped.rgb, strength);
-
-    gl_FragColor = mapped;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/hue.glsl b/android/src/main/res/raw/hue.glsl
deleted file mode 100755
index b053e23..0000000
--- a/android/src/main/res/raw/hue.glsl
+++ /dev/null
@@ -1,43 +0,0 @@
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform sampler2D inputImageTexture;
-uniform mediump float hueAdjust;
-const highp vec4 kRGBToYPrime = vec4 (0.299, 0.587, 0.114, 0.0);
-const highp vec4 kRGBToI = vec4 (0.595716, -0.274453, -0.321263, 0.0);
-const highp vec4 kRGBToQ = vec4 (0.211456, -0.522591, 0.31135, 0.0);
-
-const highp vec4 kYIQToR = vec4 (1.0, 0.9563, 0.6210, 0.0);
-const highp vec4 kYIQToG = vec4 (1.0, -0.2721, -0.6474, 0.0);
-const highp vec4 kYIQToB = vec4 (1.0, -1.1070, 1.7046, 0.0);
-
-void main () {
-    // Sample the input pixel
-    highp vec4 color = texture2D(inputImageTexture, textureCoordinate);
-
-    // Convert to YIQ
-    highp float YPrime = dot (color, kRGBToYPrime);
-    highp float I = dot (color, kRGBToI);
-    highp float Q = dot (color, kRGBToQ);
-
-    // Calculate the hue and chroma
-    highp float hue = atan (Q, I);
-    highp float chroma = sqrt (I * I + Q * Q);
-
-    // Make the user's adjustments
-    hue += (-hueAdjust); //why negative rotation?
-
-    // Convert back to YIQ
-    Q = chroma * sin (hue);
-    I = chroma * cos (hue);
-
-    // Convert back to RGB
-    highp vec4 yIQ = vec4 (YPrime, I, Q, 0.0);
-    color.r = dot (yIQ, kYIQToR);
-    color.g = dot (yIQ, kYIQToG);
-    color.b = dot (yIQ, kYIQToB);
-
-    // Save the result
-    gl_FragColor = color;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/inkwell.glsl b/android/src/main/res/raw/inkwell.glsl
deleted file mode 100755
index d1aef96..0000000
--- a/android/src/main/res/raw/inkwell.glsl
+++ /dev/null
@@ -1,16 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-
-void main()
-{
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-    texel = vec3(dot(vec3(0.3, 0.6, 0.1), texel));
-    texel = vec3(texture2D(inputImageTexture2, vec2(texel.r, .16666)).r);
-    gl_FragColor = vec4(texel, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/kevin_new.glsl b/android/src/main/res/raw/kevin_new.glsl
deleted file mode 100755
index bfcd7f0..0000000
--- a/android/src/main/res/raw/kevin_new.glsl
+++ /dev/null
@@ -1,26 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-
-void main()
-{
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-    vec2 lookup;
-    lookup.y = .5;
-
-    lookup.x = texel.r;
-    texel.r = texture2D(inputImageTexture2, lookup).r;
-
-    lookup.x = texel.g;
-    texel.g = texture2D(inputImageTexture2, lookup).g;
-
-    lookup.x = texel.b;
-    texel.b = texture2D(inputImageTexture2, lookup).b;
-
-    gl_FragColor = vec4(texel, 1.0);
-}
diff --git a/android/src/main/res/raw/latte.glsl b/android/src/main/res/raw/latte.glsl
deleted file mode 100755
index e1f335e..0000000
--- a/android/src/main/res/raw/latte.glsl
+++ /dev/null
@@ -1,163 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-
-vec3 rgb2hsv(vec3 c) 
-{ 
-	vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0);
-	vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g));
-	vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r));
-	
-	float d = q.x - min(q.w, q.y);
-	float e = 1.0e-10;
-	return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x);
-} 
-
-vec3 hsv2rgb(vec3 c) 
-{ 
-	vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0);
-	vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www);
-	return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y); 
-}
-
-void main()
-{
-	float GreyVal;
-	lowp vec4 textureColor;
-	lowp vec4 textureColorOri;
-	float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y; 
-
-	highp float redCurveValue; 
-	highp float greenCurveValue; 
-	highp float blueCurveValue;
-	
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-	mediump vec4 base = textureColor; 
-	mediump vec4 overlay = vec4(0.792, 0.58, 0.372, 1.0); 
-	
-	// step1 overlay blending 
-	mediump float ra; 
-	if (base.r < 0.5) 
-	{ 
-		ra = overlay.r * base.r * 2.0; 
-	} 
-	else 
-	{ 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-    } 
-	
-	mediump float ga; 
-	if (base.g < 0.5) 
-	{ 
-		ga = overlay.g * base.g * 2.0;
-    } 
-	else 
-	{ 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-    } 
-	
-	mediump float ba; 
-	if (base.b < 0.5) 
-	{ 
-		ba = overlay.b * base.b * 2.0; 
-    } 
-	else 
-	{ 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-    } 
-	
-	textureColor = vec4(ra, ga, ba, 1.0); 
-	textureColor = (textureColor - base) * 0.3 + base; 
-	
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-	
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).g; 
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).g; 
-	
-	
-	vec3 tColor = vec3(redCurveValue, greenCurveValue, blueCurveValue); 
-	tColor = rgb2hsv(tColor); 
-	
-	tColor.g = tColor.g * 0.6; 
-	
-	float dStrength = 1.0; 
-	float dSatStrength = 0.2; 
-	
-	float dGap = 0.0; 
-	
-	if( tColor.r >= 0.0 && tColor.r < 0.417) 
-	{ 
-		tColor.g = tColor.g + (tColor.g * dSatStrength); 
-    } 
-	else if( tColor.r > 0.958 && tColor.r <= 1.0) 
-	{ 
-		tColor.g = tColor.g + (tColor.g * dSatStrength); 
-    } 
-	else if( tColor.r >= 0.875 && tColor.r <= 0.958) 
-	{ 
-		dGap = abs(tColor.r - 0.875); 
-		dStrength = (dGap / 0.0833); 
-		
-		tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-    } 
-	else if( tColor.r >= 0.0417 && tColor.r <= 0.125) 
-	{ 
-		dGap = abs(tColor.r - 0.125);
-		dStrength = (dGap / 0.0833); 
-		
-		tColor.g = tColor.g + (tColor.g * dSatStrength * dStrength); 
-	} 
-	
-	
-	tColor = hsv2rgb(tColor); 
-	tColor = clamp(tColor, 0.0, 1.0); 
-	
-	redCurveValue = texture2D(curve, vec2(tColor.r, 1.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(tColor.g, 1.0)).r; 
-	blueCurveValue = texture2D(curve, vec2(tColor.b, 1.0)).r; 
-
-	base = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-	overlay = vec4(0.792, 0.494, 0.372, 1.0); 
-
-	// step5 overlay blending 
-	if (base.r < 0.5) 
-	{
-		ra = overlay.r * base.r * 2.0; 
-    } 
-	else 
-	{ 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-    } 
-	
-	if (base.g < 0.5) 
-	{ 
-		ga = overlay.g * base.g * 2.0; 
-	} 
-	else 
-	{ 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-	
-	if (base.b < 0.5) 
-	{ 
-		ba = overlay.b * base.b * 2.0; 
-	}
-	else 
-	{ 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0);
-    } 
-	
-	textureColor = vec4(ra, ga, ba, 1.0); 
-	textureColor = (textureColor - base) * 0.15 + base; 
-
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/lomo.glsl b/android/src/main/res/raw/lomo.glsl
deleted file mode 100755
index 1069724..0000000
--- a/android/src/main/res/raw/lomo.glsl
+++ /dev/null
@@ -1,29 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-uniform sampler2D inputImageTexture3;
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = vec4(0.2,0.6,0.9,1.0);
-    vec3 texel;
-    vec2 tc = (2.0 * textureCoordinate) - 1.0;
-    float d = dot(tc, tc);
-    vec2 lookup = vec2(d, originColor.r);
-    texel.r = texture2D(inputImageTexture3, lookup).r;
-    lookup.y = originColor.g;
-    texel.g = texture2D(inputImageTexture3, lookup).g;
-    lookup.y = originColor.b;
-    texel.b	= texture2D(inputImageTexture3, lookup).b;
-
-    texel.rgb = mix(originColor.rgb, texel.rgb, strength);
-
-    gl_FragColor = vec4(texel,1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/lookup.glsl b/android/src/main/res/raw/lookup.glsl
deleted file mode 100755
index e70e446..0000000
--- a/android/src/main/res/raw/lookup.glsl
+++ /dev/null
@@ -1,33 +0,0 @@
-varying highp vec2 textureCoordinate;
-
-uniform sampler2D inputImageTexture;
-uniform sampler2D inputImageTexture2; // lookup texture\n" +
-
-void main() {
-
-    lowp vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);\n" +
-
-    mediump float blueColor = textureColor.b * 63.0;
-
-    mediump vec2 quad1;
-    quad1.y = floor(floor(blueColor) / 8.0);
-    quad1.x = floor(blueColor) - (quad1.y * 8.0);
-
-    mediump vec2 quad2;\n" +
-    quad2.y = floor(ceil(blueColor) / 8.0);\n" +
-    quad2.x = ceil(blueColor) - (quad2.y * 8.0);\n" +
-
-    highp vec2 texPos1;
-    texPos1.x = (quad1.x * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.r);
-    texPos1.y = (quad1.y * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.g);
-
-    highp vec2 texPos2;\n" +
-    texPos2.x = (quad2.x * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.r);
-    texPos2.y = (quad2.y * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.g);
-
-    lowp vec4 newColor1 = texture2D(inputImageTexture2, texPos1);
-    lowp vec4 newColor2 = texture2D(inputImageTexture2, texPos2);
-
-    lowp vec4 newColor = mix(newColor1, newColor2, fract(blueColor));
-    gl_FragColor = vec4(newColor.rgb, textureColor.w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/n1977.glsl b/android/src/main/res/raw/n1977.glsl
deleted file mode 100755
index 2b38a15..0000000
--- a/android/src/main/res/raw/n1977.glsl
+++ /dev/null
@@ -1,19 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-
-void main()
-{
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-    texel = vec3(
-                 texture2D(inputImageTexture2, vec2(texel.r, .16666)).r,
-                 texture2D(inputImageTexture2, vec2(texel.g, .5)).g,
-                 texture2D(inputImageTexture2, vec2(texel.b, .83333)).b);
-
-    gl_FragColor = vec4(texel, 1.0);
-}
diff --git a/android/src/main/res/raw/nashville.glsl b/android/src/main/res/raw/nashville.glsl
deleted file mode 100755
index c4dc45e..0000000
--- a/android/src/main/res/raw/nashville.glsl
+++ /dev/null
@@ -1,18 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-
-void main()
-{
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-    texel = vec3(
-                 texture2D(inputImageTexture2, vec2(texel.r, .16666)).r,
-                 texture2D(inputImageTexture2, vec2(texel.g, .5)).g,
-                 texture2D(inputImageTexture2, vec2(texel.b, .83333)).b);
-    gl_FragColor = vec4(texel, 1.0);
-}
diff --git a/android/src/main/res/raw/nostalgia.glsl b/android/src/main/res/raw/nostalgia.glsl
deleted file mode 100755
index 1c5a327..0000000
--- a/android/src/main/res/raw/nostalgia.glsl
+++ /dev/null
@@ -1,108 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-uniform sampler2D curve2; 
-uniform highp float texelWidthOffset;
-uniform highp float texelHeightOffset;
-uniform highp float blurSize;
-
-vec4 OverlayBlendingVec4(vec4 down, vec4 up, float fAlpha)
-{ 
-	if ( down.r < 0.5 ) 
-	{ 
-		up.r = up.r * down.r * 2.0; 
-	}
-	else 
-	{ 
-		up.r = 1.0 - ( ( 1.0 - down.r) * ( 1.0 - up.r ) * 2.0 );
-	} 
-	if ( down.g < 0.5 ) 
-	{ 
-		up.g = up.g * down.g * 2.0; 
-	} 
-	else 
-	{
-		up.g = 1.0 - ( ( 1.0 - down.g) * ( 1.0 - up.g ) * 2.0 ); 
-	} 
-	
-    if ( down.b < 0.5 ) 
-	{ 
-		up.b = up.b * down.b * 2.0;
-	} 
-	else 
-	{ 
-		up.b = 1.0 - ( ( 1.0 - down.b) * ( 1.0 - up.b ) * 2.0 ); 
-	} 
-	
-    down = ( up - down ) * fAlpha + down;
-	
-	return down; 
-} 
-
-void main()
-{ 
-	float xCoordinate = textureCoordinate.x; 
-	float yCoordinate = textureCoordinate.y; 
-
-	vec4 textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-    highp vec2 firstOffset = vec2(1.3846153846 * texelWidthOffset, 1.3846153846 * texelHeightOffset) * blurSize; 
-	highp vec2 secondOffset = vec2(3.2307692308 * texelWidthOffset, 3.2307692308 * texelHeightOffset) * blurSize;
-	
-	highp vec2 centerTextureCoordinate = vec2(xCoordinate, yCoordinate); 
-	highp vec2 oneStepLeftTextureCoordinate = vec2(xCoordinate, yCoordinate) - firstOffset; 
-	highp vec2 twoStepsLeftTextureCoordinate = vec2(xCoordinate, yCoordinate) - secondOffset; 
-	highp vec2 oneStepRightTextureCoordinate = vec2(xCoordinate, yCoordinate) + firstOffset; 
-    highp vec2 twoStepsRightTextureCoordinate = vec2(xCoordinate, yCoordinate) + secondOffset; 
-
-	lowp vec4 fragmentColor = texture2D(inputImageTexture, vec2(centerTextureCoordinate.x, centerTextureCoordinate.y)) * 0.2270270270; 
-	fragmentColor += texture2D(inputImageTexture, vec2(oneStepLeftTextureCoordinate.x, oneStepLeftTextureCoordinate.y)) * 0.3162162162; 
-	fragmentColor += texture2D(inputImageTexture, vec2(oneStepRightTextureCoordinate.x, oneStepRightTextureCoordinate.y)) * 0.3162162162; 
-	fragmentColor += texture2D(inputImageTexture, vec2(twoStepsLeftTextureCoordinate.x, twoStepsLeftTextureCoordinate.y)) * 0.0702702703; 
-    fragmentColor += texture2D(inputImageTexture, vec2(twoStepsRightTextureCoordinate.x, twoStepsRightTextureCoordinate.y)) * 0.0702702703; 
-	
-	lowp vec4 blurColor = fragmentColor; 
-	
-    // step1 ScreenBlending 
-    blurColor = 1.0 - ((1.0 - textureColor) * (1.0 - blurColor)); 
-	blurColor =     clamp(blurColor, 0.0, 1.0);
-	textureColor = (blurColor - textureColor) * 0.7 + textureColor; 
-	textureColor =  clamp(textureColor, 0.0, 1.0); 
-
-	// step2 OverlayBlending 
-	textureColor = OverlayBlendingVec4(textureColor, vec4(0.0, 0.0, 0.0, 1.0), 0.3); 
-    textureColor = clamp(textureColor, vec4(0.0, 0.0, 0.0, 1.0), vec4(1.0, 1.0, 1.0, 1.0)); 
-	
-	// step3 curve 
-    highp float redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	highp float greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-	highp float blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b;
-
-    // step4 curve 
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).r; 
-    greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).r; 
-    blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).r; 
-
-	// step5 level 
-    redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).g; 
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).g; 
-    blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).g; 
-
-	// step6 curve 
-	redCurveValue = texture2D(curve2, vec2(redCurveValue, 1.0)).r; 
-	greenCurveValue = texture2D(curve2, vec2(greenCurveValue, 1.0)).g;
-    blueCurveValue = texture2D(curve2, vec2(blueCurveValue, 1.0)).b; 
-
-	// step7 curve 
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).b; 
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).b; 
-
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).b; 
-	
-	lowp vec4 BCSColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-	gl_FragColor =  vec4(BCSColor.r,BCSColor.g,BCSColor.b,1.0); 
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/pixar.glsl b/android/src/main/res/raw/pixar.glsl
deleted file mode 100755
index 9a91be4..0000000
--- a/android/src/main/res/raw/pixar.glsl
+++ /dev/null
@@ -1,137 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2;
-
-uniform float strength;
-
-// gray
-float NCGray(vec4 color)
-{
-    float gray = 0.2125 * color.r + 0.7154 * color.g + 0.0721 * color.b;
-    return gray;
-}
-
-// tone mapping
-vec4 NCTonemapping(vec4 color)
-{
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture2, vec2(color.r, 0.0)).r;
-    mapped.g = texture2D(inputImageTexture2, vec2(color.g, 0.0)).g;
-    mapped.b = texture2D(inputImageTexture2, vec2(color.b, 0.0)).b;
-    mapped.a = color.a;
-    return mapped;
-}
-
-// color control
-vec4 NCColorControl(vec4 color, float saturation, float brightness, float contrast)
-{
-    float gray = NCGray(color);
-    
-    color.rgb = vec3(saturation) * color.rgb + vec3(1.0-saturation) * vec3(gray);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    color.rgb = vec3(contrast) * (color.rgb - vec3(0.5)) + vec3(0.5);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    color.rgb = color.rgb + vec3(brightness);
-    color.r = clamp(color.r, 0.0, 1.0);
-    color.g = clamp(color.g, 0.0, 1.0);
-    color.b = clamp(color.b, 0.0, 1.0);
-    
-    return color;
-}
-
-// hue adjust
-vec4 NCHueAdjust(vec4 color, float hueAdjust)
-{
-    vec3 kRGBToYPrime = vec3(0.299, 0.587, 0.114);
-    vec3 kRGBToI = vec3(0.595716, -0.274453, -0.321263);
-    vec3 kRGBToQ = vec3(0.211456, -0.522591, 0.31135);
-    
-    vec3 kYIQToR   = vec3(1.0, 0.9563, 0.6210);
-    vec3 kYIQToG   = vec3(1.0, -0.2721, -0.6474);
-    vec3 kYIQToB   = vec3(1.0, -1.1070, 1.7046);
-    
-    float yPrime = dot(color.rgb, kRGBToYPrime);
-    float I = dot(color.rgb, kRGBToI);
-    float Q = dot(color.rgb, kRGBToQ);
-    
-    float hue = atan(Q, I);
-    float chroma  = sqrt (I * I + Q * Q);
-    
-    hue -= hueAdjust;
-    
-    Q = chroma * sin (hue);
-    I = chroma * cos (hue);
-    
-    color.r = dot(vec3(yPrime, I, Q), kYIQToR);
-    color.g = dot(vec3(yPrime, I, Q), kYIQToG);
-    color.b = dot(vec3(yPrime, I, Q), kYIQToB);
-    
-    return color;
-}
- 
-// colorMatrix
-vec4 NCColorMatrix(vec4 color, float red, float green, float blue, float alpha, vec4 bias)
-{
-    color = color * vec4(red, green, blue, alpha) + bias;
-    
-    return color;
-}
- 
-// multiply blend
-vec4 NCMultiplyBlend(vec4 overlay, vec4 base)
-{
-    vec4 outputColor;
-    
-    float a = overlay.a + base.a * (1.0 - overlay.a);
-    
-    //    // normal blend
-    //    outputColor.r = (base.r * base.a + overlay.r * overlay.a * (1.0 - base.a))/a;
-    //    outputColor.g = (base.g * base.a + overlay.g * overlay.a * (1.0 - base.a))/a;
-    //    outputColor.b = (base.b * base.a + overlay.b * overlay.a * (1.0 - base.a))/a;
-    
-    
-    // multiply blend
-    outputColor.rgb = ((1.0-base.a) * overlay.rgb * overlay.a + (1.0-overlay.a) * base.rgb * base.a + overlay.a * base.a * overlay.rgb * base.rgb) / a;
-    
-    
-    outputColor.a = a;
-    
-    return outputColor;
-}
- 
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec4 color = texture2D(inputImageTexture, textureCoordinate);
-    
-    color.a = 1.0;
-    
-    // tone mapping
-    color.r = texture2D(inputImageTexture2, vec2(color.r, 0.0)).r;
-    color.g = texture2D(inputImageTexture2, vec2(color.g, 0.0)).g;
-    color.b = texture2D(inputImageTexture2, vec2(color.b, 0.0)).b;
-    
-    // color control
-    color = NCColorControl(color, 1.0, 0.08, 1.0);
-    
-    // hue adjust
-    color = NCHueAdjust(color, 0.0556);
-    
-    // color matrix
-    color = NCColorMatrix(color, 1.0, 1.0, 1.0, 1.0, vec4(0.02, 0.02, 0.06, 0));
-    
-    color.rgb = mix(originColor.rgb, color.rgb, strength);
-
-    gl_FragColor = color;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/rise.glsl b/android/src/main/res/raw/rise.glsl
deleted file mode 100755
index 946279c..0000000
--- a/android/src/main/res/raw/rise.glsl
+++ /dev/null
@@ -1,33 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //blowout;
-uniform sampler2D inputImageTexture3; //overlay;
-uniform sampler2D inputImageTexture4; //map
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec4 texel = texture2D(inputImageTexture, textureCoordinate);
-    vec3 bbTexel = texture2D(inputImageTexture2, textureCoordinate).rgb;
-
-    texel.r = texture2D(inputImageTexture3, vec2(bbTexel.r, texel.r)).r;
-    texel.g = texture2D(inputImageTexture3, vec2(bbTexel.g, texel.g)).g;
-    texel.b = texture2D(inputImageTexture3, vec2(bbTexel.b, texel.b)).b;
-
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture4, vec2(texel.r, .16666)).r;
-    mapped.g = texture2D(inputImageTexture4, vec2(texel.g, .5)).g;
-    mapped.b = texture2D(inputImageTexture4, vec2(texel.b, .83333)).b;
-    mapped.a = 1.0;
-
-    mapped.rgb = mix(originColor.rgb, mapped.rgb, strength);
-
-    gl_FragColor = mapped;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/romance.glsl b/android/src/main/res/raw/romance.glsl
deleted file mode 100755
index 6ebe86e..0000000
--- a/android/src/main/res/raw/romance.glsl
+++ /dev/null
@@ -1,50 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-
-void main()
-{ 
-	lowp vec4 textureColor;
-	lowp vec4 textureColorRes; 
-	lowp vec4 textureColorOri; 
-	vec4 grey1Color; 
-	vec4 layerColor; 
-	mediump float satVal = 115.0 / 100.0; 
-
-    float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y; 
-	
-	highp float redCurveValue; 
-	highp float greenCurveValue; 
-	highp float blueCurveValue; 
-
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-    textureColorRes = textureColor; 
-	textureColorOri = textureColor; 
-
-	// step1. screen blending 
-	textureColor = 1.0 - ((1.0 - textureColorOri) * (1.0 - textureColorOri)); 
-	textureColor = (textureColor - textureColorOri) + textureColorOri; 
-
-	// step2. curve 
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-    blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-
-	// step3. saturation 
-	highp float G = (redCurveValue + greenCurveValue + blueCurveValue); 
-	G = G / 3.0; 
-
-    redCurveValue = ((1.0 - satVal) * G + satVal * redCurveValue); 
-	greenCurveValue = ((1.0 - satVal) * G + satVal * greenCurveValue); 
-	blueCurveValue = ((1.0 - satVal) * G + satVal * blueCurveValue); 
-
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-
-    gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
diff --git a/android/src/main/res/raw/sakura.glsl b/android/src/main/res/raw/sakura.glsl
deleted file mode 100755
index d34ec22..0000000
--- a/android/src/main/res/raw/sakura.glsl
+++ /dev/null
@@ -1,71 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-uniform float texelWidthOffset; 
-uniform float texelHeightOffset; 
-
-varying mediump vec2 textureCoordinate; 
-
-vec4 gaussianBlur(sampler2D sampler) { 
-	lowp float strength = 1.; 
-	vec4 color = vec4(0.); 
-	vec2 step  = vec2(0.); 
-
-	color += texture2D(sampler,textureCoordinate)* 0.25449 ; 
-	
-	step.x = 1.37754 * texelWidthOffset  * strength; 
-	step.y = 1.37754 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.24797; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.24797; 
-
-	step.x = 3.37754 * texelWidthOffset  * strength; 
-	step.y = 3.37754 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.09122; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.09122; 
-
-	step.x = 5.37754 * texelWidthOffset  * strength; 
-	step.y = 5.37754 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.03356; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.03356; 
-
-	return color; 
-} 
-
-vec4 overlay(vec4 c1, vec4 c2){ 
-	vec4 r1 = vec4(0.,0.,0.,1.); 
-
-	r1.r = c1.r < 0.5 ? 2.0*c1.r*c2.r : 1.0 - 2.0*(1.0-c1.r)*(1.0-c2.r); 
-	r1.g = c1.g < 0.5 ? 2.0*c1.g*c2.g : 1.0 - 2.0*(1.0-c1.g)*(1.0-c2.g); 
-	r1.b = c1.b < 0.5 ? 2.0*c1.b*c2.b : 1.0 - 2.0*(1.0-c1.b)*(1.0-c2.b); 
-
-	return r1; 
-} 
-
-vec4 level0c(vec4 color, sampler2D sampler) { 
-    color.r = texture2D(sampler, vec2(color.r, 0.)).r; 
-    color.g = texture2D(sampler, vec2(color.g, 0.)).r; 
-	color.b = texture2D(sampler, vec2(color.b, 0.)).r; 
-	return color; 
-} 
-
-vec4 normal(vec4 c1, vec4 c2, float alpha) { 
-    return (c2-c1) * alpha + c1; 
-} 
-
-vec4 screen(vec4 c1, vec4 c2) { 
-	vec4 r1 = vec4(1.) - ((vec4(1.) - c1) * (vec4(1.) - c2)); 
-	return r1; 
-} 
-
-void main() { 
-	// naver skin 
-	lowp vec4 c0 = texture2D(inputImageTexture, textureCoordinate);
-	lowp vec4 c1 = gaussianBlur(inputImageTexture); 
-	lowp vec4 c2 = overlay(c0, level0c(c1, curve)); 
-	lowp vec4 c3 = normal(c0,c2,0.15); 
-
-	gl_FragColor = c3; 
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/saturation.glsl b/android/src/main/res/raw/saturation.glsl
deleted file mode 100755
index 936c032..0000000
--- a/android/src/main/res/raw/saturation.glsl
+++ /dev/null
@@ -1,15 +0,0 @@
-varying highp vec2 textureCoordinate;
-
-uniform sampler2D inputImageTexture;
-uniform lowp float saturation;
-
-// Values from \"Graphics Shaders: Theory and Practice\" by Bailey and Cunningham
-const mediump vec3 luminanceWeighting = vec3(0.2125, 0.7154, 0.0721);
-
-void main() {
-    lowp vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);
-    lowp float luminance = dot(textureColor.rgb, luminanceWeighting);
-    lowp vec3 greyScaleColor = vec3(luminance);
-
-    gl_FragColor = vec4(mix(greyScaleColor, textureColor.rgb, saturation), textureColor.w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/sharpen.glsl b/android/src/main/res/raw/sharpen.glsl
deleted file mode 100755
index 642d7b8..0000000
--- a/android/src/main/res/raw/sharpen.glsl
+++ /dev/null
@@ -1,22 +0,0 @@
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-varying highp vec2 leftTextureCoordinate;
-varying highp vec2 rightTextureCoordinate;
-varying highp vec2 topTextureCoordinate;
-varying highp vec2 bottomTextureCoordinate;
- 
-varying highp float centerMultiplier;
-varying highp float edgeMultiplier;
-
-uniform sampler2D inputImageTexture;
-
-void main() { 
-    mediump vec3 textureColor = texture2D(inputImageTexture, textureCoordinate).rgb;
-    mediump vec3 leftTextureColor = texture2D(inputImageTexture, leftTextureCoordinate).rgb;
-    mediump vec3 rightTextureColor = texture2D(inputImageTexture, rightTextureCoordinate).rgb;
-    mediump vec3 topTextureColor = texture2D(inputImageTexture, topTextureCoordinate).rgb;
-    mediump vec3 bottomTextureColor = texture2D(inputImageTexture, bottomTextureCoordinate).rgb;
-
-    gl_FragColor = vec4((textureColor * centerMultiplier - (leftTextureColor * edgeMultiplier + rightTextureColor * edgeMultiplier + topTextureColor * edgeMultiplier + bottomTextureColor * edgeMultiplier)), texture2D(inputImageTexture, bottomTextureCoordinate).w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/sierra.glsl b/android/src/main/res/raw/sierra.glsl
deleted file mode 100755
index ce30bbc..0000000
--- a/android/src/main/res/raw/sierra.glsl
+++ /dev/null
@@ -1,32 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //blowout;
-uniform sampler2D inputImageTexture3; //overlay;
-uniform sampler2D inputImageTexture4; //map
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec4 texel = texture2D(inputImageTexture, textureCoordinate);
-    vec3 bbTexel = texture2D(inputImageTexture2, textureCoordinate).rgb;
-
-    texel.r = texture2D(inputImageTexture3, vec2(bbTexel.r, texel.r)).r;
-    texel.g = texture2D(inputImageTexture3, vec2(bbTexel.g, texel.g)).g;
-    texel.b = texture2D(inputImageTexture3, vec2(bbTexel.b, texel.b)).b;
-
-    vec4 mapped;
-    mapped.r = texture2D(inputImageTexture4, vec2(texel.r, .16666)).r;
-    mapped.g = texture2D(inputImageTexture4, vec2(texel.g, .5)).g;
-    mapped.b = texture2D(inputImageTexture4, vec2(texel.b, .83333)).b;
-    mapped.a = 1.0;
-
-    mapped.rgb = mix(originColor.rgb, mapped.rgb, strength);
-    gl_FragColor = mapped;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/sketch.glsl b/android/src/main/res/raw/sketch.glsl
deleted file mode 100755
index 71cd794..0000000
--- a/android/src/main/res/raw/sketch.glsl
+++ /dev/null
@@ -1,49 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform vec2 singleStepOffset; 
-uniform float strength;
-
-const highp vec3 W = vec3(0.299,0.587,0.114);
-
-
-void main()
-{ 
-	float threshold = 0.0;
-	//pic1
-	vec4 oralColor = texture2D(inputImageTexture, textureCoordinate);
-	
-	//pic2
-	vec3 maxValue = vec3(0.,0.,0.);
-	
-	for(int i = -2; i<=2; i++)
-	{
-		for(int j = -2; j<=2; j++)
-		{
-			vec4 tempColor = texture2D(inputImageTexture, textureCoordinate+singleStepOffset*vec2(i,j));
-			maxValue.r = max(maxValue.r,tempColor.r);
-			maxValue.g = max(maxValue.g,tempColor.g);
-			maxValue.b = max(maxValue.b,tempColor.b);
-			threshold += dot(tempColor.rgb, W);
-		}
-	}
-	//pic3
-	float gray1 = dot(oralColor.rgb, W);
-	
-	//pic4
-	float gray2 = dot(maxValue, W);
-	
-	//pic5
-	float contour = gray1 / gray2;
-
-	threshold = threshold / 25.;
-	float alpha = max(1.0,gray1>threshold?1.0:(gray1/threshold));
-	
-	float result = contour * alpha + (1.0-alpha)*gray1;
-	
-	gl_FragColor = vec4(vec3(result,result,result), oralColor.w);
-} 
\ No newline at end of file
diff --git a/android/src/main/res/raw/skinwhiten.glsl b/android/src/main/res/raw/skinwhiten.glsl
deleted file mode 100755
index d4a2aac..0000000
--- a/android/src/main/res/raw/skinwhiten.glsl
+++ /dev/null
@@ -1,98 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-
-uniform float texelWidthOffset; 
-uniform float texelHeightOffset; 
-
-varying mediump vec2 textureCoordinate; 
-
-const mediump vec3 luminanceWeighting = vec3(0.2125, 0.7154, 0.0721); 
-
-vec4 gaussianBlur(sampler2D sampler) { 
-	lowp float strength = 1.; 
-	vec4 color = vec4(0.); 
-	vec2 step  = vec2(0.); 
-
-	color += texture2D(sampler,textureCoordinate)* 0.25449 ; 
-
-	step.x = 1.37754 * texelWidthOffset  * strength; 
-	step.y = 1.37754 * texelHeightOffset * strength;
-	color += texture2D(sampler,textureCoordinate+step) * 0.24797; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.24797; 
-
-	step.x = 3.37754 * texelWidthOffset  * strength; 
-	step.y = 3.37754 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.09122; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.09122; 
-
-	step.x = 5.37754 * texelWidthOffset  * strength; 
-	step.y = 5.37754 * texelHeightOffset * strength; 
-
-	color += texture2D(sampler,textureCoordinate+step) * 0.03356; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.03356; 
-
-	return color; 
-} 
-
-void main() { 
-	vec4 blurColor; 
-	lowp vec4 textureColor; 
-	lowp float strength = -1.0 / 510.0; 
-
-	float xCoordinate = textureCoordinate.x; 
-	float yCoordinate = textureCoordinate.y;
-
-	lowp float satura = 0.7; 
-	// naver skin 
-	textureColor = texture2D(inputImageTexture, textureCoordinate); 
-	blurColor = gaussianBlur(inputImageTexture); 
-
-	//saturation 
-    lowp float luminance = dot(blurColor.rgb, luminanceWeighting); 
-	lowp vec3 greyScaleColor = vec3(luminance); 
-
-	blurColor = vec4(mix(greyScaleColor, blurColor.rgb, satura), blurColor.w); 
-     
-	lowp float redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	lowp float greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).r; 
-    lowp float blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).r; 
-
-	redCurveValue = min(1.0, redCurveValue + strength); 
-	greenCurveValue = min(1.0, greenCurveValue + strength); 
-	blueCurveValue = min(1.0, blueCurveValue + strength); 
-
-    mediump vec4 overlay = blurColor;
-
-	mediump vec4 base = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-    //gl_FragColor = overlay; 
-
-    // step4 overlay blending 
-	mediump float ra; 
-	if (base.r < 0.5) { 
-		ra = overlay.r * base.r * 2.0; 
-	} else { 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-	} 
-
-    mediump float ga; 
-	if (base.g < 0.5) { 
-		ga = overlay.g * base.g * 2.0; 
-	} else { 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-
-	mediump float ba; 
-	if (base.b < 0.5) { 
-		ba = overlay.b * base.b * 2.0;
-	} else { 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-	} 
-
-	textureColor = vec4(ra, ga, ba, 1.0); 
-
-    gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
diff --git a/android/src/main/res/raw/suger_tablets.glsl b/android/src/main/res/raw/suger_tablets.glsl
deleted file mode 100755
index 1b73425..0000000
--- a/android/src/main/res/raw/suger_tablets.glsl
+++ /dev/null
@@ -1,41 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-varying mediump vec2 textureCoordinate;
-varying mediump vec2 textureCoordinate2; // TODO: This is not used
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; // lookup texture
-uniform mediump float strength;
-
-void main()
-{
-    mediump vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    mediump vec4 textureColor = texture2D(inputImageTexture, textureCoordinate);
-
-    mediump float blueColor = textureColor.b * 63.0;
-
-    mediump vec2 quad1;
-    quad1.y = floor(floor(blueColor) / 8.0);
-    quad1.x = floor(blueColor) - (quad1.y * 8.0);
-
-    mediump vec2 quad2;
-    quad2.y = floor(ceil(blueColor) / 8.0);
-    quad2.x = ceil(blueColor) - (quad2.y * 8.0);
-
-    mediump vec2 texPos1;
-    texPos1.x = (quad1.x * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.r);
-    texPos1.y = (quad1.y * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.g);
-
-    mediump vec2 texPos2;
-    texPos2.x = (quad2.x * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.r);
-    texPos2.y = (quad2.y * 0.125) + 0.5/512.0 + ((0.125 - 1.0/512.0) * textureColor.g);
-
-    mediump vec4 newColor1 = texture2D(inputImageTexture2, texPos1);
-    mediump vec4 newColor2 = texture2D(inputImageTexture2, texPos2);
-
-    mediump vec4 newColor = mix(newColor1, newColor2, fract(blueColor));
-
-    newColor.rgb = mix(originColor.rgb, newColor.rgb, strength);
-
-    gl_FragColor = vec4(newColor.rgb, textureColor.w);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/sunrise.glsl b/android/src/main/res/raw/sunrise.glsl
deleted file mode 100755
index c09952d..0000000
--- a/android/src/main/res/raw/sunrise.glsl
+++ /dev/null
@@ -1,150 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-
-uniform sampler2D grey1Frame; 
-uniform sampler2D grey2Frame;
-uniform sampler2D grey3Frame;
-
-void main() 
-{ 
-	float GreyVal; 
-	lowp vec4 textureColor; 
-    lowp vec4 textureColorOri; 
-    float xCoordinate = textureCoordinate.x; 
-    float yCoordinate = textureCoordinate.y;
-
-    highp float redCurveValue; 
-	highp float greenCurveValue;
-    highp float blueCurveValue; 
-
-	vec4 grey1Color;
-    vec4 grey2Color; 
-	vec4 grey3Color;
-
-    textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-
-	grey1Color = texture2D(grey1Frame, vec2(xCoordinate, yCoordinate)); 
-	grey2Color = texture2D(grey2Frame, vec2(xCoordinate, yCoordinate)); 
-	grey3Color = texture2D(grey3Frame, vec2(xCoordinate, yCoordinate)); 
-
-	mediump vec4 overlay = vec4(0, 0, 0, 1.0); 
-	mediump vec4 base = textureColor; 
-
-	// overlay blending 
-    mediump float ra; 
-    if (base.r < 0.5) 
-	{ 
-		ra = overlay.r * base.r * 2.0; 
-    } 
-	else 
-	{ 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-    } 
-
-    mediump float ga; 
-	if (base.g < 0.5)
-	{ 
-		ga = overlay.g * base.g * 2.0; 
-    }
-	else 
-	{ 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-    } 
-
-	mediump float ba;
-    if (base.b < 0.5) 
-	{ 
-		ba = overlay.b * base.b * 2.0; 
-	} 
-	else 
-	{
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-	} 
-
-	textureColor = vec4(ra, ga, ba, 1.0); 
-	base = (textureColor - base) * (grey1Color.r*0.1019) + base; 
-
-
-	// step2 60% opacity  ExclusionBlending 
-	textureColor = vec4(base.r, base.g, base.b, 1.0); 
-    mediump vec4 textureColor2 = vec4(0.098, 0.0, 0.1843, 1.0); 
-    textureColor2 = textureColor + textureColor2 - (2.0 * textureColor2 * textureColor); 
-
-	textureColor = (textureColor2 - textureColor) * 0.6 + textureColor; 
-
-    // step3 normal blending with original 
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g;
-    blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-
-    textureColorOri = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-	textureColor = (textureColorOri - textureColor) * grey2Color.r + textureColor; 
-
-	// step4 normal blending with original
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 1.0)).r; 
-    greenCurveValue = texture2D(curve, vec2(textureColor.g, 1.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 1.0)).b; 
-
-	textureColorOri = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-	textureColor = (textureColorOri - textureColor) * (grey3Color.r) * 1.0 + textureColor; 
-
-
-    overlay = vec4(0.6117, 0.6117, 0.6117, 1.0); 
-	base = textureColor;
-    // overlay blending 
-	if (base.r < 0.5) 
-	{ 
-		ra = overlay.r * base.r * 2.0; 
-	} 
-	else 
-	{ 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-	} 
-
-	if (base.g < 0.5) 
-	{ 
-		ga = overlay.g * base.g * 2.0; 
-	} 
-	else 
-	{ 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-
-	if (base.b < 0.5) 
-	{
-		ba = overlay.b * base.b * 2.0; 
-    } 
-	else 
-	{ 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0);
-	}
-	
-	textureColor = vec4(ra, ga, ba, 1.0); 
-	base = (textureColor - base) + base; 
-
-	// step5-2 30% opacity  ExclusionBlending 
-	textureColor = vec4(base.r, base.g, base.b, 1.0); 
-    textureColor2 = vec4(0.113725, 0.0039, 0.0, 1.0); 
-    textureColor2 = textureColor + textureColor2 - (2.0 * textureColor2 * textureColor); 
-
-	base = (textureColor2 - textureColor) * 0.3 + textureColor; 
-	redCurveValue = texture2D(curve, vec2(base.r, 1.0)).a; 
-	greenCurveValue = texture2D(curve, vec2(base.g, 1.0)).a; 
-    blueCurveValue = texture2D(curve, vec2(base.b, 1.0)).a; 
-
-	// step6 screen with 60%
-    base = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-	overlay = vec4(1.0, 1.0, 1.0, 1.0); 
-
-	// screen blending 
-    textureColor = 1.0 - ((1.0 - base) * (1.0 - overlay)); 
-    textureColor = (textureColor - base) * 0.05098 + base; 
-
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0);
-} 
\ No newline at end of file
diff --git a/android/src/main/res/raw/sunset.glsl b/android/src/main/res/raw/sunset.glsl
deleted file mode 100755
index e44363b..0000000
--- a/android/src/main/res/raw/sunset.glsl
+++ /dev/null
@@ -1,108 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-
-uniform sampler2D grey1Frame; 
-uniform sampler2D grey2Frame; 
-
-void main() 
-{ 
-	float GreyVal; 
-	lowp vec4 textureColor; 
-	lowp vec4 textureColorOri; 
-	float xCoordinate = textureCoordinate.x; 
-	float yCoordinate = textureCoordinate.y; 
-
-	highp float redCurveValue; 
-    highp float greenCurveValue; 
-	highp float blueCurveValue; 
-
-    vec4 grey1Color; 
-	vec4 grey2Color; 
-
-    textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-	grey1Color = texture2D(grey2Frame, vec2(xCoordinate, yCoordinate)); 
-	grey2Color = texture2D(grey1Frame, vec2(xCoordinate, yCoordinate)); 
-
-	// step1 normal blending with original 
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-
-    textureColorOri = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-    textureColor = (textureColorOri - textureColor) * grey1Color.r + textureColor; 
-
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).a; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).a; 
-    blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).a; 
-
-	//textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-
-    // step3 60% opacity  ExclusionBlending 
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-    mediump vec4 textureColor2 = vec4(0.08627, 0.03529, 0.15294, 1.0); 
-	textureColor2 = textureColor + textureColor2 - (2.0 * textureColor2 * textureColor); 
-
-    textureColor = (textureColor2 - textureColor) * 0.6784 + textureColor; 
-
-
-	mediump vec4 overlay = vec4(0.6431, 0.5882, 0.5803, 1.0); 
-	mediump vec4 base = textureColor;
-
-	// overlay blending 
-    mediump float ra; 
-	if (base.r < 0.5) { 
-		ra = overlay.r * base.r * 2.0; 
-	} else {
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-	} 
-
-	mediump float ga; 
-	if (base.g < 0.5) { 
-		ga = overlay.g * base.g * 2.0; 
-	} else { 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-
-	mediump float ba; 
-	if (base.b < 0.5) {
-		ba = overlay.b * base.b * 2.0; 
-	} else { 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-	} 
-
-	textureColor = vec4(ra, ga, ba, 1.0); 
-	base = (textureColor - base) + base;
-
-    // again overlay blending 
-    overlay = vec4(0.0, 0.0, 0.0, 1.0);
-
-	// overlay blending 
-	if (base.r < 0.5) { 
-		ra = overlay.r * base.r * 2.0; 
-	} else { 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0); 
-	} 
-
-	if (base.g < 0.5) { 
-		ga = overlay.g * base.g * 2.0;
-	} else { 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-
-	if (base.b < 0.5) { 
-		ba = overlay.b * base.b * 2.0; 
-	} else { 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-	} 
-
-    textureColor = vec4(ra, ga, ba, 1.0); 
-	textureColor = (textureColor - base) * (grey2Color * 0.549) + base; 
-
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-} 
\ No newline at end of file
diff --git a/android/src/main/res/raw/sutro.glsl b/android/src/main/res/raw/sutro.glsl
deleted file mode 100755
index 832585d..0000000
--- a/android/src/main/res/raw/sutro.glsl
+++ /dev/null
@@ -1,48 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //sutroMap;
-uniform sampler2D inputImageTexture3; //sutroMetal;
-uniform sampler2D inputImageTexture4; //softLight
-uniform sampler2D inputImageTexture5; //sutroEdgeburn
-uniform sampler2D inputImageTexture6; //sutroCurves
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-
-    vec2 tc = (2.0 * textureCoordinate) - 1.0;
-    float d = dot(tc, tc);
-    vec2 lookup = vec2(d, texel.r);
-    texel.r = texture2D(inputImageTexture2, lookup).r;
-    lookup.y = texel.g;
-    texel.g = texture2D(inputImageTexture2, lookup).g;
-    lookup.y = texel.b;
-    texel.b	= texture2D(inputImageTexture2, lookup).b;
-
-    vec3 rgbPrime = vec3(0.1019, 0.0, 0.0);
-    float m = dot(vec3(.3, .59, .11), texel.rgb) - 0.03058;
-    texel = mix(texel, rgbPrime + m, 0.32);
-
-    vec3 metal = texture2D(inputImageTexture3, textureCoordinate).rgb;
-    texel.r = texture2D(inputImageTexture4, vec2(metal.r, texel.r)).r;
-    texel.g = texture2D(inputImageTexture4, vec2(metal.g, texel.g)).g;
-    texel.b = texture2D(inputImageTexture4, vec2(metal.b, texel.b)).b;
-
-    texel = texel * texture2D(inputImageTexture5, textureCoordinate).rgb;
-
-    texel.r = texture2D(inputImageTexture6, vec2(texel.r, .16666)).r;
-    texel.g = texture2D(inputImageTexture6, vec2(texel.g, .5)).g;
-    texel.b = texture2D(inputImageTexture6, vec2(texel.b, .83333)).b;
-
-    texel.rgb = mix(originColor.rgb, texel.rgb, strength);
-
-    gl_FragColor = vec4(texel, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/sweets.glsl b/android/src/main/res/raw/sweets.glsl
deleted file mode 100755
index bced1c3..0000000
--- a/android/src/main/res/raw/sweets.glsl
+++ /dev/null
@@ -1,156 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-uniform lowp samplerExternalOES inputImageTexture;
-uniform lowp sampler2D curve;
-uniform lowp sampler2D samplerMask;
-uniform lowp int lowPerformance;
-
-uniform float texelWidthOffset ;
-uniform float texelHeightOffset;
-
-varying mediump vec2 textureCoordinate;
-
-vec4 sharpen(sampler2D sampler) 
-{ 
-	vec4 color = texture2D(sampler, textureCoordinate) * 2.; 
-	
-	color -= texture2D(sampler, textureCoordinate-vec2(texelWidthOffset, 0. )) * 0.25;
-	color -= texture2D(sampler, textureCoordinate+vec2(texelWidthOffset, 0. )) * 0.25; 
-	color -= texture2D(sampler, textureCoordinate-vec2(0., texelHeightOffset)) * 0.25; 
-	color -= texture2D(sampler, textureCoordinate+vec2(0., texelHeightOffset)) * 0.25; 
-
-    return color; 
-} 
-
-vec4 gaussianBlur(sampler2D sampler) 
-{ 
-	lowp float strength = 1.; 
-	
-	vec4 color = vec4(0.); 
-	vec2 step  = vec2(0.);
-	
-	color += texture2D(sampler,textureCoordinate)* 0.0443 ; 
-	
-	step.x = 1.49583 * texelWidthOffset  * strength; 
-	step.y = 1.49583 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+vec2(step.x, 0.)) * 0.04321; 
-	color += texture2D(sampler,textureCoordinate-vec2(step.x, 0.)) * 0.04321; 
-	color += texture2D(sampler,textureCoordinate+vec2(0., step.y)) * 0.04321; 
-	color += texture2D(sampler,textureCoordinate-vec2(0., step.y)) * 0.04321; 
-	
-	step.x = 2.4719250988753685 * texelWidthOffset  * strength; 
-	step.y = 2.4719250988753685 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.041795; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.041795; 
-	color += texture2D(sampler,textureCoordinate+vec2(-step.x, step.y)) * 0.041795; 
-	color += texture2D(sampler,textureCoordinate+vec2( step.x,-step.y)) * 0.041795; 
-	
-	step.x = 5.49583 * texelWidthOffset  * strength; 
-	step.y = 5.49583 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+vec2(step.x, 0.)) * 0.040425; 
-	color += texture2D(sampler,textureCoordinate-vec2(step.x, 0.)) * 0.040425; 
-	color += texture2D(sampler,textureCoordinate+vec2(0., step.y)) * 0.040425; 
-	color += texture2D(sampler,textureCoordinate-vec2(0., step.y)) * 0.040425; 
-
-	step.x = 5.300352223621558 * texelWidthOffset  * strength; 
-	step.y = 5.300352223621558 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.0391; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.0391; 
-	color += texture2D(sampler,textureCoordinate+vec2(-step.x, step.y)) * 0.0391; 
-	color += texture2D(sampler,textureCoordinate+vec2( step.x,-step.y)) * 0.0391; 
-
-	step.x = 9.49583 * texelWidthOffset  * strength; 
-	step.y = 9.49583 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+vec2(step.x, 0.)) * 0.037815; 
-	color += texture2D(sampler,textureCoordinate-vec2(step.x, 0.)) * 0.037815; 
-	color += texture2D(sampler,textureCoordinate+vec2(0., step.y)) * 0.037815; 
-	color += texture2D(sampler,textureCoordinate-vec2(0., step.y)) * 0.037815; 
-	
-	step.x = 8.128779348367749 * texelWidthOffset  * strength; 
-	step.y = 8.128779348367749 * texelHeightOffset * strength; 
-	color += texture2D(sampler,textureCoordinate+step) * 0.03658; 
-	color += texture2D(sampler,textureCoordinate-step) * 0.03658; 
-	color += texture2D(sampler,textureCoordinate+vec2(-step.x, step.y)) * 0.03658; 
-	color += texture2D(sampler,textureCoordinate+vec2( step.x,-step.y)) * 0.03658; 
-
-	return color; 
-} 
-
-vec4 level(vec4 color, sampler2D sampler) 
-{ 
-	color.r = texture2D(sampler, vec2(color.r, 0.)).r; 
-	color.g = texture2D(sampler, vec2(color.g, 0.)).g;
-	color.b = texture2D(sampler, vec2(color.b, 0.)).b; 
-
-	return color; 
-} 
-   
-vec4 normal(vec4 c1, vec4 c2, float alpha) 
-{ 
-	return (c2-c1) * alpha + c1;
-}   
-
-vec4 lighten(vec4 c1, vec4 c2) 
-{ 
-	return max(c1,c2);
-}
-
-vec4 overlay(vec4 c1, vec4 c2)
-{
-	vec4 r1 = vec4(0.,0.,0.,1.); 
-	r1.r = c1.r < 0.5 ? 2.0*c1.r*c2.r : 1.0 - 2.0*(1.0-c1.r)*(1.0-c2.r);
-	r1.g = c1.g < 0.5 ? 2.0*c1.g*c2.g : 1.0 - 2.0*(1.0-c1.g)*(1.0-c2.g);
-	r1.b = c1.b < 0.5 ? 2.0*c1.b*c2.b : 1.0 - 2.0*(1.0-c1.b)*(1.0-c2.b);
-	
-	return r1;
-} 
-
-vec3 lerp (vec3 x, vec3 y, float s) 
-{
-	return x+s*(y-x);
-} 
-
-vec4 adjust (vec4 color, float brightness, float contrast, float saturation)
-{
-	vec3 averageLuminance = vec3(0.5);
-	vec3 brightedColor    = color.rgb * (brightness+1.);
-	vec3 intensity        = vec3(dot(brightedColor, vec3(0.299, 0.587, 0.114)));
-	vec3 saturatedColor   = lerp(intensity, brightedColor, saturation+1.);
-	vec3 contrastedColor  = lerp(averageLuminance, saturatedColor, contrast+1.);
-	
-	return vec4(contrastedColor,1.); 
-}
-
-vec4 vibrance(vec4 color, float strength)
-{ 
-	float luminance = (color.r+color.g+color.b)/3.;
-	//dot(color.rgb, vec3(0.299,0.587,0.114)); 
-	float maximum   = max(color.r, max(color.g, color.b));
-	float amount    = (maximum-luminance)*-strength; 
-	
-	return vec4(color.rgb * (1.-amount) + maximum*amount, 1.); 
-} 
-  
-void main() 
-{ 
-	vec4 c1; 
-	vec4 c2; 
-	if (lowPerformance == 1) 
-	{ 	
-		c1 = texture2D(inputImageTexture, textureCoordinate); 	
-		c2 = texture2D(inputImageTexture, textureCoordinate); 
-    } 
-	else 
-	{ 
-		c1 = sharpen(inputImageTexture); 
-		c2 = normal(c1, gaussianBlur(inputImageTexture), 0.8); // radius = 13. sharpen?? gaussian blur? ???? ??, ?? blending?? ?? 
-	} 
-	vec4 c3 = normal(c1, lighten(c1,c2), 0.6); // lighten (0.6) 
-    c3 = adjust(c3, 0.12, 0., 0.05); // brightness = 12, saturation = 0.5; 
-    c3 = vibrance(level(c3, curve), 0.5); // vibrance = 0.5; 
-	c3 = normal(c3, overlay(c3, vec4(0.)), 1.-texture2D(samplerMask, textureCoordinate).r); // vignetting 
-	
-	gl_FragColor = c3;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/tender.glsl b/android/src/main/res/raw/tender.glsl
deleted file mode 100755
index 0247379..0000000
--- a/android/src/main/res/raw/tender.glsl
+++ /dev/null
@@ -1,92 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-uniform sampler2D grey1Frame; 
-
-void main()
-{ 
-	mediump vec4 textureColor;
-	mediump vec4 textureColorRes;
-	vec4 grey1Color;
-	mediump float satVal = 65.0 / 100.0; 
-	mediump float mask1R = 29.0 / 255.0; 
-	mediump float mask1G = 43.0 / 255.0; 
-	mediump float mask1B = 95.0 / 255.0;
-	
-	highp float xCoordinate = textureCoordinate.x;
-	highp float yCoordinate = textureCoordinate.y;
-	
-	highp float redCurveValue;
-	highp float greenCurveValue; 
-	highp float blueCurveValue; 
-
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate));
-	textureColorRes = textureColor;
-
-	grey1Color = texture2D(grey1Frame, vec2(xCoordinate, yCoordinate)); 
-
-	// step1. saturation
-    highp float G = (textureColor.r + textureColor.g + textureColor.b); 
-	G = G / 3.0; 
-
-	redCurveValue = ((1.0 - satVal) * G + satVal * textureColor.r);
-	greenCurveValue = ((1.0 - satVal) * G + satVal * textureColor.g); 
-	blueCurveValue = ((1.0 - satVal) * G + satVal * textureColor.b); 
-
-	// step2 curve 
-    redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r;
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g;
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b;
-
-	// step3 30% opacity  ExclusionBlending
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-	mediump vec4 textureColor2 = vec4(mask1R, mask1G, mask1B, 1.0);
-    textureColor2 = textureColor + textureColor2 - (2.0 * textureColor2 * textureColor); 
-
-	textureColor = (textureColor2 - textureColor) * 0.3 + textureColor; 
-
-	mediump vec4 overlay = vec4(0, 0, 0, 1.0); 
-	mediump vec4 base = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-
-	// step4 overlay blending 
-	mediump float ra; 
-    if (base.r < 0.5) 
-	{ 
-		ra = overlay.r * base.r * 2.0; 
-	} 
-	else
-	{ 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0);
-	} 
-
-	mediump float ga; 
-	if (base.g < 0.5)
-	{ 
-		ga = overlay.g * base.g * 2.0;
-	} 
-	else 
-	{ 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-
-    mediump float ba; 
-	if (base.b < 0.5) 
-	{ 
-		ba = overlay.b * base.b * 2.0; 
-	} 
-	else 
-	{ 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-	} 
-
-    textureColor = vec4(ra, ga, ba, 1.0); 
-	base = (textureColor - base) * (grey1Color.r/2.0) + base; 
-
-	gl_FragColor = vec4(base.r, base.g, base.b, 1.0);
-}
-  
\ No newline at end of file
diff --git a/android/src/main/res/raw/toaster2_filter_shader.glsl b/android/src/main/res/raw/toaster2_filter_shader.glsl
deleted file mode 100755
index a16f2a9..0000000
--- a/android/src/main/res/raw/toaster2_filter_shader.glsl
+++ /dev/null
@@ -1,68 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //toaster_metal
-uniform sampler2D inputImageTexture3; //toaster_soft_light
-uniform sampler2D inputImageTexture4; //toaster_curves
-uniform sampler2D inputImageTexture5; //toaster_overlay_map_warm
-uniform sampler2D inputImageTexture6; //toaster_color_shift
-
-void main()
-{
-    mediump vec3 texel;
-    mediump vec2 lookup;
-    vec2 blue;
-    vec2 green;
-    vec2 red;
-    mediump vec4 tmpvar_1;
-    tmpvar_1 = texture2D (inputImageTexture, textureCoordinate);
-    texel = tmpvar_1.xyz;
-    mediump vec4 tmpvar_2;
-    tmpvar_2 = texture2D (inputImageTexture2, textureCoordinate);
-    mediump vec2 tmpvar_3;
-    tmpvar_3.x = tmpvar_2.x;
-    tmpvar_3.y = tmpvar_1.x;
-    texel.x = texture2D (inputImageTexture3, tmpvar_3).x;
-    mediump vec2 tmpvar_4;
-    tmpvar_4.x = tmpvar_2.y;
-    tmpvar_4.y = tmpvar_1.y;
-    texel.y = texture2D (inputImageTexture3, tmpvar_4).y;
-    mediump vec2 tmpvar_5;
-    tmpvar_5.x = tmpvar_2.z;
-    tmpvar_5.y = tmpvar_1.z;
-    texel.z = texture2D (inputImageTexture3, tmpvar_5).z;
-    red.x = texel.x;
-    red.y = 0.16666;
-    green.x = texel.y;
-    green.y = 0.5;
-    blue.x = texel.z;
-    blue.y = 0.833333;
-    texel.x = texture2D (inputImageTexture4, red).x;
-    texel.y = texture2D (inputImageTexture4, green).y;
-    texel.z = texture2D (inputImageTexture4, blue).z;
-    mediump vec2 tmpvar_6;
-    tmpvar_6 = ((2.0 * textureCoordinate) - 1.0);
-    mediump vec2 tmpvar_7;
-    tmpvar_7.x = dot (tmpvar_6, tmpvar_6);
-    tmpvar_7.y = texel.x;
-    lookup = tmpvar_7;
-    texel.x = texture2D (inputImageTexture5, tmpvar_7).x;
-    lookup.y = texel.y;
-    texel.y = texture2D (inputImageTexture5, lookup).y;
-    lookup.y = texel.z;
-    texel.z = texture2D (inputImageTexture5, lookup).z;
-    red.x = texel.x;
-    green.x = texel.y;
-    blue.x = texel.z;
-    texel.x = texture2D (inputImageTexture6, red).x;
-    texel.y = texture2D (inputImageTexture6, green).y;
-    texel.z = texture2D (inputImageTexture6, blue).z;
-    mediump vec4 tmpvar_8;
-    tmpvar_8.w = 1.0;
-    tmpvar_8.xyz = texel;
-    gl_FragColor = tmpvar_8;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/valencia.glsl b/android/src/main/res/raw/valencia.glsl
deleted file mode 100755
index de7e429..0000000
--- a/android/src/main/res/raw/valencia.glsl
+++ /dev/null
@@ -1,46 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //map
-uniform sampler2D inputImageTexture3; //gradMap
-
-mat3 saturateMatrix = mat3(
-                           1.1402,
-                           -0.0598,
-                           -0.061,
-                           -0.1174,
-                           1.0826,
-                           -0.1186,
-                           -0.0228,
-                           -0.0228,
-                           1.1772);
-
-vec3 lumaCoeffs = vec3(.3, .59, .11);
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-
-    texel = vec3(
-                 texture2D(inputImageTexture2, vec2(texel.r, .1666666)).r,
-                 texture2D(inputImageTexture2, vec2(texel.g, .5)).g,
-                 texture2D(inputImageTexture2, vec2(texel.b, .8333333)).b
-                 );
-
-    texel = saturateMatrix * texel;
-    float luma = dot(lumaCoeffs, texel);
-    texel = vec3(
-                 texture2D(inputImageTexture3, vec2(luma, texel.r)).r,
-                 texture2D(inputImageTexture3, vec2(luma, texel.g)).g,
-                 texture2D(inputImageTexture3, vec2(luma, texel.b)).b);
-
-    texel.rgb = mix(originColor.rgb, texel.rgb, strength);
-    gl_FragColor = vec4(texel, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/vertex.glsl b/android/src/main/res/raw/vertex.glsl
deleted file mode 100755
index da85635..0000000
--- a/android/src/main/res/raw/vertex.glsl
+++ /dev/null
@@ -1,11 +0,0 @@
-attribute vec4 position;
-attribute vec4 inputTextureCoordinate;
-
-varying vec2 textureCoordinate;
-
-uniform mat4 textureTransform;
-
-void main() {
-    textureCoordinate = (textureTransform * inputTextureCoordinate).xy;
-    gl_Position = position;
-}
diff --git a/android/src/main/res/raw/vertex_sharpen.glsl b/android/src/main/res/raw/vertex_sharpen.glsl
deleted file mode 100755
index d1050ae..0000000
--- a/android/src/main/res/raw/vertex_sharpen.glsl
+++ /dev/null
@@ -1,31 +0,0 @@
-attribute vec4 position;
-attribute vec4 inputTextureCoordinate;
-
-uniform float imageWidthFactor;
-uniform float imageHeightFactor;
-uniform float sharpness;
-
-varying vec2 textureCoordinate;
-varying vec2 leftTextureCoordinate;
-varying vec2 rightTextureCoordinate;
-varying vec2 topTextureCoordinate;
-varying vec2 bottomTextureCoordinate;
-
-varying float centerMultiplier;
-varying float edgeMultiplier;
-
-void main() {
-    gl_Position = position;
-
-    mediump vec2 widthStep = vec2(imageWidthFactor, 0.0);
-    mediump vec2 heightStep = vec2(0.0, imageHeightFactor);
-
-    textureCoordinate = inputTextureCoordinate.xy;
-    leftTextureCoordinate = inputTextureCoordinate.xy - widthStep;
-    rightTextureCoordinate = inputTextureCoordinate.xy + widthStep;
-    topTextureCoordinate = inputTextureCoordinate.xy + heightStep;
-    bottomTextureCoordinate = inputTextureCoordinate.xy - heightStep;
-
-    centerMultiplier = 1.0 + 4.0 * sharpness;
-    edgeMultiplier = sharpness;
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/walden.glsl b/android/src/main/res/raw/walden.glsl
deleted file mode 100755
index 19c9b5e..0000000
--- a/android/src/main/res/raw/walden.glsl
+++ /dev/null
@@ -1,35 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //map
-uniform sampler2D inputImageTexture3; //vigMap
-
-uniform float strength;
- 
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-     
-    texel = vec3(
-                  texture2D(inputImageTexture2, vec2(texel.r, .16666)).r,
-                  texture2D(inputImageTexture2, vec2(texel.g, .5)).g,
-                  texture2D(inputImageTexture2, vec2(texel.b, .83333)).b);
-     
-    vec2 tc = (2.0 * textureCoordinate) - 1.0;
-    float d = dot(tc, tc);
-    vec2 lookup = vec2(d, texel.r);
-    texel.r = texture2D(inputImageTexture3, lookup).r;
-    lookup.y = texel.g;
-    texel.g = texture2D(inputImageTexture3, lookup).g;
-    lookup.y = texel.b;
-    texel.b	= texture2D(inputImageTexture3, lookup).b;
-     
-    texel.rgb = mix(originColor.rgb, texel.rgb, strength);
-
-    gl_FragColor = vec4(texel, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/warm.glsl b/android/src/main/res/raw/warm.glsl
deleted file mode 100755
index d2f4048..0000000
--- a/android/src/main/res/raw/warm.glsl
+++ /dev/null
@@ -1,58 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve;
-uniform sampler2D greyFrame;
-uniform sampler2D layerImage;
-
-void main()
-{ 
-	lowp vec4 textureColor; 
-	vec4 greyColor;
-	vec4 layerColor;
-	
-	float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y;
-	
-	highp float redCurveValue;
-	highp float greenCurveValue; 
-	highp float blueCurveValue;
-	
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate));
-	
-	greyColor = texture2D(greyFrame, vec2(xCoordinate, yCoordinate));
-	layerColor = texture2D(layerImage, vec2(xCoordinate, yCoordinate));
-
-	// step1 curve
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g;
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-
-    // step2 curve with mask 
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0);
-
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).a;
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).a; 
-    blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).a; 
-
-	lowp vec4 textureColor2 = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-
-	// step3 screen with 60% 
-	lowp vec4 base = vec4(mix(textureColor.rgb, textureColor2.rgb, 1.0 - greyColor.r), textureColor.a); 
-	lowp vec4 overlayer = vec4(layerColor.r, layerColor.g, layerColor.b, 1.0);
-
-    // screen blending 
-	textureColor = 1.0 - ((1.0 - base) * (1.0 - overlayer));
-	textureColor = (textureColor - base) * 0.6 + base;
-	
-	redCurveValue = texture2D(curve, vec2(textureColor.r, 1.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 1.0)).g;
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 1.0)).b; 
-	textureColor = vec4(redCurveValue, greenCurveValue, blueCurveValue, 1.0); 
-
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-}
\ No newline at end of file
diff --git a/android/src/main/res/raw/whitecat.glsl b/android/src/main/res/raw/whitecat.glsl
deleted file mode 100755
index fab69d6..0000000
--- a/android/src/main/res/raw/whitecat.glsl
+++ /dev/null
@@ -1,104 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision highp float;
-
-varying highp vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D curve; 
-
-vec3 rgb2hsv(vec3 c) 
-{ 
-	vec4 K = vec4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0); 
-	vec4 p = mix(vec4(c.bg, K.wz), vec4(c.gb, K.xy), step(c.b, c.g));
-	vec4 q = mix(vec4(p.xyw, c.r), vec4(c.r, p.yzx), step(p.x, c.r)); 
-	
-	float d = q.x - min(q.w, q.y); 
-	float e = 1.0e-10; 
-	return vec3(abs(q.z + (q.w - q.y) / (6.0 * d + e)), d / (q.x + e), q.x); 
-} 
-
-vec3 hsv2rgb(vec3 c) 
-{ 
-	vec4 K = vec4(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0); 
-	vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www); 
-	return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y); 
-} 
-
-void main() 
-{ 
-    float GreyVal; 
-	lowp vec4 textureColor; 
-	lowp vec4 textureColorOri;
-	float xCoordinate = textureCoordinate.x;
-	float yCoordinate = textureCoordinate.y; 
-
-	highp float redCurveValue;
-	highp float greenCurveValue;
-	highp float blueCurveValue; 
-
-	textureColor = texture2D( inputImageTexture, vec2(xCoordinate, yCoordinate)); 
-
-	// step1 20% opacity  ExclusionBlending 
-    mediump vec4 textureColor2 = textureColor; 
-	textureColor2 = textureColor + textureColor2 - (2.0 * textureColor2 * textureColor); 
-
-	textureColor = (textureColor2 - textureColor) * 0.2 + textureColor; 
-
-    // step2 curve 
-    redCurveValue = texture2D(curve, vec2(textureColor.r, 0.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(textureColor.g, 0.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(textureColor.b, 0.0)).b; 
-
-    redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).r; 
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).r;
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).r; 
-
-	redCurveValue = texture2D(curve, vec2(redCurveValue, 1.0)).g; 
-	greenCurveValue = texture2D(curve, vec2(greenCurveValue, 1.0)).g; 
-	blueCurveValue = texture2D(curve, vec2(blueCurveValue, 1.0)).g; 
-
-
-	vec3 tColor = vec3(redCurveValue, greenCurveValue, blueCurveValue); 
-	tColor = rgb2hsv(tColor); 
-
-	tColor.g = tColor.g * 0.65; 
-
-	tColor = hsv2rgb(tColor); 
-    tColor = clamp(tColor, 0.0, 1.0); 
-
-    mediump vec4 base = vec4(tColor, 1.0); 
-	mediump vec4 overlay = vec4(0.62, 0.6, 0.498, 1.0); 
-	// step6 overlay blending 
-    mediump float ra; 
-	if (base.r < 0.5) 
-	{ 
-		ra = overlay.r * base.r * 2.0;
-	} else 
-	{ 
-		ra = 1.0 - ((1.0 - base.r) * (1.0 - overlay.r) * 2.0);
-	}
-
-    mediump float ga; 
-	if (base.g < 0.5) 
-	{ 
-		ga = overlay.g * base.g * 2.0; 
-	} else 
-	{ 
-		ga = 1.0 - ((1.0 - base.g) * (1.0 - overlay.g) * 2.0); 
-	} 
-
-	mediump float ba; 
-	if (base.b < 0.5) 
-	{ 
-		ba = overlay.b * base.b * 2.0; 
-	} else 
-	{ 
-		ba = 1.0 - ((1.0 - base.b) * (1.0 - overlay.b) * 2.0); 
-	} 
-	textureColor = vec4(ra, ga, ba, 1.0); 
-	textureColor = (textureColor - base) * 0.1 + base; 
-
-	gl_FragColor = vec4(textureColor.r, textureColor.g, textureColor.b, 1.0); 
-} 
-	
\ No newline at end of file
diff --git a/android/src/main/res/raw/xproii_filter_shader.glsl b/android/src/main/res/raw/xproii_filter_shader.glsl
deleted file mode 100755
index 98d9a9a..0000000
--- a/android/src/main/res/raw/xproii_filter_shader.glsl
+++ /dev/null
@@ -1,37 +0,0 @@
-#extension GL_OES_EGL_image_external : require
-
-precision mediump float;
-
-varying mediump vec2 textureCoordinate;
-
-uniform samplerExternalOES inputImageTexture;
-uniform sampler2D inputImageTexture2; //map
-uniform sampler2D inputImageTexture3; //vigMap
-
-uniform float strength;
-
-void main()
-{
-    vec4 originColor = texture2D(inputImageTexture, textureCoordinate);
-    vec3 texel = texture2D(inputImageTexture, textureCoordinate).rgb;
-
-    vec2 tc = (2.0 * textureCoordinate) - 1.0;
-    float d = dot(tc, tc);
-    vec2 lookup = vec2(d, texel.r);
-    texel.r = texture2D(inputImageTexture3, lookup).r;
-    lookup.y = texel.g;
-    texel.g = texture2D(inputImageTexture3, lookup).g;
-    lookup.y = texel.b;
-    texel.b	= texture2D(inputImageTexture3, lookup).b;
-
-    vec2 red = vec2(texel.r, 0.16666);
-    vec2 green = vec2(texel.g, 0.5);
-    vec2 blue = vec2(texel.b, .83333);
-    texel.r = texture2D(inputImageTexture2, red).r;
-    texel.g = texture2D(inputImageTexture2, green).g;
-    texel.b = texture2D(inputImageTexture2, blue).b;
-    
-    texel.rgb = mix(originColor.rgb, texel.rgb, strength);
-     
-    gl_FragColor = vec4(texel, 1.0);
-}
\ No newline at end of file
diff --git a/android/src/main/res/values/strings.xml b/android/src/main/res/values/strings.xml
index af596fc..7f6c63a 100755
--- a/android/src/main/res/values/strings.xml
+++ b/android/src/main/res/values/strings.xml
@@ -1,3 +1,3 @@
 <resources>
-    <string name="app_name">yasea</string>
+    <string name="app_name">pedro</string>
 </resources>
diff --git a/index.js b/index.js
index 74aa14c..c2225fa 100644
--- a/index.js
+++ b/index.js
@@ -7,17 +7,6 @@ class BroadcastView extends React.Component {
   }
 }
 
-BroadcastView.propTypes = {
-  /**
-   * cameraPosition: 'front' or 'back'
-   * publish: If an rtmpURL is provided, will start publishing.
-              If empty string provided, will stop publishing.
-   */
-  ...View.propTypes,
-  cameraPosition: React.PropTypes.string,
-  publish: React.PropTypes.string
-}
-
 var RNBroadcastView = requireNativeComponent('RNBroadcastView', BroadcastView)
 
 module.exports = BroadcastView
diff --git a/ios/RNBroadcast.podspec b/ios/RNBroadcast.podspec
deleted file mode 100644
index c089040..0000000
--- a/ios/RNBroadcast.podspec
+++ /dev/null
@@ -1,24 +0,0 @@
-
-Pod::Spec.new do |s|
-  s.name         = "RNBroadcast"
-  s.version      = "1.0.0"
-  s.summary      = "RNBroadcast"
-  s.description  = <<-DESC
-                  RNBroadcast
-                   DESC
-  s.homepage     = ""
-  s.license      = "MIT"
-  # s.license      = { :type => "MIT", :file => "FILE_LICENSE" }
-  s.author             = { "author" => "author@domain.cn" }
-  s.platform     = :ios, "7.0"
-  s.source       = { :git => "https://github.com/author/RNBroadcast.git", :tag => "master" }
-  s.source_files  = "RNBroadcast/**/*.{h,m}"
-  s.requires_arc = true
-
-
-  s.dependency "React"
-  #s.dependency "others"
-
-end
-
-  
\ No newline at end of file
diff --git a/ios/RNBroadcast.xcodeproj/project.pbxproj b/ios/RNBroadcast.xcodeproj/project.pbxproj
index bba5f5c..a8e3c34 100644
--- a/ios/RNBroadcast.xcodeproj/project.pbxproj
+++ b/ios/RNBroadcast.xcodeproj/project.pbxproj
@@ -7,6 +7,7 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		3AEF1C1D233A8A4C00E57B41 /* RNBroadcast.m in Sources */ = {isa = PBXBuildFile; fileRef = 3AEF1C1C233A8A4C00E57B41 /* RNBroadcast.m */; };
 		4B00137B1E92A69900FE1DD6 /* UIKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4B00137A1E92A69900FE1DD6 /* UIKit.framework */; };
 		4B00137D1E92A69F00FE1DD6 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4B00137C1E92A69F00FE1DD6 /* Foundation.framework */; };
 		4B00137F1E92A6A800FE1DD6 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4B00137E1E92A6A800FE1DD6 /* AVFoundation.framework */; };
@@ -215,7 +216,6 @@
 		4BC760361E92CA67000373DA /* LFLivePreview.m in Sources */ = {isa = PBXBuildFile; fileRef = 4BC760301E92CA67000373DA /* LFLivePreview.m */; };
 		4BC760371E92CA67000373DA /* UIControl+YYAdd.m in Sources */ = {isa = PBXBuildFile; fileRef = 4BC760331E92CA67000373DA /* UIControl+YYAdd.m */; };
 		4BC760381E92CA67000373DA /* UIView+YYAdd.m in Sources */ = {isa = PBXBuildFile; fileRef = 4BC760351E92CA67000373DA /* UIView+YYAdd.m */; };
-		B3E7B58A1CC2AC0600A0062D /* RNBroadcast.m in Sources */ = {isa = PBXBuildFile; fileRef = B3E7B5891CC2AC0600A0062D /* RNBroadcast.m */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXCopyFilesBuildPhase section */
@@ -232,6 +232,7 @@
 
 /* Begin PBXFileReference section */
 		134814201AA4EA6300B7C361 /* libRNBroadcast.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNBroadcast.a; sourceTree = BUILT_PRODUCTS_DIR; };
+		3AEF1C1C233A8A4C00E57B41 /* RNBroadcast.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNBroadcast.m; sourceTree = "<group>"; };
 		4B00137A1E92A69900FE1DD6 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
 		4B00137C1E92A69F00FE1DD6 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
 		4B00137E1E92A6A800FE1DD6 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
@@ -662,7 +663,6 @@
 		4BC760331E92CA67000373DA /* UIControl+YYAdd.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "UIControl+YYAdd.m"; sourceTree = "<group>"; };
 		4BC760341E92CA67000373DA /* UIView+YYAdd.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "UIView+YYAdd.h"; sourceTree = "<group>"; };
 		4BC760351E92CA67000373DA /* UIView+YYAdd.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "UIView+YYAdd.m"; sourceTree = "<group>"; };
-		B3E7B5891CC2AC0600A0062D /* RNBroadcast.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNBroadcast.m; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -689,6 +689,19 @@
 			name = Products;
 			sourceTree = "<group>";
 		};
+		3AEF1C1B233A89D300E57B41 /* RNBroadcast */ = {
+			isa = PBXGroup;
+			children = (
+				4B90FEA51E92E0BD00B96D9B /* images */,
+				4BC760311E92CA67000373DA /* category */,
+				4BC75DC01E92CA28000373DA /* LFLiveKit */,
+				3AEF1C1C233A8A4C00E57B41 /* RNBroadcast.m */,
+				4BC760301E92CA67000373DA /* LFLivePreview.m */,
+				4BC7602F1E92CA67000373DA /* LFLivePreview.h */,
+			);
+			path = RNBroadcast;
+			sourceTree = "<group>";
+		};
 		4B0013791E92A69800FE1DD6 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
@@ -1244,12 +1257,7 @@
 		58B511D21A9E6C8500147676 = {
 			isa = PBXGroup;
 			children = (
-				B3E7B5891CC2AC0600A0062D /* RNBroadcast.m */,
-				4BC7602F1E92CA67000373DA /* LFLivePreview.h */,
-				4BC760301E92CA67000373DA /* LFLivePreview.m */,
-				4B90FEA51E92E0BD00B96D9B /* images */,
-				4BC760311E92CA67000373DA /* category */,
-				4BC75DC01E92CA28000373DA /* LFLiveKit */,
+				3AEF1C1B233A89D300E57B41 /* RNBroadcast */,
 				134814211AA4EA7D00B7C361 /* Products */,
 				4B0013791E92A69800FE1DD6 /* Frameworks */,
 			);
@@ -1294,6 +1302,7 @@
 			developmentRegion = English;
 			hasScannedForEncodings = 0;
 			knownRegions = (
+				English,
 				en,
 			);
 			mainGroup = 58B511D21A9E6C8500147676;
@@ -1333,7 +1342,6 @@
 				4BC760071E92CA29000373DA /* GPUImageSourceOverBlendFilter.m in Sources */,
 				4BC75FDB1E92CA29000373DA /* GPUImageMotionBlurFilter.m in Sources */,
 				4BC75F761E92CA29000373DA /* LFFrame.m in Sources */,
-				B3E7B58A1CC2AC0600A0062D /* RNBroadcast.m in Sources */,
 				4BC75F8B1E92CA29000373DA /* GPUImageCannyEdgeDetectionFilter.m in Sources */,
 				4BC75F6D1E92CA29000373DA /* LFH264VideoEncoder.mm in Sources */,
 				4BC75F6C1E92CA29000373DA /* LFVideoEncoder.m in Sources */,
@@ -1357,6 +1365,7 @@
 				4BC75F7A1E92CA29000373DA /* LFStreamingBuffer.m in Sources */,
 				4BC760381E92CA67000373DA /* UIView+YYAdd.m in Sources */,
 				4BC75FB11E92CA29000373DA /* GPUImageGammaFilter.m in Sources */,
+				3AEF1C1D233A8A4C00E57B41 /* RNBroadcast.m in Sources */,
 				4BC760081E92CA29000373DA /* GPUImageSphereRefractionFilter.m in Sources */,
 				4BC75FF61E92CA29000373DA /* GPUImageRGBFilter.m in Sources */,
 				4BC75FBA1E92CA29000373DA /* GPUImageHazeFilter.m in Sources */,
diff --git a/ios/LFLiveKit/LFLiveKit.h b/ios/RNBroadcast/LFLiveKit/LFLiveKit.h
similarity index 100%
rename from ios/LFLiveKit/LFLiveKit.h
rename to ios/RNBroadcast/LFLiveKit/LFLiveKit.h
diff --git a/ios/LFLiveKit/LFLiveSession.h b/ios/RNBroadcast/LFLiveKit/LFLiveSession.h
similarity index 100%
rename from ios/LFLiveKit/LFLiveSession.h
rename to ios/RNBroadcast/LFLiveKit/LFLiveSession.h
diff --git a/ios/LFLiveKit/LFLiveSession.m b/ios/RNBroadcast/LFLiveKit/LFLiveSession.m
similarity index 100%
rename from ios/LFLiveKit/LFLiveSession.m
rename to ios/RNBroadcast/LFLiveKit/LFLiveSession.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GLProgram.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GLProgram.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GLProgram.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GLProgram.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GLProgram.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GLProgram.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GLProgram.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GLProgram.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImage.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImage.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3ConvolutionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImage3x3TextureSamplingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAdaptiveThresholdFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAddBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAlphaBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAmatorkaFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageColor.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageAverageLuminanceThresholdFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBilateralFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBoxBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBrightnessFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBuffer.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageBulgeDistortionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCGAColorspaceFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCannyEdgeDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageChromaKeyFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageClosingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorBurnBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorConversion.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorDodgeBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorInvertFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorLocalBinaryPatternFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorMatrixFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColorPackingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTFeatureDetector.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageColourFASTSamplingOperation.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageContrastFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCropFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshairGenerator.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageCrosshatchFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDarkenBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDifferenceBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDilationFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalNonMaximumSuppressionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDirectionalSobelEdgeDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDissolveBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageDivideBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageEmbossFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageErosionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExclusionBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageExposureFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFASTCornerDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFalseColorFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterGroup.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFilterPipeline.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFourInputFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebuffer.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageFramebufferCache.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGammaFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianBlurPositionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGaussianSelectiveBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGlassSphereFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageGrayscaleFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHSBFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHalftoneFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHardLightBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHarrisCornerDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHazeFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighPassFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHighlightShadowTintFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramEqualizationFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHistogramGenerator.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHoughTransformLineDetector.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageHueFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageJFAVoronoiFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageKuwaharaRadius3Filter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLanczosResamplingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLaplacianFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLevelsFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLightenBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLineGenerator.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLinearBurnBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLocalBinaryPatternFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLookupFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLowPassFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceRangeFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminanceThresholdFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosity.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageLuminosityBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMaskFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMedianFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMissEtikateFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMonochromeFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMosaicFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMotionDetector.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMovie.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovie.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMovie.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovie.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMovie.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovie.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMovie.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovie.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMovieComposition.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageMultiplyBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNobleCornerDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNonMaximumSuppressionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageNormalBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpacityFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOpeningFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOutput.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOutput.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOutput.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOutput.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOutput.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOutput.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOutput.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOutput.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageOverlayBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageParallelCoordinateLineTransformFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePerlinNoiseFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePinchDistortionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellateFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePixellatePositionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePoissonBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolarPixellateFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePolkaDotFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePosterizeFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImagePrewittEdgeDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBClosingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBDilationFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBErosionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRGBOpeningFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataInput.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageRawDataOutput.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSaturationFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageScreenBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSepiaFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSharpenFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageShiTomasiFeatureDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSingleComponentGaussianBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSketchFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSkinToneFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSmoothToonFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSobelEdgeDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftEleganceFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSoftLightBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolarizeFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSolidColorGenerator.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSourceOverBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSphereRefractionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStillCamera.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageStretchDistortionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSubtractBlendFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageSwirlFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureInput.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTextureOutput.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThreeInputFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdEdgeDetectionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdSketchFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageThresholdedNonMaximumSuppressionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTiltShiftFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToneCurveFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageToonFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTransformFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputCrossTextureSamplingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoInputFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageTwoPassTextureSamplingFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUIElement.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageUnsharpMaskFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVideoCamera.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVignetteFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageVoronoiConsumerFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWeakPixelInclusionFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageWhiteBalanceFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageXYDerivativeFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageZoomBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/GPUImageiOSBlurFilter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/Framework/GPUImageFramework.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/Framework/GPUImageFramework.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/Framework/GPUImageFramework.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/Framework/GPUImageFramework.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageContext.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageMovieWriter.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture+TextureSubimage.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImagePicture.m
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.h b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.h
diff --git a/ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.m b/ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.m
similarity index 100%
rename from ios/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.m
rename to ios/RNBroadcast/LFLiveKit/Vendor/GPUImage/iOS/GPUImageView.m
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/amf.c b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/amf.c
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/amf.c
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/amf.c
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/amf.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/amf.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/amf.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/amf.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/bytes.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/bytes.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/bytes.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/bytes.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/dh.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/dh.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/dh.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/dh.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/dhgroups.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/dhgroups.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/dhgroups.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/dhgroups.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/error.c b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/error.c
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/error.c
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/error.c
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/error.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/error.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/error.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/error.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/handshake.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/handshake.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/handshake.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/handshake.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/hashswf.c b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/hashswf.c
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/hashswf.c
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/hashswf.c
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/http.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/http.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/http.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/http.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/log.c b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/log.c
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/log.c
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/log.c
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/log.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/log.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/log.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/log.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/parseurl.c b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/parseurl.c
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/parseurl.c
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/parseurl.c
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/rtmp.c b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/rtmp.c
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/rtmp.c
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/rtmp.c
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/rtmp.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/rtmp.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/rtmp.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/rtmp.h
diff --git a/ios/LFLiveKit/Vendor/pili-librtmp/rtmp_sys.h b/ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/rtmp_sys.h
similarity index 100%
rename from ios/LFLiveKit/Vendor/pili-librtmp/rtmp_sys.h
rename to ios/RNBroadcast/LFLiveKit/Vendor/pili-librtmp/rtmp_sys.h
diff --git a/ios/LFLiveKit/capture/LFAudioCapture.h b/ios/RNBroadcast/LFLiveKit/capture/LFAudioCapture.h
similarity index 100%
rename from ios/LFLiveKit/capture/LFAudioCapture.h
rename to ios/RNBroadcast/LFLiveKit/capture/LFAudioCapture.h
diff --git a/ios/LFLiveKit/capture/LFAudioCapture.m b/ios/RNBroadcast/LFLiveKit/capture/LFAudioCapture.m
similarity index 100%
rename from ios/LFLiveKit/capture/LFAudioCapture.m
rename to ios/RNBroadcast/LFLiveKit/capture/LFAudioCapture.m
diff --git a/ios/LFLiveKit/capture/LFVideoCapture.h b/ios/RNBroadcast/LFLiveKit/capture/LFVideoCapture.h
similarity index 100%
rename from ios/LFLiveKit/capture/LFVideoCapture.h
rename to ios/RNBroadcast/LFLiveKit/capture/LFVideoCapture.h
diff --git a/ios/LFLiveKit/capture/LFVideoCapture.m b/ios/RNBroadcast/LFLiveKit/capture/LFVideoCapture.m
similarity index 100%
rename from ios/LFLiveKit/capture/LFVideoCapture.m
rename to ios/RNBroadcast/LFLiveKit/capture/LFVideoCapture.m
diff --git a/ios/LFLiveKit/coder/H264/LFAVEncoder.h b/ios/RNBroadcast/LFLiveKit/coder/H264/LFAVEncoder.h
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFAVEncoder.h
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFAVEncoder.h
diff --git a/ios/LFLiveKit/coder/H264/LFAVEncoder.mm b/ios/RNBroadcast/LFLiveKit/coder/H264/LFAVEncoder.mm
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFAVEncoder.mm
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFAVEncoder.mm
diff --git a/ios/LFLiveKit/coder/H264/LFMP4Atom.h b/ios/RNBroadcast/LFLiveKit/coder/H264/LFMP4Atom.h
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFMP4Atom.h
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFMP4Atom.h
diff --git a/ios/LFLiveKit/coder/H264/LFMP4Atom.m b/ios/RNBroadcast/LFLiveKit/coder/H264/LFMP4Atom.m
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFMP4Atom.m
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFMP4Atom.m
diff --git a/ios/LFLiveKit/coder/H264/LFNALUnit.cpp b/ios/RNBroadcast/LFLiveKit/coder/H264/LFNALUnit.cpp
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFNALUnit.cpp
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFNALUnit.cpp
diff --git a/ios/LFLiveKit/coder/H264/LFNALUnit.h b/ios/RNBroadcast/LFLiveKit/coder/H264/LFNALUnit.h
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFNALUnit.h
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFNALUnit.h
diff --git a/ios/LFLiveKit/coder/H264/LFVideoEncoder.h b/ios/RNBroadcast/LFLiveKit/coder/H264/LFVideoEncoder.h
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFVideoEncoder.h
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFVideoEncoder.h
diff --git a/ios/LFLiveKit/coder/H264/LFVideoEncoder.m b/ios/RNBroadcast/LFLiveKit/coder/H264/LFVideoEncoder.m
similarity index 100%
rename from ios/LFLiveKit/coder/H264/LFVideoEncoder.m
rename to ios/RNBroadcast/LFLiveKit/coder/H264/LFVideoEncoder.m
diff --git a/ios/LFLiveKit/coder/LFAudioEncoding.h b/ios/RNBroadcast/LFLiveKit/coder/LFAudioEncoding.h
similarity index 100%
rename from ios/LFLiveKit/coder/LFAudioEncoding.h
rename to ios/RNBroadcast/LFLiveKit/coder/LFAudioEncoding.h
diff --git a/ios/LFLiveKit/coder/LFH264VideoEncoder.h b/ios/RNBroadcast/LFLiveKit/coder/LFH264VideoEncoder.h
similarity index 100%
rename from ios/LFLiveKit/coder/LFH264VideoEncoder.h
rename to ios/RNBroadcast/LFLiveKit/coder/LFH264VideoEncoder.h
diff --git a/ios/LFLiveKit/coder/LFH264VideoEncoder.mm b/ios/RNBroadcast/LFLiveKit/coder/LFH264VideoEncoder.mm
similarity index 100%
rename from ios/LFLiveKit/coder/LFH264VideoEncoder.mm
rename to ios/RNBroadcast/LFLiveKit/coder/LFH264VideoEncoder.mm
diff --git a/ios/LFLiveKit/coder/LFHardwareAudioEncoder.h b/ios/RNBroadcast/LFLiveKit/coder/LFHardwareAudioEncoder.h
similarity index 100%
rename from ios/LFLiveKit/coder/LFHardwareAudioEncoder.h
rename to ios/RNBroadcast/LFLiveKit/coder/LFHardwareAudioEncoder.h
diff --git a/ios/LFLiveKit/coder/LFHardwareAudioEncoder.m b/ios/RNBroadcast/LFLiveKit/coder/LFHardwareAudioEncoder.m
similarity index 100%
rename from ios/LFLiveKit/coder/LFHardwareAudioEncoder.m
rename to ios/RNBroadcast/LFLiveKit/coder/LFHardwareAudioEncoder.m
diff --git a/ios/LFLiveKit/coder/LFHardwareVideoEncoder.h b/ios/RNBroadcast/LFLiveKit/coder/LFHardwareVideoEncoder.h
similarity index 100%
rename from ios/LFLiveKit/coder/LFHardwareVideoEncoder.h
rename to ios/RNBroadcast/LFLiveKit/coder/LFHardwareVideoEncoder.h
diff --git a/ios/LFLiveKit/coder/LFHardwareVideoEncoder.m b/ios/RNBroadcast/LFLiveKit/coder/LFHardwareVideoEncoder.m
similarity index 100%
rename from ios/LFLiveKit/coder/LFHardwareVideoEncoder.m
rename to ios/RNBroadcast/LFLiveKit/coder/LFHardwareVideoEncoder.m
diff --git a/ios/LFLiveKit/coder/LFVideoEncoding.h b/ios/RNBroadcast/LFLiveKit/coder/LFVideoEncoding.h
similarity index 100%
rename from ios/LFLiveKit/coder/LFVideoEncoding.h
rename to ios/RNBroadcast/LFLiveKit/coder/LFVideoEncoding.h
diff --git a/ios/LFLiveKit/configuration/LFLiveAudioConfiguration.h b/ios/RNBroadcast/LFLiveKit/configuration/LFLiveAudioConfiguration.h
similarity index 100%
rename from ios/LFLiveKit/configuration/LFLiveAudioConfiguration.h
rename to ios/RNBroadcast/LFLiveKit/configuration/LFLiveAudioConfiguration.h
diff --git a/ios/LFLiveKit/configuration/LFLiveAudioConfiguration.m b/ios/RNBroadcast/LFLiveKit/configuration/LFLiveAudioConfiguration.m
similarity index 100%
rename from ios/LFLiveKit/configuration/LFLiveAudioConfiguration.m
rename to ios/RNBroadcast/LFLiveKit/configuration/LFLiveAudioConfiguration.m
diff --git a/ios/LFLiveKit/configuration/LFLiveVideoConfiguration.h b/ios/RNBroadcast/LFLiveKit/configuration/LFLiveVideoConfiguration.h
similarity index 100%
rename from ios/LFLiveKit/configuration/LFLiveVideoConfiguration.h
rename to ios/RNBroadcast/LFLiveKit/configuration/LFLiveVideoConfiguration.h
diff --git a/ios/LFLiveKit/configuration/LFLiveVideoConfiguration.m b/ios/RNBroadcast/LFLiveKit/configuration/LFLiveVideoConfiguration.m
similarity index 100%
rename from ios/LFLiveKit/configuration/LFLiveVideoConfiguration.m
rename to ios/RNBroadcast/LFLiveKit/configuration/LFLiveVideoConfiguration.m
diff --git a/ios/LFLiveKit/filter/LFGPUImageBeautyFilter.h b/ios/RNBroadcast/LFLiveKit/filter/LFGPUImageBeautyFilter.h
similarity index 100%
rename from ios/LFLiveKit/filter/LFGPUImageBeautyFilter.h
rename to ios/RNBroadcast/LFLiveKit/filter/LFGPUImageBeautyFilter.h
diff --git a/ios/LFLiveKit/filter/LFGPUImageBeautyFilter.m b/ios/RNBroadcast/LFLiveKit/filter/LFGPUImageBeautyFilter.m
similarity index 100%
rename from ios/LFLiveKit/filter/LFGPUImageBeautyFilter.m
rename to ios/RNBroadcast/LFLiveKit/filter/LFGPUImageBeautyFilter.m
diff --git a/ios/LFLiveKit/filter/LFGPUImageEmptyFilter.h b/ios/RNBroadcast/LFLiveKit/filter/LFGPUImageEmptyFilter.h
similarity index 100%
rename from ios/LFLiveKit/filter/LFGPUImageEmptyFilter.h
rename to ios/RNBroadcast/LFLiveKit/filter/LFGPUImageEmptyFilter.h
diff --git a/ios/LFLiveKit/filter/LFGPUImageEmptyFilter.m b/ios/RNBroadcast/LFLiveKit/filter/LFGPUImageEmptyFilter.m
similarity index 100%
rename from ios/LFLiveKit/filter/LFGPUImageEmptyFilter.m
rename to ios/RNBroadcast/LFLiveKit/filter/LFGPUImageEmptyFilter.m
diff --git a/ios/LFLiveKit/objects/LFAudioFrame.h b/ios/RNBroadcast/LFLiveKit/objects/LFAudioFrame.h
similarity index 100%
rename from ios/LFLiveKit/objects/LFAudioFrame.h
rename to ios/RNBroadcast/LFLiveKit/objects/LFAudioFrame.h
diff --git a/ios/LFLiveKit/objects/LFAudioFrame.m b/ios/RNBroadcast/LFLiveKit/objects/LFAudioFrame.m
similarity index 100%
rename from ios/LFLiveKit/objects/LFAudioFrame.m
rename to ios/RNBroadcast/LFLiveKit/objects/LFAudioFrame.m
diff --git a/ios/LFLiveKit/objects/LFFrame.h b/ios/RNBroadcast/LFLiveKit/objects/LFFrame.h
similarity index 100%
rename from ios/LFLiveKit/objects/LFFrame.h
rename to ios/RNBroadcast/LFLiveKit/objects/LFFrame.h
diff --git a/ios/LFLiveKit/objects/LFFrame.m b/ios/RNBroadcast/LFLiveKit/objects/LFFrame.m
similarity index 100%
rename from ios/LFLiveKit/objects/LFFrame.m
rename to ios/RNBroadcast/LFLiveKit/objects/LFFrame.m
diff --git a/ios/LFLiveKit/objects/LFLiveDebug.h b/ios/RNBroadcast/LFLiveKit/objects/LFLiveDebug.h
similarity index 100%
rename from ios/LFLiveKit/objects/LFLiveDebug.h
rename to ios/RNBroadcast/LFLiveKit/objects/LFLiveDebug.h
diff --git a/ios/LFLiveKit/objects/LFLiveDebug.m b/ios/RNBroadcast/LFLiveKit/objects/LFLiveDebug.m
similarity index 100%
rename from ios/LFLiveKit/objects/LFLiveDebug.m
rename to ios/RNBroadcast/LFLiveKit/objects/LFLiveDebug.m
diff --git a/ios/LFLiveKit/objects/LFLiveStreamInfo.h b/ios/RNBroadcast/LFLiveKit/objects/LFLiveStreamInfo.h
similarity index 100%
rename from ios/LFLiveKit/objects/LFLiveStreamInfo.h
rename to ios/RNBroadcast/LFLiveKit/objects/LFLiveStreamInfo.h
diff --git a/ios/LFLiveKit/objects/LFLiveStreamInfo.m b/ios/RNBroadcast/LFLiveKit/objects/LFLiveStreamInfo.m
similarity index 100%
rename from ios/LFLiveKit/objects/LFLiveStreamInfo.m
rename to ios/RNBroadcast/LFLiveKit/objects/LFLiveStreamInfo.m
diff --git a/ios/LFLiveKit/objects/LFVideoFrame.h b/ios/RNBroadcast/LFLiveKit/objects/LFVideoFrame.h
similarity index 100%
rename from ios/LFLiveKit/objects/LFVideoFrame.h
rename to ios/RNBroadcast/LFLiveKit/objects/LFVideoFrame.h
diff --git a/ios/LFLiveKit/objects/LFVideoFrame.m b/ios/RNBroadcast/LFLiveKit/objects/LFVideoFrame.m
similarity index 100%
rename from ios/LFLiveKit/objects/LFVideoFrame.m
rename to ios/RNBroadcast/LFLiveKit/objects/LFVideoFrame.m
diff --git a/ios/LFLiveKit/publish/LFStreamRTMPSocket.h b/ios/RNBroadcast/LFLiveKit/publish/LFStreamRTMPSocket.h
similarity index 100%
rename from ios/LFLiveKit/publish/LFStreamRTMPSocket.h
rename to ios/RNBroadcast/LFLiveKit/publish/LFStreamRTMPSocket.h
diff --git a/ios/LFLiveKit/publish/LFStreamRtmpSocket.m b/ios/RNBroadcast/LFLiveKit/publish/LFStreamRtmpSocket.m
similarity index 100%
rename from ios/LFLiveKit/publish/LFStreamRtmpSocket.m
rename to ios/RNBroadcast/LFLiveKit/publish/LFStreamRtmpSocket.m
diff --git a/ios/LFLiveKit/publish/LFStreamSocket.h b/ios/RNBroadcast/LFLiveKit/publish/LFStreamSocket.h
similarity index 100%
rename from ios/LFLiveKit/publish/LFStreamSocket.h
rename to ios/RNBroadcast/LFLiveKit/publish/LFStreamSocket.h
diff --git a/ios/LFLiveKit/publish/LFStreamingBuffer.h b/ios/RNBroadcast/LFLiveKit/publish/LFStreamingBuffer.h
similarity index 100%
rename from ios/LFLiveKit/publish/LFStreamingBuffer.h
rename to ios/RNBroadcast/LFLiveKit/publish/LFStreamingBuffer.h
diff --git a/ios/LFLiveKit/publish/LFStreamingBuffer.m b/ios/RNBroadcast/LFLiveKit/publish/LFStreamingBuffer.m
similarity index 100%
rename from ios/LFLiveKit/publish/LFStreamingBuffer.m
rename to ios/RNBroadcast/LFLiveKit/publish/LFStreamingBuffer.m
diff --git a/ios/LFLiveKit/publish/NSMutableArray+LFAdd.h b/ios/RNBroadcast/LFLiveKit/publish/NSMutableArray+LFAdd.h
similarity index 100%
rename from ios/LFLiveKit/publish/NSMutableArray+LFAdd.h
rename to ios/RNBroadcast/LFLiveKit/publish/NSMutableArray+LFAdd.h
diff --git a/ios/LFLiveKit/publish/NSMutableArray+LFAdd.m b/ios/RNBroadcast/LFLiveKit/publish/NSMutableArray+LFAdd.m
similarity index 100%
rename from ios/LFLiveKit/publish/NSMutableArray+LFAdd.m
rename to ios/RNBroadcast/LFLiveKit/publish/NSMutableArray+LFAdd.m
diff --git a/ios/LFLivePreview.h b/ios/RNBroadcast/LFLivePreview.h
similarity index 100%
rename from ios/LFLivePreview.h
rename to ios/RNBroadcast/LFLivePreview.h
diff --git a/ios/LFLivePreview.m b/ios/RNBroadcast/LFLivePreview.m
similarity index 96%
rename from ios/LFLivePreview.m
rename to ios/RNBroadcast/LFLivePreview.m
index 0d3e4a9..01a23b2 100755
--- a/ios/LFLivePreview.m
+++ b/ios/RNBroadcast/LFLivePreview.m
@@ -40,8 +40,8 @@ - (LFLiveSession*)session {
         videoConfiguration.videoBitRate = 800*1024;
         videoConfiguration.videoMaxBitRate = 1000*1024;
         videoConfiguration.videoMinBitRate = 500*1024;
-        videoConfiguration.videoFrameRate = 24;
-        videoConfiguration.videoMaxKeyframeInterval = 48;
+        videoConfiguration.videoFrameRate = 30;
+        videoConfiguration.videoMaxKeyframeInterval = 30;
         videoConfiguration.outputImageOrientation = UIInterfaceOrientationLandscapeRight;
         videoConfiguration.autorotate = YES;
         videoConfiguration.sessionPreset = LFCaptureSessionPreset720x1280;
diff --git a/ios/RNBroadcast.m b/ios/RNBroadcast/RNBroadcast.m
similarity index 94%
rename from ios/RNBroadcast.m
rename to ios/RNBroadcast/RNBroadcast.m
index 5694f9e..8d26dac 100644
--- a/ios/RNBroadcast.m
+++ b/ios/RNBroadcast/RNBroadcast.m
@@ -14,7 +14,7 @@ @interface RNBroadcastViewManager : RCTViewManager
 @end
 
 @implementation RNBroadcastViewManager
-RCT_EXPORT_MODULE()
+RCT_EXPORT_MODULE(RNBroadcastView)
 
 - (UIView *)view
 {
diff --git a/ios/category/UIControl+YYAdd.h b/ios/RNBroadcast/category/UIControl+YYAdd.h
similarity index 100%
rename from ios/category/UIControl+YYAdd.h
rename to ios/RNBroadcast/category/UIControl+YYAdd.h
diff --git a/ios/category/UIControl+YYAdd.m b/ios/RNBroadcast/category/UIControl+YYAdd.m
similarity index 100%
rename from ios/category/UIControl+YYAdd.m
rename to ios/RNBroadcast/category/UIControl+YYAdd.m
diff --git a/ios/category/UIView+YYAdd.h b/ios/RNBroadcast/category/UIView+YYAdd.h
similarity index 100%
rename from ios/category/UIView+YYAdd.h
rename to ios/RNBroadcast/category/UIView+YYAdd.h
diff --git a/ios/category/UIView+YYAdd.m b/ios/RNBroadcast/category/UIView+YYAdd.m
similarity index 100%
rename from ios/category/UIView+YYAdd.m
rename to ios/RNBroadcast/category/UIView+YYAdd.m
diff --git a/ios/images/camra_beauty@2x.png b/ios/RNBroadcast/images/camra_beauty@2x.png
similarity index 100%
rename from ios/images/camra_beauty@2x.png
rename to ios/RNBroadcast/images/camra_beauty@2x.png
diff --git a/ios/images/camra_beauty@3x.png b/ios/RNBroadcast/images/camra_beauty@3x.png
similarity index 100%
rename from ios/images/camra_beauty@3x.png
rename to ios/RNBroadcast/images/camra_beauty@3x.png
diff --git a/ios/images/camra_beauty_close@2x.png b/ios/RNBroadcast/images/camra_beauty_close@2x.png
similarity index 100%
rename from ios/images/camra_beauty_close@2x.png
rename to ios/RNBroadcast/images/camra_beauty_close@2x.png
diff --git a/ios/images/camra_beauty_close@3x.png b/ios/RNBroadcast/images/camra_beauty_close@3x.png
similarity index 100%
rename from ios/images/camra_beauty_close@3x.png
rename to ios/RNBroadcast/images/camra_beauty_close@3x.png
diff --git a/ios/images/camra_preview@2x.png b/ios/RNBroadcast/images/camra_preview@2x.png
similarity index 100%
rename from ios/images/camra_preview@2x.png
rename to ios/RNBroadcast/images/camra_preview@2x.png
diff --git a/ios/images/camra_preview@3x.png b/ios/RNBroadcast/images/camra_preview@3x.png
similarity index 100%
rename from ios/images/camra_preview@3x.png
rename to ios/RNBroadcast/images/camra_preview@3x.png
diff --git a/ios/images/close_preview@2x.png b/ios/RNBroadcast/images/close_preview@2x.png
similarity index 100%
rename from ios/images/close_preview@2x.png
rename to ios/RNBroadcast/images/close_preview@2x.png
diff --git a/ios/images/close_preview@3x.png b/ios/RNBroadcast/images/close_preview@3x.png
similarity index 100%
rename from ios/images/close_preview@3x.png
rename to ios/RNBroadcast/images/close_preview@3x.png
diff --git a/ios/images/ios-29x29.png b/ios/RNBroadcast/images/ios-29x29.png
similarity index 100%
rename from ios/images/ios-29x29.png
rename to ios/RNBroadcast/images/ios-29x29.png
diff --git a/package.json b/package.json
index 755db22..6603957 100644
--- a/package.json
+++ b/package.json
@@ -1,8 +1,8 @@
 
 {
   "name": "react-native-broadcast",
-  "version": "1.0.0",
-  "description": "",
+  "version": "1.0.11",
+  "description": "RTMP Broadcaster library for React Native",
   "main": "index.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -10,6 +10,7 @@
   "keywords": [
     "react-native"
   ],
-  "author": "",
+  "author": "BehaviorCloud LLC",
+  "homepage": "https://github.com/BehaviorCloud/react-native-broadcast#readme",
   "license": ""
 }